diff -Nru spades-3.10.1+dfsg/changelog.html spades-3.11.1+dfsg/changelog.html
--- spades-3.10.1+dfsg/changelog.html	2017-02-28 14:55:56.000000000 +0000
+++ spades-3.11.1+dfsg/changelog.html	2017-09-28 09:05:59.000000000 +0000
@@ -3,6 +3,38 @@
 
 SPAdes Genome Assembler changelog
 
+SPAdes 3.11.1, 1 October 2017
+
+FIX: Handling spaces in path during mismatch correction.
+
+FIX: Python3 support in rnaSPAdes.
+
+FIX: K value estimation for long reads.
+
+FIX: Processing long reads alignments.
+
+SPAdes 3.11.0, 1 September 2017
+
+NEW: Support for strand-specific RNA-Seq data in rnaSPAdes.
+
+NEW: Coverage based isoform detection in rnaSPAdes.
+
+NEW: Reworked IonHammer read error correction module.
+
+CHANGE: Improved tandem repeat resolution accuracy.
+
+CHANGE: Better performance of exSPAnder module.
+
+CHANGE: metaSPAdes pipeline improvements.
+
+CHANGE: Better running time and RAM consumption for the entire pipeline.
+
+FIX: Incomplete paths in GFA output.
+
+FIX: Mismatch and indel rate in careful mode for isolate datasets (esp. low covered ones).
+
+FIX: Occasional hanging of edge disconnection procedure in metaSPAdes.
+
 SPAdes 3.10.1, 1 March 2017
 
 FIX: Build for MacOS.
 
@@ -29,7 +61,6 @@
 
 CHANGE: Improved isoform detection algorithm in rnaSPAdes.
 
-
 SPAdes 3.9.1, 4 December 2016
 
 FIX: macOS Sierra crash.
diff -Nru spades-3.10.1+dfsg/configs/corrector/corrector.info spades-3.11.1+dfsg/configs/corrector/corrector.info
--- spades-3.10.1+dfsg/configs/corrector/corrector.info	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/configs/corrector/corrector.info	2017-09-28 09:05:14.000000000 +0000
@@ -3,5 +3,6 @@
 work_dir: ./test_dataset/input/corrected/tmp,
 output_dir: ./test_dataset/input/corrected,
 max_nthreads: 16,
-strategy: mapped_squared
+strategy: mapped_squared,
+log_filename: log.properties
 }
diff -Nru spades-3.10.1+dfsg/configs/corrector/log.properties spades-3.11.1+dfsg/configs/corrector/log.properties
--- spades-3.10.1+dfsg/configs/corrector/log.properties	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/configs/corrector/log.properties	2017-09-28 09:05:14.000000000 +0000
@@ -0,0 +1,2 @@
+default=INFO
+ContigProcessor=INFO
diff -Nru spades-3.10.1+dfsg/configs/debruijn/careful_mda_mode.info spades-3.11.1+dfsg/configs/debruijn/careful_mda_mode.info
--- spades-3.10.1+dfsg/configs/debruijn/careful_mda_mode.info	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/configs/debruijn/careful_mda_mode.info	2017-09-28 09:05:14.000000000 +0000
@@ -0,0 +1,39 @@
+simp
+{
+    ; bulge remover:
+    br
+    {
+        max_relative_coverage 1.1 ; bulge_cov < this * not_bulge_cov
+    }
+
+    ; complex bulge remover
+    cbr
+    {
+        enabled false
+    }
+
+    final_tc
+    {
+        condition ""
+    }
+
+    ; bulge remover:
+    final_br
+    {
+        enabled false
+    }
+
+    init_clean
+    {
+        early_it_only true
+
+        activation_cov -1.
+        ier
+        {
+            enabled false
+        }
+
+        tip_condition ""
+        ec_condition ""
+    }
+}
diff -Nru spades-3.10.1+dfsg/configs/debruijn/careful_mode.info spades-3.11.1+dfsg/configs/debruijn/careful_mode.info
--- spades-3.10.1+dfsg/configs/debruijn/careful_mode.info	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/configs/debruijn/careful_mode.info	2017-09-28 09:05:14.000000000 +0000
@@ -1,12 +1,10 @@
-
 simp
 {
     ; bulge remover:
     br
     {
-        max_coverage 1000000.0
-        max_relative_coverage 1.5 ; bulge_cov < this * not_bulge_cov
-        parallel false
+        max_relative_coverage 0.5 ; bulge_cov < this * not_bulge_cov
+        ; parallel false
     }
 
     ; complex bulge remover
@@ -14,6 +12,12 @@
     {
         enabled false
     }
+
+    ; bulge remover:
+    final_br
+    {
+        enabled false
+    }
 
     ; relative coverage erroneous component remover:
     rcc
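careful_mode.info tightens the bulge remover here: max_relative_coverage drops from 1.5 to 0.5, so a bulge is now collapsed only if it carries at most half the coverage of the alternative path. For illustration, the predicate encoded by the `; bulge_cov < this * not_bulge_cov` comment as standalone C++ (function and parameter names are invented for this sketch, not SPAdes identifiers):

```cpp
#include <iostream>

// bulge_cov < max_relative_coverage * not_bulge_cov
bool bulge_is_removable(double bulge_cov, double not_bulge_cov,
                        double max_relative_coverage) {
    return bulge_cov < max_relative_coverage * not_bulge_cov;
}

int main() {
    // With careful mode's new 0.5, a 30x bulge next to a 50x path survives...
    std::cout << bulge_is_removable(30, 50, 0.5) << '\n'; // 0
    // ...whereas the old 1.5 factor would have allowed removing it.
    std::cout << bulge_is_removable(30, 50, 1.5) << '\n'; // 1
}
```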
diff -Nru spades-3.10.1+dfsg/configs/debruijn/config.info spades-3.11.1+dfsg/configs/debruijn/config.info
--- spades-3.10.1+dfsg/configs/debruijn/config.info	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/configs/debruijn/config.info	2017-09-28 09:05:14.000000000 +0000
@@ -14,10 +14,10 @@
 ;FIXME remove!
 run_mode false
 project_name TOY_DATASET
-dataset ./configs/debruijn/datasets_archive/toy.info
+dataset ./configs/debruijn/toy.info
 log_filename log.properties
 
-output_base ./data/debruijn/
+output_base ./spades_output
 tmp_dir spades_tmp/
 
 main_iteration true
@@ -30,11 +30,11 @@
 temp_bin_reads_dir .bin_reads/
 max_threads 8
 max_memory 120; in Gigabytes
-buffer_size 512; in Megabytes
+buffer_size 512; in Megabytes
 
 entry_point construction
 ;entry_point simplification
-;entry_point pacbio_aligning
+;entry_point hybrid_aligning
 ;entry_point late_pair_info_count
 ;entry_point distance_estimation
 ;entry_point repeat_resolving
@@ -69,9 +69,6 @@
 
 ; End of developer_mode parameters
 
-; use unipaths as additional contigs instead of just graph edges
-use_unipaths false
-
 ;if true simple mismatches are corrected
 correct_mismatches true
 
@@ -95,6 +92,12 @@
 
 avoid_rc_connections true
 
+calculate_coverage_for_each_lib false
+strand_specificity {
+    ss_enabled false
+    antisense false
+}
+
 contig_output {
     contigs_name final_contigs
     scaffolds_name scaffolds
@@ -133,6 +136,13 @@
     coverage_threshold 10.0
 }
 
+; low covered edges remover
+lcer
+{
+    lcer_enabled false
+    lcer_coverage_threshold 0.0
+}
+
 pacbio_processor
 {
 ;align and traverse.
@@ -172,3 +182,4 @@
 ;flanking coverage range
 flanking_range 55
 series_analysis ""
+save_gp false
diff -Nru spades-3.10.1+dfsg/configs/debruijn/log.properties spades-3.11.1+dfsg/configs/debruijn/log.properties
--- spades-3.10.1+dfsg/configs/debruijn/log.properties	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/configs/debruijn/log.properties	2017-09-28 09:05:14.000000000 +0000
@@ -50,6 +50,8 @@
 #LoopDetectingPathExtender=DEBUG
 #SimpleExtender=DEBUG
 #ScaffoldingPathExtender=DEBUG
+#CoordCoverageExtensionChooser=DEBUG
+#SimpleCoverageExtensionChooser=DEBUG
 
 #BWAPairInfo=TRACE
 #LongReadMapper=TRACE
diff -Nru spades-3.10.1+dfsg/configs/debruijn/mda_mode.info spades-3.11.1+dfsg/configs/debruijn/mda_mode.info
--- spades-3.10.1+dfsg/configs/debruijn/mda_mode.info	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/configs/debruijn/mda_mode.info	2017-09-28 09:05:14.000000000 +0000
@@ -39,7 +39,7 @@
     rcc
     {
         enabled true
-        coverage_gap 20.
+        coverage_gap 10.
         max_length_coeff 2.0
         max_length_with_tips_coeff 3.0
         max_vertex_cnt 30
diff -Nru spades-3.10.1+dfsg/configs/debruijn/meta_mode.info spades-3.11.1+dfsg/configs/debruijn/meta_mode.info
--- spades-3.10.1+dfsg/configs/debruijn/meta_mode.info	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/configs/debruijn/meta_mode.info	2017-09-28 09:05:14.000000000 +0000
@@ -32,7 +32,7 @@
     {
         ; rctc: tip_cov < rctc * not_tip_cov
         ; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
-        condition "{ tc_lb 3.5, , cb 1000000, rctc 2.0 } { tc_lb 6., cb 2.5, rctc 1.0 }"
+        condition "{ tc_lb 3.5, cb 1000000, rctc 2.0 } { tc_lb 6., cb 2.5, rctc 1.0 }"
     }
 
     ; relative coverage erroneous component remover:
@@ -57,16 +57,17 @@
     relative_ed
     {
         enabled true
-        diff_mult 10.
+        diff_mult 5.
     }
 
     ; bulge remover:
     br
     {
         max_coverage 1000000.0
-        max_relative_coverage 100000. ; bulge_cov < this * not_bulge_cov
+        max_relative_coverage 5. ; bulge_cov < this * not_bulge_cov
         max_delta 10
         max_relative_delta 0.1
+        dijkstra_vertex_limit 3000
         parallel true
     }
 
@@ -90,14 +91,6 @@
         max_relative_delta 0.1
     }
 
-    ; second final bulge remover:
-    ; only in meta mode, inherits settings of final_br
-    second_final_br
-    {
-        max_delta 1500
-        max_number_edges 3
-    }
-
     ; hidden ec remover
     her
     {
@@ -169,8 +162,10 @@
 }
 params {
-    remove_overlaps true
-    cut_all_overlaps true
+    overlap_removal {
+        enabled true
+        cut_all true
+    }
 
     scaffolding_mode old_pe_2015
 
@@ -187,6 +182,12 @@
     }
 
     use_coordinated_coverage true
+
+    coordinated_coverage
+    {
+        min_path_len 10000
+    }
+
 }
 }
 
@@ -195,6 +196,10 @@
 params {
     scaffolding_mode old
 
+    overlap_removal {
+        enabled false
+    }
+
     use_coordinated_coverage false
     remove_overlaps false
     scaffolding2015 {
diff -Nru spades-3.10.1+dfsg/configs/debruijn/moleculo_mode.info spades-3.11.1+dfsg/configs/debruijn/moleculo_mode.info
--- spades-3.10.1+dfsg/configs/debruijn/moleculo_mode.info	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/configs/debruijn/moleculo_mode.info	2017-09-28 09:05:14.000000000 +0000
@@ -101,7 +101,11 @@
 pe {
 params {
     normalize_weight true
-    cut_all_overlaps true
+
+    overlap_removal {
+        enabled true
+        cut_all true
+    }
 
     scaffolding_mode old
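The tip clipper comments in meta_mode.info spell out both tests: rctc is a relative coverage bound (tip_cov < rctc * not_tip_cov), and tc_lb is a length bound. For illustration, the tc_lb formula quoted in the comment, computed directly (a sketch; names are mine):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

// tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length)
size_t max_tip_length(size_t k, size_t read_length, double tc_lb) {
    double scaled = (double)std::min(k, read_length / 2) * tc_lb;
    return (size_t)std::max(scaled, (double)read_length);
}

int main() {
    // First condition block (tc_lb 3.5) with e.g. k = 55 and 100 bp reads:
    printf("%zu\n", max_tip_length(55, 100, 3.5)); // min(55, 50) * 3.5 = 175
    // Second block (tc_lb 6.) clips longer tips at lower coverage bounds:
    printf("%zu\n", max_tip_length(55, 100, 6.0)); // 300
}
```

The fix in this hunk only removes a stray comma inside the first condition block; the parameters themselves are unchanged.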
diff -Nru spades-3.10.1+dfsg/configs/debruijn/pe_params.info spades-3.11.1+dfsg/configs/debruijn/pe_params.info
--- spades-3.10.1+dfsg/configs/debruijn/pe_params.info	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/configs/debruijn/pe_params.info	2017-09-28 09:05:14.000000000 +0000
@@ -18,9 +18,12 @@
     multi_path_extend false
     ; old | 2015 | combined | old_pe_2015
     scaffolding_mode old_pe_2015
-
-    remove_overlaps true
-    cut_all_overlaps false
+
+    overlap_removal {
+        enabled true
+        end_start_only false
+        cut_all false
+    }
 
     split_edge_length 99
     normalize_weight true
@@ -61,25 +64,20 @@
     ;next param should be 0.51 - 1.0 if use_old_score = true and 3.0 otherwise
     min_gap_score 0.7
 
-    max_must_overlap -2
-    max_can_overlap 0.5
+    max_can_overlap 1.
     short_overlap 6
     artificial_gap 10
-    use_old_score true
 
     min_overlap_length 10
-    flank_addition_coefficient -5.9
-    flank_multiplication_coefficient 0.97
+    flank_multiplication_coefficient .5
+    flank_addition_coefficient 5
 
     var_coeff 3.0
     basic_overlap_coeff 2.0
 }
 
-path_cleaning
-{
-    enabled false
-}
-
+path_cleaning_presets ""
+
 use_coordinated_coverage false
 coordinated_coverage
 {
@@ -88,6 +86,14 @@
     min_path_len 1000
 }
 
+
+simple_coverage_resolver {
+    enabled false
+    coverage_delta 0.5
+    min_upper_coverage 5
+}
+
+
 scaffolding2015 {
     ; (median * (1+variation) > unique > median * (1 - variation))
     relative_weight_cutoff 2.0
@@ -112,6 +118,9 @@
 genome_consistency_checker {
     max_gap 1000
     relative_max_gap 0.2
+    use_main_storage true ; if set to true, next two parameters are set to min_unique_length
+    unresolvable_jump 1000 ; length of unresolvable repeats
+    unique_length 500 ; spelling genome in the alphabet of edges longer than this
 }
 
 uniqueness_analyser {
diff -Nru spades-3.10.1+dfsg/configs/debruijn/rna_fast_mode.info spades-3.11.1+dfsg/configs/debruijn/rna_fast_mode.info
--- spades-3.10.1+dfsg/configs/debruijn/rna_fast_mode.info	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/configs/debruijn/rna_fast_mode.info	2017-09-28 09:05:14.000000000 +0000
@@ -0,0 +1,12 @@
+simp
+{
+    ier
+    {
+        enabled true
+        use_rl_for_max_length true ; max_length will be taken max with read_length
+        use_rl_for_max_length_any_cov false ; use_rl_for_max_length_any_cov will be taken max with read_length
+        max_length 110
+        max_coverage 2
+        max_length_any_cov 0
+    }
+}
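The new use_rl_for_* flags on the isolated edge remover (ier) make the read-length coupling explicit: when the flag is set, the configured cutoff is "taken max with read_length", per the comments above. A standalone sketch of that rule (descriptive names, not SPAdes identifiers):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

// When use_rl is set, the configured length cutoff is raised to at least
// the read length; otherwise the configured value is used as-is.
size_t effective_cutoff(size_t configured, size_t read_length, bool use_rl) {
    return use_rl ? std::max(configured, read_length) : configured;
}

int main() {
    // rna_fast_mode: max_length 110 with use_rl_for_max_length true,
    // so 100 bp reads keep the 110 bp cutoff, 150 bp reads raise it to 150.
    printf("%zu\n", effective_cutoff(110, 100, true));  // 110
    printf("%zu\n", effective_cutoff(110, 150, true));  // 150
    printf("%zu\n", effective_cutoff(110, 150, false)); // 110
}
```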
diff -Nru spades-3.10.1+dfsg/configs/debruijn/rna_mode.info spades-3.11.1+dfsg/configs/debruijn/rna_mode.info
--- spades-3.10.1+dfsg/configs/debruijn/rna_mode.info	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/configs/debruijn/rna_mode.info	2017-09-28 09:05:14.000000000 +0000
@@ -2,6 +2,13 @@
 
 preserve_raw_paired_index true
 
+calculate_coverage_for_each_lib true
+strand_specificity {
+    ss_enabled false
+    antisense false
+}
+
+
 contig_output {
     scaffolds_name transcripts
     ; none --- do not output broken scaffolds | break_gaps --- break only by N steches | break_all --- break all with overlap < k
@@ -62,14 +69,13 @@
     max_ec_length_coefficient 30
     max_coverage_coeff 5.0
 }
+
 ;all topology based erroneous connection removers are off
 ier
 {
     enabled false
-    max_length 100
-    max_coverage 2
-    max_length_any_cov 0 ; will be taken max with read_length
 }
+
 ; hidden ec remover
 her
 {
@@ -100,12 +106,19 @@
 }
 
 pe {
+debug_output true
+
 params {
     multi_path_extend true
-    remove_overlaps false
 
     scaffolding_mode old
 
+    overlap_removal {
+        enabled true
+        end_start_only true
+        cut_all true
+    }
+
     extension_options
     {
         use_default_single_threshold true
@@ -115,20 +128,65 @@
     scaffolder {
         cutoff 1
         hard_cutoff 10
-
-        cluster_info false
-
-        min_overlap_for_rna_scaffolding 10
+        cluster_info false
+        min_overlap_for_rna_scaffolding 10
     }
 
+    path_cleaning_presets "default soft hard"
+    ; All length cutoffs presented in nucleotides
+    ; So edges less than or equal to (relative cutoff * RL - K) or (absolute cutoff - K) will be deleted
     path_cleaning
     {
        enabled true
-       min_length 30
-       isolated_min_length 50
-       min_length_for_low_covered 150
+       min_length 110
+       isolated_min_length 130
+       isolated_min_cov 4
+       min_length_for_low_covered 140
+       rel_cutoff 1.3
+       rel_isolated_cutoff 1.5
+       rel_low_covered_cutoff 1.6
       min_coverage 2
    }
 
+    ; All length cutoffs presented in nucleotides
+    hard_path_cleaning
+    {
+        enabled true
+        min_length 130
+        isolated_min_length 180
+        isolated_min_cov 8
+        min_length_for_low_covered 180
+        rel_cutoff 1.5
+        rel_isolated_cutoff 2.0
+        rel_low_covered_cutoff 2.0
+        min_coverage 3
+    }
+
+    ; All length cutoffs presented in nucleotides
+    soft_path_cleaning
+    {
+        enabled true
+        min_length 85
+        isolated_min_length 100
+        isolated_min_cov 2
+        min_length_for_low_covered 130
+        rel_cutoff 1.05
+        rel_isolated_cutoff 1.2
+        rel_low_covered_cutoff 1.5
+        min_coverage 1
+    }
+
+    use_coordinated_coverage false
+    coordinated_coverage {
+        max_edge_length_repeat 1000
+        delta 0.5
+        min_path_len 300
+    }
+
+    simple_coverage_resolver {
+        enabled true
+        coverage_delta 0.5
+        min_upper_coverage 2
+    }
 }
 }
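The new comment in rna_mode.info states the path-cleaning rule: edges no longer than (relative cutoff * RL - K) or (absolute cutoff - K) are deleted. Reading the "or" as whichever bound is larger, a sketch of the threshold computation (my interpretation and naming, not SPAdes code):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

// Edge-length deletion threshold: cutoffs are given in nucleotides, so K is
// subtracted to express them in graph-edge lengths.
size_t length_threshold(double rel_cutoff, size_t absolute_cutoff,
                        size_t read_length, size_t k) {
    double rel = rel_cutoff * (double)read_length - (double)k;
    double abs_nt = (double)absolute_cutoff - (double)k;
    return (size_t)std::max({rel, abs_nt, 0.0});
}

int main() {
    // Default preset, 100 bp reads, k = 55:
    // max(1.3 * 100 - 55, 110 - 55) = max(75, 55) = 75
    printf("%zu\n", length_threshold(1.3, 110, 100, 55)); // 75
}
```

The three presets (soft, default, hard) then differ only in how aggressive these cutoffs are.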
diff -Nru spades-3.10.1+dfsg/configs/debruijn/simplification.info spades-3.11.1+dfsg/configs/debruijn/simplification.info
--- spades-3.10.1+dfsg/configs/debruijn/simplification.info	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/configs/debruijn/simplification.info	2017-09-28 09:05:14.000000000 +0000
@@ -20,23 +20,24 @@
     condition "{ tc_lb 3.5, cb 1000000, rctc 2.0 } { tc_lb 10., cb auto }"
     }
 
-    ; bulge remover:
-    br
-    {
-        enabled true
+    ; bulge remover:
+    br
+    {
+        enabled true
         main_iteration_only false
-        max_bulge_length_coefficient 3. ; max_bulge_length = max_bulge_length_coefficient * k
-        max_additive_length_coefficient 100
-        max_coverage 1000.0
-        max_relative_coverage 1.1 ; bulge_cov < this * not_bulge_cov
-        max_delta 3
-        max_relative_delta 0.1
+        max_bulge_length_coefficient 3. ; max_bulge_length = max_bulge_length_coefficient * k
+        max_additive_length_coefficient 100
+        max_coverage 1000.0
+        max_relative_coverage 1.1 ; bulge_cov < this * not_bulge_cov
+        max_delta 3
+        max_relative_delta 0.1
         max_number_edges 1000
+        dijkstra_vertex_limit 3000
         parallel true
         buff_size 10000
         buff_cov_diff 2.
         buff_cov_rel_diff 0.2
-    }
+    }
 
     ; erroneous connections remover:
     ec
@@ -94,6 +95,7 @@
         max_delta 3
         max_relative_delta 0.1
         max_number_edges 1000
+        dijkstra_vertex_limit 3000
         parallel true
         buff_size 10000
         buff_cov_diff 2.
@@ -143,9 +145,11 @@
     ier
     {
         enabled true
-        max_length 0
+        use_rl_for_max_length false ; max_length will be taken max with read_length
+        use_rl_for_max_length_any_cov true ; use_rl_for_max_length_any_cov will be taken max with read_length
+        max_length 0 ; will be taken max with read_length if option above is set
         max_coverage 2
-        max_length_any_cov 150 ; will be taken max with read_length
+        max_length_any_cov 150 ; will be taken max with read_length if option above is set
     }
 
     ; topology tip clipper:
@@ -193,9 +197,11 @@
     ier
     {
         enabled true
-        max_length 0
+        use_rl_for_max_length false ; max_length will be taken max with read_length
+        use_rl_for_max_length_any_cov true ; use_rl_for_max_length_any_cov will be taken max with read_length
+        max_length 0 ; will be taken max with read_length if option above is set
        max_coverage 0
-        max_length_any_cov 0 ; will be taken max with read_length
+        max_length_any_cov 0 ; will be taken max with read_length if option above is set
     }
 
     tip_condition "{ tc_lb 3.5, cb auto }"
diff -Nru spades-3.10.1+dfsg/configs/debruijn/toy.info spades-3.11.1+dfsg/configs/debruijn/toy.info
--- spades-3.10.1+dfsg/configs/debruijn/toy.info	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/configs/debruijn/toy.info	2017-09-28 09:05:14.000000000 +0000
@@ -0,0 +1,4 @@
+reads toy.yaml
+single_cell false
+; RL 100
+
diff -Nru spades-3.10.1+dfsg/configs/debruijn/toy.yaml spades-3.11.1+dfsg/configs/debruijn/toy.yaml
--- spades-3.10.1+dfsg/configs/debruijn/toy.yaml	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/configs/debruijn/toy.yaml	2017-09-28 09:05:14.000000000 +0000
@@ -0,0 +1,4 @@
+- left reads: [../../test_dataset/ecoli_1K_1.fq.gz]
+  orientation: fr
+  right reads: [../../test_dataset/ecoli_1K_2.fq.gz]
+  type: paired-end
diff -Nru spades-3.10.1+dfsg/debian/changelog spades-3.11.1+dfsg/debian/changelog
--- spades-3.10.1+dfsg/debian/changelog	2017-08-10 14:39:50.000000000 +0000
+++ spades-3.11.1+dfsg/debian/changelog	2017-12-18 00:29:26.000000000 +0000
@@ -1,3 +1,18 @@
+spades (3.11.1+dfsg-1) unstable; urgency=medium
+
+  [ Sascha Steinbiss ]
+  * New upstream release.
+  * Bump Standards-Version.
+  * Remove obsolete Testsuite: entry in d/control
+  * Add d/copyright entry for new included BBHash headers.
+
+  [ Steffen Moeller ]
+  * debian/upstream/metadata:
+    - added references to registries
+    - yamllint cleanliness
+
+ -- Sascha Steinbiss <sascha@steinbiss.name>  Mon, 18 Dec 2017 00:29:26 +0000
+
 spades (3.10.1+dfsg-3) unstable; urgency=medium
 
   * Fix FTBFS with GCC 7.
diff -Nru spades-3.10.1+dfsg/debian/control spades-3.11.1+dfsg/debian/control
--- spades-3.10.1+dfsg/debian/control	2017-08-10 14:39:50.000000000 +0000
+++ spades-3.11.1+dfsg/debian/control	2017-12-18 00:28:02.000000000 +0000
@@ -26,8 +26,7 @@
 bwa,
 libbwa-dev,
 lynx
-Standards-Version: 4.0.0
-Testsuite: autopkgtest
+Standards-Version: 4.1.2
 Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/spades.git
 Vcs-Git: https://anonscm.debian.org/git/debian-med/spades.git
 Homepage: http://cab.spbu.ru/software/spades/
diff -Nru spades-3.10.1+dfsg/debian/copyright spades-3.11.1+dfsg/debian/copyright
--- spades-3.10.1+dfsg/debian/copyright	2017-08-08 14:54:45.000000000 +0000
+++ spades-3.11.1+dfsg/debian/copyright	2017-12-18 00:29:26.000000000 +0000
@@ -267,6 +267,12 @@
 Copyright: (c) 2014, 2015, 2016 Jarryd Beck
 License: MIT
 
+Files: ext/include/boomphf/BooPHF.h
+Copyright: (c) 2015 Guillaume Rizk
+License: MIT
+Comment:
+ MIT licensed as part of BBHash (https://github.com/rizkg/BBHash)
+
 License: MIT
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
diff -Nru spades-3.10.1+dfsg/debian/patches/0003_accept-system-bwa.patch spades-3.11.1+dfsg/debian/patches/0003_accept-system-bwa.patch
--- spades-3.10.1+dfsg/debian/patches/0003_accept-system-bwa.patch	2017-08-08 14:54:45.000000000 +0000
+++ spades-3.11.1+dfsg/debian/patches/0003_accept-system-bwa.patch	2017-12-18 00:25:36.000000000 +0000
@@ -13,9 +13,9 @@
 In case you consider taking over the patch I'd suggest to take it
 over into some common import file.
 
---- spades.orig/src/spades_pipeline/support.py
-+++ spades/src/spades_pipeline/support.py
-@@ -58,6 +58,22 @@
+--- a/src/spades_pipeline/support.py
++++ b/src/spades_pipeline/support.py
+@@ -63,6 +63,22 @@
      sys.stdout.write("\n\n" + prefix + " " + warn_str + "\n\n\n")
      sys.stdout.flush()
 
@@ -37,8 +37,8 @@
 +    return None
 
  def check_python_version():
-     if sys.version[0:3] not in options_storage.SUPPORTED_PYTHON_VERSIONS:
-@@ -75,8 +91,14 @@
+     def __next_version(version):
+@@ -101,8 +117,14 @@
      for binary in ["hammer", "ionhammer", "spades", "bwa-spades", "dipspades"]:
          binary_path = os.path.join(binary_dir, binary)
          if not os.path.isfile(binary_path):
 
  def check_file_existence(input_filename, message="", log=None, dipspades=False):
      filename = abspath(expanduser(input_filename))
---- spades.orig/spades.py
-+++ spades/spades.py
+--- a/spades.py
++++ b/spades.py
 @@ -156,6 +156,22 @@
      print_value(cfg, "common", "max_memory", "Memory limit (in Gb)", " ")
      log.info("")
 
  def fill_cfg(options_to_parse, log, secondary_filling=False):
-     skip_output_dir=secondary_filling
-@@ -470,7 +486,11 @@
+     skip_output_dir = secondary_filling
+@@ -510,7 +526,11 @@
      if (not options_storage.only_error_correction) and options_storage.mismatch_corrector:
          cfg["mismatch_corrector"] = empty_config()
          cfg["mismatch_corrector"].__dict__["skip-masked"] = None
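The 0003 patch lets the Debian package fall back to binaries on $PATH (e.g. the system bwa) when a bundled binary is missing; the fragments visible above show a helper that returns None when nothing is found, in the style of which(1). A rough C++ analogue of that PATH lookup, purely for illustration (the actual helper is Python and lives in the patch):

```cpp
#include <cstdlib>
#include <filesystem>
#include <iostream>
#include <sstream>
#include <string>

// Scan $PATH for an executable name; return its full path, or "" if absent.
// Sketch only: a faithful port would also check the executable permission bit.
std::string which(const std::string& program) {
    const char* path = std::getenv("PATH");
    if (!path) return "";
    std::stringstream ss(path);
    std::string dir;
    while (std::getline(ss, dir, ':')) {
        std::filesystem::path candidate = std::filesystem::path(dir) / program;
        std::error_code ec;
        if (std::filesystem::is_regular_file(candidate, ec))
            return candidate.string();
    }
    return "";
}

int main() { std::cout << which("bwa") << "\n"; }
```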
diff -Nru spades-3.10.1+dfsg/debian/patches/0111_use_full_config_dirs.patch spades-3.11.1+dfsg/debian/patches/0111_use_full_config_dirs.patch
--- spades-3.10.1+dfsg/debian/patches/0111_use_full_config_dirs.patch	2017-08-08 14:54:45.000000000 +0000
+++ spades-3.11.1+dfsg/debian/patches/0111_use_full_config_dirs.patch	2017-12-18 00:25:36.000000000 +0000
@@ -2,9 +2,9 @@
 This patch makes sure that the config files are not
 dependent on local directory structure.
 Author: Sascha Steinbiss
---- spades.orig/src/spades_pipeline/spades_logic.py
-+++ spades/src/spades_pipeline/spades_logic.py
-@@ -185,6 +185,20 @@
+--- a/src/spades_pipeline/spades_logic.py
++++ b/src/spades_pipeline/spades_logic.py
+@@ -209,6 +209,20 @@
      add_configs(command, dst_configs)
 
@@ -25,7 +25,7 @@
      #print("Calling: " + " ".join(command))
      support.sys_call(command, log)
 
-@@ -232,6 +246,19 @@
+@@ -256,6 +270,19 @@
      command = [os.path.join(execution_home, "scaffold_correction"), cfg_file_name]
      add_configs(command, dst_configs)
      log.info(str(command))
diff -Nru spades-3.10.1+dfsg/debian/patches/0114_use_debian_provided_libs.patch spades-3.11.1+dfsg/debian/patches/0114_use_debian_provided_libs.patch
--- spades-3.10.1+dfsg/debian/patches/0114_use_debian_provided_libs.patch	2017-08-08 14:54:45.000000000 +0000
+++ spades-3.11.1+dfsg/debian/patches/0114_use_debian_provided_libs.patch	2017-12-18 00:25:36.000000000 +0000
@@ -8,8 +8,8 @@
 does not report the 'stats.cactive_max' mallctl stat anymore.
 Hence to avoid a NULL dereference the max rss value is logged.
 Author: Sascha Steinbiss
---- spades.orig/src/cmake/includes.cmake
-+++ spades/src/cmake/includes.cmake
+--- a/src/cmake/includes.cmake
++++ b/src/cmake/includes.cmake
 @@ -3,6 +3,8 @@
  set(CMAKE_INCLUDE_CURRENT_DIR ON)
  set(CMAKE_INCLUDE_SYSTEM_FLAG_C "-isystem ")
@@ -19,8 +19,8 @@
  include_directories(${SPADES_MAIN_INCLUDE_DIR} ${SPADES_BUILT_INCLUDE_DIR} ${CMAKE_SOURCE_DIR} ${SPADES_MODULES_DIR})
  include_directories(SYSTEM "${EXT_DIR}/include")
  include_directories(SYSTEM "${ZLIB_INCLUDE_DIRS}")
---- spades.orig/src/cmake/flags.cmake
-+++ spades/src/cmake/flags.cmake
+--- a/src/cmake/flags.cmake
++++ b/src/cmake/flags.cmake
 @@ -53,3 +53,5 @@
  if (SPADES_USE_TCMALLOC)
    add_compile_options(-fno-omit-frame-pointer)
@@ -28,16 +28,16 @@
 +
 +add_definitions(-DJEMALLOC_NO_DEMANGLE)
 \ No newline at end of file
---- spades.orig/src/common/assembly_graph/CMakeLists.txt
-+++ spades/src/common/assembly_graph/CMakeLists.txt
+--- a/src/common/assembly_graph/CMakeLists.txt
++++ b/src/common/assembly_graph/CMakeLists.txt
 @@ -9,4 +9,4 @@
  add_library(assembly_graph STATIC components/connected_component.cpp paths/bidirectional_path.cpp paths/bidirectional_path_io/io_support.cpp paths/bidirectional_path_io/bidirectional_path_output.cpp graph_support/scaff_supplementary.cpp ../modules/alignment/edge_index_refiller.cpp graph_support/coverage_uniformity_analyzer.cpp)
 -target_link_libraries(assembly_graph hattrie)
 +target_link_libraries(assembly_graph hat-trie)
---- spades.orig/src/common/modules/alignment/kmer_map.hpp
-+++ spades/src/common/modules/alignment/kmer_map.hpp
+--- a/src/common/modules/alignment/kmer_map.hpp
++++ b/src/common/modules/alignment/kmer_map.hpp
 @@ -9,7 +9,7 @@
  #include "sequence/rtseq.hpp"
 
-#include 
+#include 
 #include 
 
 namespace debruijn_graph {
---- spades.orig/src/common/modules/path_extend/overlap_analysis.hpp
-+++ spades/src/common/modules/path_extend/overlap_analysis.hpp
+--- a/src/common/modules/path_extend/overlap_analysis.hpp
++++ b/src/common/modules/path_extend/overlap_analysis.hpp
 @@ -2,7 +2,7 @@
 
 #include "utils/logger/logger.hpp"
- #include "utils/range.hpp"
+ #include "sequence/range.hpp"
-#include "ssw/ssw_cpp.h"
+#include "ssw_cpp.h"
 
 namespace debruijn_graph {
- using omnigraph::Range;
---- spades.orig/src/common/utils/coverage_model/kmer_coverage_model.cpp
-+++ spades/src/common/utils/coverage_model/kmer_coverage_model.cpp
-@@ -18,7 +18,7 @@
- #include 
- #include 
-
--#include 
-+#include 
-
- #include 
---- spades.orig/src/common/utils/logger/logger_impl.cpp
-+++ spades/src/common/utils/logger/logger_impl.cpp
+--- a/src/common/utils/logger/logger_impl.cpp
++++ b/src/common/utils/logger/logger_impl.cpp
 @@ -13,6 +13,7 @@
  #include 
  #include "utils/logger/logger.hpp"
+#include "utils/perf/memory_limit.hpp"
 
  #include "config.hpp"
 @@ -104,15 +105,16 @@
      size_t max_rss;
 
  #ifdef SPADES_USE_JEMALLOC
-    const size_t *cmem = 0, *cmem_max = 0;
+    const size_t *cmem = 0;//, *cmem_max = 0;
     size_t clen = sizeof(cmem);
 
     je_mallctl("stats.cactive", &cmem, &clen, NULL, 0);
-    je_mallctl("stats.cactive_max", &cmem_max, &clen, NULL, 0);
+    //je_mallctl("stats.cactive_max", &cmem_max, &clen, NULL, 0);
 
     mem = (*cmem) / 1024;
-    max_rss = (*cmem_max) / 1024;
+    //max_rss = (*cmem_max) / 1024;
+    max_rss = utils::get_max_rss();
 #else
-    max_rss = get_max_rss();
+    max_rss = utils::get_max_rss();
 #endif
 
     for (auto it = writers_.begin(); it != writers_.end(); ++it)
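As the patch description notes, newer jemalloc no longer reports the stats.cactive_max mallctl, so the logger now records the process peak RSS instead. For illustration, a minimal sketch of what a get_max_rss() helper built on getrusage(2) might look like (hypothetical helper; SPAdes' real one lives in utils/perf/memory_limit.hpp):

```cpp
#include <sys/resource.h>
#include <cstddef>
#include <cstdio>

// Peak resident set size of the calling process, in kilobytes.
// Linux reports ru_maxrss in KB; macOS reports it in bytes.
static size_t get_max_rss_kb() {
    struct rusage usage;
    if (getrusage(RUSAGE_SELF, &usage) != 0) return 0;
#ifdef __APPLE__
    return (size_t)usage.ru_maxrss / 1024;
#else
    return (size_t)usage.ru_maxrss;
#endif
}

int main() { printf("max RSS: %zu KB\n", get_max_rss_kb()); }
```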
---- spades.orig/src/common/utils/memory_limit.hpp
-+++ spades/src/common/utils/memory_limit.hpp
-@@ -20,6 +20,7 @@
- #include 
-
- #include "config.hpp"
-+#include "verify.hpp"
-
- #ifdef SPADES_USE_JEMALLOC
-
---- spades.orig/src/projects/cclean/brute_force_clean.cpp
-+++ spades/src/projects/cclean/brute_force_clean.cpp
-@@ -12,7 +12,7 @@
- #include 
-
- #include "adapter_index.hpp"
--#include  // Striped Smith-Waterman aligner
-+#include "ssw_cpp.h" // Striped Smith-Waterman aligner
- #include "additional.cpp"
- #include "output.hpp"
-
---- spades.orig/src/projects/cclean/job_wrappers.cpp
-+++ spades/src/projects/cclean/job_wrappers.cpp
-@@ -13,7 +13,7 @@
- #include "valid_kmer_generator.hpp"
- #include "adapter_index.hpp"
- #include "output.hpp"
--#include "ssw/ssw_cpp.h"
-+#include "ssw_cpp.h"
- #include "utils.hpp"
-
- using cclean_output::print_alignment;
---- spades.orig/src/projects/cclean/main.cpp
-+++ spades/src/projects/cclean/main.cpp
-@@ -21,7 +21,7 @@
-
- #include "valid_kmer_generator.hpp"
- #include "io/read_processor.hpp"
--#include "modules/ssw_cpp.h"
-+#include "ssw_cpp.h"
- #include "additional.cpp"
-
- #include "job_wrappers.hpp"
---- spades.orig/src/projects/cclean/output.hpp
-+++ spades/src/projects/cclean/output.hpp
-@@ -14,7 +14,7 @@
- #include 
- #include 
- #include "comparator.hpp"
--#include "modules/ssw_cpp.h"
-+#include "ssw_cpp.h"
-
- namespace cclean_output {
-
---- spades.orig/src/projects/cclean/utils.cpp
-+++ spades/src/projects/cclean/utils.cpp
-@@ -12,8 +12,7 @@
- #include 
-
- #include "utils.hpp"
--#include 
--#include  // Striped Smith-Waterman aligner
-+#include  // Striped Smith-Waterman aligner
- #include 
- #include "additional.cpp"
-
---- spades.orig/src/projects/cclean/utils.hpp
-+++ spades/src/projects/cclean/utils.hpp
-@@ -8,7 +8,7 @@
- #ifndef UTILS_HPP
- #define UTILS_HPP
-
--#include  // Striped Smith-Waterman aligner
-+#include  // Striped Smith-Waterman aligner
- #include 
- #include "additional.cpp"
- #include "running_modes.hpp"
---- spades.orig/ext/src/CMakeLists.txt
-+++ spades/ext/src/CMakeLists.txt
+--- a/ext/src/CMakeLists.txt
++++ b/ext/src/CMakeLists.txt
 @@ -1,13 +1,9 @@
  # -*- cmake -*-
 
  add_subdirectory(getopt_pp)
  add_subdirectory(llvm)
 -add_subdirectory(htrie)
+--- a/src/common/modules/coverage_model/kmer_coverage_model.cpp
++++ b/src/common/modules/coverage_model/kmer_coverage_model.cpp
+@@ -18,7 +18,7 @@
+ #include 
+ #include 
+
+-#include 
++#include 
+
+ #include 
+
+--- a/src/common/utils/perf/memory_limit.hpp
++++ b/src/common/utils/perf/memory_limit.hpp
+@@ -28,6 +28,7 @@
+ #endif
+
+ #include 
++#include 
+
+ namespace utils {
+
diff -Nru spades-3.10.1+dfsg/debian/patches/gcc7.patch spades-3.11.1+dfsg/debian/patches/gcc7.patch
--- spades-3.10.1+dfsg/debian/patches/gcc7.patch	2017-08-10 14:39:50.000000000 +0000
+++ spades-3.11.1+dfsg/debian/patches/gcc7.patch	2017-12-18 00:25:36.000000000 +0000
@@ -3,11 +3,11 @@
 Forwarded: https://github.com/ablab/spades/issues/26
 --- a/src/common/utils/segfault_handler.hpp
 +++ b/src/common/utils/segfault_handler.hpp
-@@ -12,6 +12,7 @@
- #include "boost/noncopyable.hpp"
+@@ -13,6 +13,7 @@
+ #include 
  #include 
 +#include 
 
- struct segfault_handler : boost::noncopyable {
- typedef std::function callback_t;
+ namespace utils {
+
diff -Nru spades-3.10.1+dfsg/debian/patches/spelling spades-3.11.1+dfsg/debian/patches/spelling
--- spades-3.10.1+dfsg/debian/patches/spelling	2017-08-08 14:54:45.000000000 +0000
+++ spades-3.11.1+dfsg/debian/patches/spelling	2017-12-18 00:25:36.000000000 +0000
@@ -1,8 +1,8 @@
 From: Michael R. Crusoe 
 Subject: Spelling fixes
---- spades.orig/manual.html
-+++ spades/manual.html
-@@ -760,7 +760,7 @@
+--- a/manual.html
++++ b/manual.html
+@@ -761,7 +761,7 @@

@@ -11,7 +11,7 @@

  
      lib_pe1_left_1.fastq
-@@ -989,7 +989,7 @@
+@@ -990,7 +990,7 @@
  

@@ -20,7 +20,7 @@

  
-@@ -1138,7 +1138,7 @@
+@@ -1139,7 +1139,7 @@
  

@@ -29,8 +29,8 @@

      
      NODE_3_length_237403_cov_243.207_ID_45
---- spades.orig/src/common/assembly_graph/stats/picture_dump.hpp
-+++ spades/src/common/assembly_graph/stats/picture_dump.hpp
+--- a/src/common/assembly_graph/stats/picture_dump.hpp
++++ b/src/common/assembly_graph/stats/picture_dump.hpp
 @@ -123,7 +123,7 @@
          INFO("Genome mapping results:");
          INFO("Covered k+1-mers:" << covered_kp1mers << " of " << (genome_.size() - k_) << " which is "
@@ -40,9 +40,9 @@
          INFO("Continuity failtures " << fail);
      }
  };
---- spades.orig/src/common/modules/alignment/pacbio/pac_index.hpp
-+++ spades/src/common/modules/alignment/pacbio/pac_index.hpp
-@@ -581,7 +581,7 @@
+--- a/src/common/modules/alignment/pacbio/pac_index.hpp
++++ b/src/common/modules/alignment/pacbio/pac_index.hpp
+@@ -593,7 +593,7 @@
                              || !IsConsistent(*(iter->second),
                                               *(next_iter->second))) {
                          if (next_iter != cur_cluster.end()) {
@@ -51,8 +51,8 @@
                              DEBUG("on "<< iter->second->str(g_));
                  DEBUG("and " << next_iter->second->str(g_));
                          }
---- spades.orig/src/projects/dipspades/kmer_gluing/equal_sequence_gluer.hpp
-+++ spades/src/projects/dipspades/kmer_gluing/equal_sequence_gluer.hpp
+--- a/src/projects/dipspades/kmer_gluing/equal_sequence_gluer.hpp
++++ b/src/projects/dipspades/kmer_gluing/equal_sequence_gluer.hpp
 @@ -74,9 +74,9 @@
                  pos2 = graph_.length(edge1) - tmp - 1;
              }
@@ -65,8 +65,8 @@
              TRACE("New edge1: " << graph_.int_id(split_edges.first) << ", length: " << graph_.length(split_edges.first));
              TRACE("New edge2: " << graph_.int_id(split_edges.second) << ", length: " << graph_.length(split_edges.second));
              edge1 = split_edges.first;
---- spades.orig/src/projects/dipspades/polymorphic_bulge_remover/bulge_gluer.hpp
-+++ spades/src/projects/dipspades/polymorphic_bulge_remover/bulge_gluer.hpp
+--- a/src/projects/dipspades/polymorphic_bulge_remover/bulge_gluer.hpp
++++ b/src/projects/dipspades/polymorphic_bulge_remover/bulge_gluer.hpp
 @@ -74,7 +74,7 @@
          auto splitted_bulge = splitter_.SplitBulge(directed_bulge, glue_def_res);
  
@@ -76,8 +76,8 @@
              GlueSplittedBulge(splitted_bulge);
              return true;
          }
---- spades.orig/src/projects/dipspades/polymorphic_bulge_remover/bulge_splitter.hpp
-+++ spades/src/projects/dipspades/polymorphic_bulge_remover/bulge_splitter.hpp
+--- a/src/projects/dipspades/polymorphic_bulge_remover/bulge_splitter.hpp
++++ b/src/projects/dipspades/polymorphic_bulge_remover/bulge_splitter.hpp
 @@ -260,7 +260,7 @@
          SplitOppositeEdge(split_res, split_paths.second, split_partlens.second);
  
diff -Nru spades-3.10.1+dfsg/debian/upstream/metadata spades-3.11.1+dfsg/debian/upstream/metadata
--- spades-3.10.1+dfsg/debian/upstream/metadata	2017-08-08 14:54:45.000000000 +0000
+++ spades-3.11.1+dfsg/debian/upstream/metadata	2017-12-18 00:25:26.000000000 +0000
@@ -1,6 +1,13 @@
 Reference:
- - Author: Anton Bankevich and Sergey Nurk and Dmitry Antipov and Alexey A. Gurevich and Mikhail Dvorkin and Alexander S. Kulikov and Valery M. Lesin and Sergey I. Nikolenko and Son Pham and Andrey D. Prjibelski and Alexey V. Pyshkin and Alexander V. Sirotkin and Nikolay Vyahhi and Glenn Tesler and Max A. Alekseyev and Pavel A. Pevzner 
-   Title: "SPAdes: A New Genome Assembly Algorithm and Its Applications to Single-Cell Sequencing" 
+ - Author: >
+    Anton Bankevich and Sergey Nurk and Dmitry Antipov and Alexey
+    A. Gurevich and Mikhail Dvorkin and Alexander S. Kulikov and Valery
+    M. Lesin and Sergey I. Nikolenko and Son Pham and Andrey D. Prjibelski
+    and Alexey V. Pyshkin and Alexander V. Sirotkin and Nikolay Vyahhi
+    and Glenn Tesler and Max A. Alekseyev and Pavel A. Pevzner
+   Title: >
+    SPAdes: A New Genome Assembly Algorithm and Its Applications to
+    Single-Cell Sequencing
    Journal: Journal of Computational Biology
    Year: 2012
    Volume: 19
@@ -10,7 +17,12 @@
    PMID: 22506599
    URL: http://online.liebertpub.com/doi/abs/10.1089/cmb.2012.0021
    eprint: http://online.liebertpub.com/doi/pdfplus/10.1089/cmb.2012.0021
- - Author: Sergey Nurk and Anton Bankevich and Dmitry Antipov and Alexey Gurevich and Anton Korobeynikov and Alla Lapidus and Andrey Prjibelsky and Alexey Pyshkin and Alexander Sirotkin and Yakov Sirotkin and Ramunas Stepanauskas and Jeffrey McLean and Roger Lasken and Scott Clingenpeel and Tanja Woyke and Glenn Tesler and Max Alekseyev and Pavel Pevzner
+ - Author: >
+    Sergey Nurk and Anton Bankevich and Dmitry Antipov and Alexey Gurevich
+    and Anton Korobeynikov and Alla Lapidus and Andrey Prjibelsky and
+    Alexey Pyshkin and Alexander Sirotkin and Yakov Sirotkin and Ramunas
+    Stepanauskas and Jeffrey McLean and Roger Lasken and Scott Clingenpeel
+    and Tanja Woyke and Glenn Tesler and Max Alekseyev and Pavel Pevzner
    Title: Assembling Genomes and Mini-metagenomes from Highly Chimeric Reads
    Journal: Lecture Notes in Computer Science
    Year: 2013
@@ -18,4 +30,10 @@
    Pages: 158-170
    DOI: 10.1007/978-3-642-37195-0_13
    URL: http://link.springer.com/chapter/10.1007%2F978-3-642-37195-0_13
-
+Registry:
+ - Name: RRID
+   Entry: SCR_000131
+ - Name: OMICtools
+   Entry: OMICS_01502
+ - Name: bio.tools
+   Entry: Spades
diff -Nru spades-3.10.1+dfsg/ext/include/boomphf/BooPHF.h spades-3.11.1+dfsg/ext/include/boomphf/BooPHF.h
--- spades-3.10.1+dfsg/ext/include/boomphf/BooPHF.h	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/ext/include/boomphf/BooPHF.h	2017-09-28 09:05:18.000000000 +0000
@@ -0,0 +1,1422 @@
+// BooPHF library
+// intended to be a minimal perfect hash function with fast and low memory
+// construction, at the cost of (slightly) higher bits/elem than other state of
+// the art libraries once built.  should work with arbitray large number of
+// elements, based on a cascade of "collision-free" bit arrays
+
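The cascade mentioned in the header comment works level by level: each level hashes the still-unplaced keys into a bit array of size gamma times the number of survivors; keys that land alone in a cell are finalized there, colliding keys fall through to the next level, and anything left after the last level goes into an ordinary hash map. A toy illustration of the idea, not the BooPHF API (names invented, std::hash in place of xorshift128*):

```cpp
#include <cstdio>
#include <functional>
#include <string>
#include <vector>

// Place each key at the first level where it hashes to a cell that no other
// still-unplaced key occupies; collided keys fall through to the next level.
std::vector<std::vector<bool>> build_levels(std::vector<std::string> pending,
                                            int nlevels, double gamma) {
    std::vector<std::vector<bool>> levels;
    for (int l = 0; l < nlevels && !pending.empty(); ++l) {
        size_t m = (size_t)(gamma * pending.size()) + 1;
        std::vector<int> hits(m, 0);
        for (auto& k : pending)
            ++hits[std::hash<std::string>{}(k + char('A' + l)) % m];
        std::vector<bool> bits(m, false);
        std::vector<std::string> next;
        for (auto& k : pending) {
            size_t cell = std::hash<std::string>{}(k + char('A' + l)) % m;
            if (hits[cell] == 1) bits[cell] = true; // uniquely placed here
            else next.push_back(k);                 // collision: next level
        }
        levels.push_back(std::move(bits));
        pending.swap(next);
    }
    // keys still pending after the last level go to a fallback hash map
    return levels;
}

int main() {
    auto lv = build_levels({"ACGT", "TTAA", "GGCC"}, 4, 2.0);
    printf("levels used: %zu\n", lv.size());
}
```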
+#pragma once
+#include <stdio.h>
+#include <climits>
+#include <stdlib.h>
+#include <iostream>
+#include <math.h>
+
+#include <array>
+#include <unordered_map>
+#include <vector>
+#include <assert.h>
+#include <sys/time.h>
+#include <string.h>
+#include <memory> // for make_shared
+#include <unistd.h>
+
+//#define CCDEBUG
+//#define PDEBUG
+
+namespace boomphf {
+
+inline uint64_t printPt( pthread_t pt) {
+    unsigned char *ptc = (unsigned char*)(void*)(&pt);
+    uint64_t res =0;
+    for (size_t i=0; i<sizeof(pt); i++) {
+        res += (uint64_t)ptc[i];
+    }
+    return res;
+}
+
+template <typename basetype>
+class bfile_iterator : public std::iterator<std::forward_iterator_tag, basetype>{
+  public:
+
+    bfile_iterator()
+            : _is(nullptr)
+            , _pos(0) ,_inbuff (0), _cptread(0)
+    {
+        _buffsize = 10000;
+        _buffer = (basetype *) malloc(_buffsize*sizeof(basetype));
+    }
+
+    bfile_iterator(const bfile_iterator& cr)
+    {
+        _buffsize = cr._buffsize;
+        _pos = cr._pos;
+        _is = cr._is;
+        _buffer = (basetype *) malloc(_buffsize*sizeof(basetype));
+        memcpy(_buffer,cr._buffer,_buffsize*sizeof(basetype) );
+        _inbuff = cr._inbuff;
+        _cptread = cr._cptread;
+        _elem = cr._elem;
+    }
+
+    bfile_iterator(FILE* is): _is(is) , _pos(0) ,_inbuff (0), _cptread(0)
+    {
+        //printf("bf it %p\n",_is);
+        _buffsize = 10000;
+        _buffer = (basetype *) malloc(_buffsize*sizeof(basetype));
+        int reso = fseek(_is,0,SEEK_SET);
+        advance();
+    }
+
+    ~bfile_iterator()
+    {
+        if(_buffer!=NULL)
+            free(_buffer);
+    }
+
+
+    basetype const& operator*()  {  return _elem;  }
+
+    bfile_iterator& operator++()
+    {
+        advance();
+        return *this;
+    }
+
+    friend bool operator==(bfile_iterator const& lhs, bfile_iterator const& rhs)
+    {
+        if (!lhs._is || !rhs._is)  {  if (!lhs._is && !rhs._is) {  return true; } else {  return false;  } }
+        assert(lhs._is == rhs._is);
+        return rhs._pos == lhs._pos;
+    }
+
+    friend bool operator!=(bfile_iterator const& lhs, bfile_iterator const& rhs)  {  return !(lhs == rhs);  }
+
+  private:
+    void advance()
+    {
+
+        //printf("_cptread %i _inbuff %i \n",_cptread,_inbuff);
+
+        _pos++;
+
+        if(_cptread >= _inbuff)
+        {
+
+            int res = fread(_buffer,sizeof(basetype),_buffsize,_is);
+
+            //printf("read %i new elem last %llu  %p\n",res,_buffer[res-1],_is);
+            _inbuff = res; _cptread = 0;
+
+            if(res == 0)
+            {
+                _is = nullptr;
+                _pos = 0;
+                return;
+            }
+        }
+
+        _elem = _buffer[_cptread];
+        _cptread ++;
+    }
+    basetype _elem;
+    FILE * _is;
+    unsigned long _pos;
+
+    basetype * _buffer; // for buffered read
+    int _inbuff, _cptread;
+    int _buffsize;
+};
+
+
+template <typename basetype>
+class file_binary{
+  public:
+
+    file_binary(const char* filename)
+    {
+        _is = fopen(filename, "rb");
+
+        if (!_is) {
+            throw std::invalid_argument("Error opening " + std::string(filename));
+        }
+    }
+
+    ~file_binary()
+    {
+        fclose(_is);
+    }
+
+    bfile_iterator<basetype> begin() const
+    {
+        return bfile_iterator<basetype>(_is);
+    }
+
+    bfile_iterator<basetype> end() const {return bfile_iterator<basetype>(); }
+
+    size_t        size () const  {  return 0;  }//todo ?
+
+  private:
+    FILE * _is;
+};
+
+
+#define L8 0x0101010101010101ULL // Every lowest 8th bit set: 00000001...
+#define G2 0xAAAAAAAAAAAAAAAAULL // Every highest 2nd bit: 101010...
+#define G4 0x3333333333333333ULL // 00110011 ... used to group the sum of 4 bits.
+#define G8 0x0F0F0F0F0F0F0F0FULL
+
+static inline unsigned popcount_64(uint64_t x) {
+    // Step 1:  00 - 00 = 0;  01 - 00 = 01; 10 - 01 = 01; 11 - 01 = 10;
+    x = x - ((x & G2) >> 1);
+    // step 2:  add 2 groups of 2.
+    x = (x & G4) + ((x >> 2) & G4);
+    // 2 groups of 4.
+    x = (x + (x >> 4)) & G8;
+    // Using a multiply to collect the 8 groups of 8 together.
+    x = x * L8 >> 56;
+    return x;
+}
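popcount_64 is the classic SWAR bit count: it folds pairs, nibbles, and bytes of set-bit sums, then gathers the eight byte sums with one multiply. A standalone check against the compiler builtin (assumes GCC/Clang for __builtin_popcountll; the function below restates the same steps):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// Same SWAR steps as popcount_64 above, restated for a self-contained test.
static unsigned swar_popcount(uint64_t x) {
    x = x - ((x & 0xAAAAAAAAAAAAAAAAULL) >> 1);          // sums of bit pairs
    x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
    x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;          // per-byte sums
    return (unsigned)(x * 0x0101010101010101ULL >> 56);  // add the 8 bytes
}

int main() {
    for (uint64_t x : {0ULL, 1ULL, ~0ULL, 0x0123456789ABCDEFULL})
        assert(swar_popcount(x) == (unsigned)__builtin_popcountll(x));
    puts("ok");
}
```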
+
+///// progress bar
+class Progress
+{
+  public:
+    int timer_mode;
+    struct timeval timestamp;
+    double heure_debut, heure_actuelle ;
+    std::string   message;
+
+    uint64_t done;
+    uint64_t todo;
+    int subdiv ; // progress printed every 1/subdiv of total to do
+    double partial;
+    int _nthreads;
+    std::vector<double> partial_threaded;
+    std::vector<uint64_t> done_threaded;
+
+    double steps ; //steps = todo/subidv
+
+    void init(uint64_t ntasks, const char * msg,int nthreads =1)
+    {
+        _nthreads = nthreads;
+        message = std::string(msg);
+        gettimeofday(&timestamp, NULL);
+        heure_debut = timestamp.tv_sec +(timestamp.tv_usec/1000000.0);
+
+        //fprintf(stderr,"| %-*s |\n",98,msg);
+
+        todo= ntasks;
+        done = 0;
+        partial =0;
+
+        partial_threaded.resize(_nthreads);
+        done_threaded.resize(_nthreads);
+
+        for (int ii=0; ii<_nthreads;ii++) partial_threaded[ii]=0;
+        for (int ii=0; ii<_nthreads;ii++) done_threaded[ii]=0;
+        subdiv= 1000;
+        steps = (double)todo / (double)subdiv;
+
+        if(!timer_mode)
+        {
+            fprintf(stderr,"[");fflush(stderr);
+        }
+    }
+
+    void finish()
+    {
+        set(todo);
+        if(timer_mode)
+            fprintf(stderr,"\n");
+        else
+            fprintf(stderr,"]\n");
+
+        fflush(stderr);
+        todo= 0;
+        done = 0;
+        partial =0;
+
+    }
+    void finish_threaded()// called by only one of the threads
+    {
+        done = 0;
+        double rem = 0;
+        for (int ii=0; ii<_nthreads;ii++) done += (done_threaded[ii] );
+        for (int ii=0; ii<_nthreads;ii++) partial += (partial_threaded[ii] );
+
+        finish();
+
+    }
+    void inc(uint64_t ntasks_done)
+    {
+        done += ntasks_done;
+        partial += ntasks_done;
+
+
+        while(partial >= steps)
+        {
+            if(timer_mode)
+            {
+                gettimeofday(&timestamp, NULL);
+                heure_actuelle = timestamp.tv_sec +(timestamp.tv_usec/1000000.0);
+                double elapsed = heure_actuelle - heure_debut;
+                double speed = done / elapsed;
+                double rem = (todo-done) / speed;
+                if(done>todo) rem=0;
+                int min_e  = (int)(elapsed / 60) ;
+                elapsed -= min_e*60;
+                int min_r  = (int)(rem / 60) ;
+                rem -= min_r*60;
+
+                fprintf(stderr,"%c[%s]  %-5.3g%%   elapsed: %3i min %-2.0f sec   remaining: %3i min %-2.0f sec",13,
+                        message.c_str(),
+                        100*(double)done/todo,
+                        min_e,elapsed,min_r,rem);
+
+            }
+            else
+            {
+                fprintf(stderr,"-");fflush(stderr);
+            }
+            partial -= steps;
+        }
+
+
+    }
+
+    void inc(uint64_t ntasks_done, int tid) //threads collaborate to this same progress bar
+    {
+        partial_threaded[tid] += ntasks_done;
+        done_threaded[tid] += ntasks_done;
+        while(partial_threaded[tid] >= steps)
+        {
+            if(timer_mode)
+            {
+                struct timeval timet;
+                double now;
+                gettimeofday(&timet, NULL);
+                now = timet.tv_sec +(timet.tv_usec/1000000.0);
+                uint64_t total_done  = 0;
+                for (int ii=0; ii<_nthreads;ii++) total_done += (done_threaded[ii] );
+                double elapsed = now - heure_debut;
+                double speed = total_done / elapsed;
+                double rem = (todo-total_done) / speed;
+                if(total_done > todo) rem =0;
+                int min_e  =  (int)(elapsed / 60) ;
+                elapsed -= min_e*60;
+                int min_r  =  (int)(rem / 60) ;
+                rem -= min_r*60;
+
+                fprintf(stderr,"%c[%s]  %-5.3g%%   elapsed: %3i min %-2.0f sec   remaining: %3i min %-2.0f sec",13,
+                        message.c_str(),
+                        100*(double)total_done/todo,
+                        min_e,elapsed,min_r,rem);
+            }
+            else
+            {
+                fprintf(stderr,"-");fflush(stderr);
+            }
+            partial_threaded[tid] -= steps;
+
+        }
+
+    }
+
+    void set(uint64_t ntasks_done) {
+        if (ntasks_done > done)
+            inc(ntasks_done-done);
+    }
+    Progress () :     timer_mode(0) {}
+    //include timer, to print ETA ?
+};
+
+
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark hasher
+////////////////////////////////////////////////////////////////
+
+typedef std::array<uint64_t, 2> hash_pair_t;
+
+typedef hash_pair_t internal_hash_t; // ou hash_pair_t directement ?  __uint128_t
+
+typedef decltype(std::declval< file_binary<internal_hash_t> >().begin()) diskit_hash128_t;
+typedef decltype(std::declval< std::vector< internal_hash_t> >().begin()) vectorit_hash128_t;
+
+struct internalHasher {
+    uint64_t operator()(const internal_hash_t& key) const {
+        uint64_t s0 = key[0];
+        uint64_t s1 = key[1];
+        s1 ^= s1 << 23;
+        return  (s1 ^ s0 ^ (s1 >> 17) ^ (s0 >> 26)) + s0;
+    }
+};
+
+template <class SingleHasher_t> class XorshiftHashFunctors {
+    /*  Xorshift128*
+        Written in 2014 by Sebastiano Vigna (vigna@acm.org)
+
+        To the extent possible under law, the author has dedicated all copyright
+        and related and neighboring rights to this software to the public domain
+        worldwide. This software is distributed without any warranty.
+
+        See <http://creativecommons.org/publicdomain/zero/1.0/>.
+
+        This is the fastest generator passing BigCrush without
+        systematic failures, but due to the relatively short period it is
+        acceptable only for applications with a mild amount of parallelism;
+        otherwise, use a xorshift1024* generator.
+
+        The state must be seeded so that it is not everywhere zero. If you have
+        a nonzero 64-bit seed, we suggest to pass it twice through
+        MurmurHash3's avalanching function. */
+  public:
+    template<typename Item>
+    hash_pair_t hashpair128(const Item& key) const {
+        auto h = singleHasher(key);
+        return { h.first, h.second };
+    }
+
+    //return next hash an update state s
+    uint64_t next(hash_pair_t &s) const {
+        uint64_t s1 = s[0];
+        const uint64_t s0 = s[1];
+        s[0] = s0;
+        s1 ^= s1 << 23; // a
+        return (s[1] = (s1 ^ s0 ^ (s1 >> 17) ^ (s0 >> 26))) + s0; // b, c
+    }
+
+  private:
+    SingleHasher_t singleHasher;
+};
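hashpair128 seeds a 128-bit xorshift128* state from the key's base hash; next() then yields the per-level hashes deterministically, so a key follows the same probe sequence during construction and lookup. A self-contained sketch of the generator step (same update rule as next() above):

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

// xorshift128* step (Vigna): advance the 2x64-bit state, return next hash.
uint64_t next(std::array<uint64_t, 2>& s) {
    uint64_t s1 = s[0];
    const uint64_t s0 = s[1];
    s[0] = s0;
    s1 ^= s1 << 23;                                          // a
    return (s[1] = (s1 ^ s0 ^ (s1 >> 17) ^ (s0 >> 26))) + s0; // b, c
}

int main() {
    std::array<uint64_t, 2> state = {0x9E3779B97F4A7C15ULL, 0xBF58476D1CE4E5B9ULL};
    // The same seed always replays the same per-level hash sequence.
    for (int level = 0; level < 3; ++level)
        printf("level %d hash: %016llx\n", level, (unsigned long long)next(state));
}
```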
+
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark iterators
+////////////////////////////////////////////////////////////////
+
+template <typename Iterator>
+struct iter_range {
+    iter_range(Iterator b, Iterator e)
+            : m_begin(b), m_end(e) {}
+
+    Iterator begin() const { return m_begin; }
+    Iterator end() const { return m_end; }
+
+    Iterator m_begin, m_end;
+};
+
+template <typename Iterator>
+iter_range<Iterator> range(Iterator begin, Iterator end) {
+    return iter_range<Iterator>(begin, end);
+}
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark BitVector
+////////////////////////////////////////////////////////////////
+
+class bitVector {
+
+  public:
+
+    bitVector() : _size(0)
+    {
+        _bitArray = nullptr;
+    }
+
+    bitVector(uint64_t n) : _size(n)
+    {
+        _nchar  = (1ULL+n/64ULL);
+        _bitArray =  (uint64_t *) calloc (_nchar,sizeof(uint64_t));
+    }
+
+    ~bitVector()
+    {
+        if(_bitArray != nullptr)
+            free(_bitArray);
+    }
+
+    //copy constructor
+    bitVector(bitVector const &r)
+    {
+        _size =  r._size;
+        _nchar = r._nchar;
+        _ranks = r._ranks;
+        _bitArray = (uint64_t *) calloc (_nchar,sizeof(uint64_t));
+        memcpy(_bitArray, r._bitArray, _nchar*sizeof(uint64_t) );
+    }
+
+    // Copy assignment operator
+    bitVector &operator=(bitVector const &r)
+    {
+        if (&r != this)
+        {
+            _size =  r._size;
+            _nchar = r._nchar;
+            _ranks = r._ranks;
+            if(_bitArray != nullptr)
+                free(_bitArray);
+            _bitArray = (uint64_t *) calloc (_nchar,sizeof(uint64_t));
+            memcpy(_bitArray, r._bitArray, _nchar*sizeof(uint64_t) );
+        }
+        return *this;
+    }
+
+    // Move assignment operator
+    bitVector &operator=(bitVector &&r)
+    {
+        //printf("bitVector move assignment \n");
+        if (&r != this)
+        {
+            if(_bitArray != nullptr)
+                free(_bitArray);
+
+            _size =  std::move (r._size);
+            _nchar = std::move (r._nchar);
+            _ranks = std::move (r._ranks);
+            _bitArray = r._bitArray;
+            r._bitArray = nullptr;
+        }
+        return *this;
+    }
+    // Move constructor
+    bitVector(bitVector &&r) : _bitArray ( nullptr),_size(0)
+    {
+        *this = std::move(r);
+    }
+
+
+    void resize(uint64_t newsize)
+    {
+        //printf("bitvector resize from  %llu bits to %llu \n",_size,newsize);
+        _nchar  = (1ULL+newsize/64ULL);
+        _bitArray = (uint64_t *) realloc(_bitArray,_nchar*sizeof(uint64_t));
+        _size = newsize;
+    }
+
+    size_t size() const
+    {
+        return _size;
+    }
+
+    uint64_t bitSize() const {return (_nchar*64ULL + _ranks.capacity()*64ULL );}
+
+    //clear whole array
+    void clear()
+    {
+        memset(_bitArray,0,_nchar*sizeof(uint64_t));
+    }
+
+    //clear collisions in interval, only works with start and size multiple of 64
+    void clearCollisions(uint64_t start, size_t size, bitVector * cc)
+    {
+        assert( (start & 63) ==0);
+        assert( (size & 63) ==0);
+        uint64_t ids = (start/64ULL);
+        for(uint64_t ii =0;  ii< (size/64ULL); ii++ )
+        {
+            _bitArray[ids+ii] =  _bitArray[ids+ii] & (~ (cc->get64(ii)) );
+        }
+
+        cc->clear();
+    }
+
+
+    //clear interval, only works with start and size multiple of 64
+    void clear(uint64_t start, size_t size)
+    {
+        assert( (start & 63) ==0);
+        assert( (size & 63) ==0);
+        memset(_bitArray + (start/64ULL),0,(size/64ULL)*sizeof(uint64_t));
+    }
+
+    //for debug purposes
+    void print() const
+    {
+        printf("bit array of size %lli: \n",_size);
+        for(uint64_t ii = 0; ii< _size; ii++)
+        {
+            if(ii%10==0)
+                printf(" (%llu) ",ii);
+            int val = (_bitArray[ii >> 6] >> (ii & 63 ) ) & 1;
+            printf("%i",val);
+        }
+        printf("\n");
+
+        printf("rank array : size %lu \n",_ranks.size());
+        for (uint64_t ii = 0; ii< _ranks.size(); ii++)
+        {
+            printf("%llu :  %lli,  ",ii,_ranks[ii]);
+        }
+        printf("\n");
+    }
+
+    // return value at pos
+    uint64_t operator[](uint64_t pos) const {
+        //unsigned char * _bitArray8 = (unsigned char *) _bitArray;
+        //return (_bitArray8[pos >> 3ULL] >> (pos & 7 ) ) & 1;
+        return (_bitArray[pos >> 6ULL] >> (pos & 63)) & 1;
+    }
+
+    //atomically   return old val and set to 1
+    uint64_t atomic_test_and_set(uint64_t pos) {
+        uint64_t oldval =   __sync_fetch_and_or(_bitArray + (pos >> 6), (uint64_t) (1ULL << (pos & 63)) );
+        return (oldval >> (pos & 63)) & 1;
+    }
+
+
+    uint64_t get(uint64_t pos) const {
+        return (*this)[pos];
+    }
+
+    uint64_t get64(uint64_t cell64) const {
+        return _bitArray[cell64];
+    }
+
+    //set bit pos to 1
+    void set(uint64_t pos) {
+        assert(pos<_size);
+        //_bitArray [pos >> 6] |=   (1ULL << (pos & 63) ) ;
+        __sync_fetch_and_or (_bitArray + (pos >> 6ULL), (1ULL << (pos & 63)) );
+    }
+
+    //set bit pos to 0
+    void reset(uint64_t pos) {
+        //_bitArray [pos >> 6] &=   ~(1ULL << (pos & 63) ) ;
+        __sync_fetch_and_and (_bitArray + (pos >> 6ULL), ~(1ULL << (pos & 63) ));
+    }
+
+    // return value of last rank
+    // add offset to all ranks computed
+    uint64_t build_ranks(uint64_t offset = 0) {
+        _ranks.reserve(2 + _size/_nb_bits_per_rank_sample);
+
+        uint64_t curent_rank = offset;
+        for (size_t ii = 0; ii < _nchar; ii++) {
+            if (((ii*64)  % _nb_bits_per_rank_sample) == 0) {
+                _ranks.push_back(curent_rank);
+            }
+            curent_rank +=  popcount_64(_bitArray[ii]);
+        }
+
+        return curent_rank;
+    }
+
+    uint64_t rank(uint64_t pos) const {
+        uint64_t word_idx = pos / 64ULL;
+        uint64_t word_offset = pos % 64;
+        uint64_t block = pos / _nb_bits_per_rank_sample;
+        uint64_t r = _ranks[block];
+        for (uint64_t w = block * _nb_bits_per_rank_sample / 64; w < word_idx; ++w)
+            r += popcount_64(_bitArray[w]);
+        uint64_t mask = (uint64_t(1) << word_offset ) - 1;
+        r += popcount_64( _bitArray[word_idx] & mask);
+
+        return r;
+    }
+
+
+
+    void save(std::ostream& os) const {
+        os.write(reinterpret_cast<char const*>(&_size), sizeof(_size));
+        os.write(reinterpret_cast<char const*>(&_nchar), sizeof(_nchar));
+        os.write(reinterpret_cast<char const*>(_bitArray), (std::streamsize)(sizeof(uint64_t) * _nchar));
+        size_t sizer = _ranks.size();
+        os.write(reinterpret_cast<char const*>(&sizer),  sizeof(size_t));
+        os.write(reinterpret_cast<char const*>(_ranks.data()), (std::streamsize)(sizeof(_ranks[0]) * _ranks.size()));
+    }
+
+    void load(std::istream& is) {
+        is.read(reinterpret_cast<char*>(&_size), sizeof(_size));
+        is.read(reinterpret_cast<char*>(&_nchar), sizeof(_nchar));
+        this->resize(_size);
+        is.read(reinterpret_cast<char*>(_bitArray), (std::streamsize)(sizeof(uint64_t) * _nchar));
+
+        size_t sizer;
+        is.read(reinterpret_cast<char*>(&sizer),  sizeof(size_t));
+        _ranks.resize(sizer);
+        is.read(reinterpret_cast<char*>(_ranks.data()), (std::streamsize)(sizeof(_ranks[0]) * _ranks.size()));
+    }
+
+
+  protected:
+    uint64_t*  _bitArray;
+    //uint64_t* _bitArray;
+    uint64_t _size;
+    uint64_t _nchar;
+
+    // epsilon =  64 / _nb_bits_per_rank_sample   bits
+    // additional size for rank is epsilon * _size
+    static const uint64_t _nb_bits_per_rank_sample = 512; //512 seems ok
+    std::vector<uint64_t> _ranks;
+};
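The rank structure above samples a running popcount every _nb_bits_per_rank_sample = 512 bits, so rank(pos) costs at most eight word popcounts plus one masked popcount, for a 64/512 = 12.5% space overhead on top of the raw bits. A self-contained sketch of the same sampling scheme (GCC/Clang builtin in place of popcount_64):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Sampled rank in the spirit of bitVector: one cumulative count per 512 bits.
struct RankBits {
    std::vector<uint64_t> words, ranks;
    static const uint64_t kSample = 512; // bits per rank sample (8 words)

    void build() {
        uint64_t run = 0;
        for (size_t i = 0; i < words.size(); ++i) {
            if ((i * 64) % kSample == 0) ranks.push_back(run);
            run += __builtin_popcountll(words[i]);
        }
    }
    uint64_t rank(uint64_t pos) const { // number of set bits in [0, pos)
        uint64_t w = pos / 64, r = ranks[pos / kSample];
        for (uint64_t i = (pos / kSample) * (kSample / 64); i < w; ++i)
            r += __builtin_popcountll(words[i]);
        return r + __builtin_popcountll(words[w] & ((uint64_t(1) << (pos % 64)) - 1));
    }
};

int main() {
    RankBits rb;
    rb.words.assign(16, 0x5555555555555555ULL); // every even bit set
    rb.build();
    printf("%llu\n", (unsigned long long)rb.rank(130)); // 65 ones below bit 130
}
```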
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark level
+////////////////////////////////////////////////////////////////
+
+
+static inline uint64_t fastrange64(uint64_t word, uint64_t p) {
+    //return word %  p;
+    return (uint64_t)(((__uint128_t)word * (__uint128_t)p) >> 64);
+}
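fastrange64 is Lemire's multiply-shift reduction: it maps a 64-bit hash into [0, p) with a single 128-bit multiply instead of the modulo shown in the commented-out line. The result is equidistributed but not equal to `word % p`; it scales the hash down order-preservingly. A quick demonstration (assumes a compiler with __uint128_t, as this header already does):

```cpp
#include <cstdint>
#include <cstdio>

static uint64_t fastrange64(uint64_t word, uint64_t p) {
    return (uint64_t)(((__uint128_t)word * (__uint128_t)p) >> 64);
}

int main() {
    const uint64_t p = 1000;
    // Low hashes land near 0, high hashes near p-1.
    printf("%llu\n", (unsigned long long)fastrange64(0, p));         // 0
    printf("%llu\n", (unsigned long long)fastrange64(~0ULL / 2, p)); // 499
    printf("%llu\n", (unsigned long long)fastrange64(~0ULL, p));     // 999
}
```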
+
+class level{
+  public:
+    level() {}
+
+    ~level() {}
+
+    uint64_t get(uint64_t hash_raw) const {
+        //	uint64_t hashi =    hash_raw %  hash_domain; //
+        //uint64_t hashi = (uint64_t)(  ((__uint128_t) hash_raw * (__uint128_t) hash_domain) >> 64ULL);
+        uint64_t hashi = fastrange64(hash_raw,hash_domain);
+        return bitset.get(hashi);
+    }
+
+    uint64_t idx_begin;
+    uint64_t hash_domain;
+    bitVector bitset;
+};
+
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark mphf
+////////////////////////////////////////////////////////////////
+
+#define NBBUFF 10000
+//#define NBBUFF 2
+
+template <typename elem_t, typename Hasher_t, typename Range, typename it_type>
+struct thread_args {
+    void * boophf;
+    Range const * range;
+    std::shared_ptr<void> it_p; /* used to be "Iterator it" but because of fastmode, iterator is polymorphic; TODO: think about whether it should be a unique_ptr actually */
+    std::shared_ptr<void> until_p; /* to cache the "until" variable */
+    int level;
+};
+
+//forward declaration
+
+template <typename elem_t, typename Hasher_t, typename Range, typename it_type>
+void * thread_processLevel(void * args);
+
+/* Hasher_t returns a single hash when operator()(elem_t key) is called.
+   if used with XorshiftHashFunctors, it must have the following operator: operator()(elem_t key, uint64_t seed) */
+template <typename Hasher_t>
+class mphf {
+    /* this mechanisms gets P hashes out of Hasher_t */
+    typedef XorshiftHashFunctors<Hasher_t> MultiHasher_t ;
+
+  public:
+    mphf()
+            : _built(false) {}
+
+    ~mphf() {}
+
+    // allow perc_elem_loaded  elements to be loaded in ram for faster construction (default 3%), set to 0 to desactivate
+    template <typename Range>
+    mphf(size_t n, const Range &input_range,
+         int num_thread = 1,
+         double gamma = 2.0,
+         bool writeEach = true, bool progress =true, float perc_elem_loaded = 0.03)
+            :
+            _nb_levels(0), _gamma(gamma), _hash_domain(size_t(ceil(double(n) * gamma))), _nelem(n), _num_thread(num_thread), _percent_elem_loaded_for_fastMode (perc_elem_loaded), _withprogress(progress) {
+        if (n ==0)
+            return;
+
+        _fastmode = false;
+        if (_percent_elem_loaded_for_fastMode > 0.0)
+            _fastmode =true;
+
+        if (writeEach) {
+            _writeEachLevel =true;
+            _fastmode = false;
+        } else {
+            _writeEachLevel = false;
+        }
+
+        setup();
+
+        if (_withprogress) {
+            _progressBar.timer_mode=1;
+
+            double total_raw = _nb_levels;
+
+            double sum_geom_read =  ( 1.0 / (1.0 - _proba_collision));
+            double total_writeEach = sum_geom_read + 1.0;
+
+            double total_fastmode_ram =  (_fastModeLevel+1) +  ( pow(_proba_collision,_fastModeLevel)) * (_nb_levels-(_fastModeLevel+1))   ;
+
+            printf("for info, total work write each  : %.3f    total work inram from level %i : %.3f  total work raw : %.3f \n",total_writeEach,_fastModeLevel,total_fastmode_ram,total_raw);
+
+            if(writeEach)
+            {
+                _progressBar.init(_nelem * total_writeEach, "Building BooPHF",num_thread);
+            }
+            else if(_fastmode)
+                _progressBar.init( _nelem * total_fastmode_ram, "Building BooPHF",num_thread);
+            else
+                _progressBar.init( _nelem * _nb_levels ,"Building BooPHF",num_thread);
+        }
+
+        uint64_t offset = 0;
+        for(int ii = 0; ii< _nb_levels; ii++)
+        {
+            _tempBitset =  new bitVector(_levels[ii].hash_domain); // temp collision bitarray for this level
+
+            processLevel(input_range,ii);
+
+            _levels[ii].bitset.clearCollisions(0 , _levels[ii].hash_domain , _tempBitset);
+
+            offset = _levels[ii].bitset.build_ranks(offset);
+
+            delete _tempBitset;
+        }
+
+        if(_withprogress)
+            _progressBar.finish_threaded();
+
+
+        _lastbitsetrank = offset ;
+
+        //printf("used temp ram for construction : %lli MB \n",setLevelFastmode.capacity()* sizeof(elem_t) /1024ULL/1024ULL);
+
+        std::vector().swap(setLevelFastmode);   // clear setLevelFastmode reallocating
+
+
+        pthread_mutex_destroy(&_mutex);
+
+        _built = true;
+    }
+
+
+    template
+    uint64_t lookup(elem_t elem) {
+        if (!_built) return ULLONG_MAX;
+
+        //auto hashes = _hasher(elem);
+        uint64_t non_minimal_hp,minimal_hp;
+        int level;
+
+        hash_pair_t bbhash = _hasher.hashpair128(elem);
+        uint64_t level_hash = getLevel(bbhash, elem, &level);
+
+        if (level == (_nb_levels-1)) {
+            //auto in_final_map  = _final_hash.find (elem);
+            auto in_final_map  = _final_hash.find(bbhash);
+            if (in_final_map == _final_hash.end()) {
+                //elem was not in original set of keys
+                return ULLONG_MAX; //  means elem not in set
+            } else {
+                minimal_hp =  in_final_map->second + _lastbitsetrank;
+                //printf("lookup %llu  level %i   --> %llu \n",elem,level,minimal_hp);
+                return minimal_hp;
+            }
+            //				minimal_hp = _final_hash[elem] + _lastbitsetrank;
+            //				return minimal_hp;
+        } else {
+            //non_minimal_hp =  level_hash %  _levels[level].hash_domain; // in fact non minimal hp would be  + _levels[level]->idx_begin
+            non_minimal_hp = fastrange64(level_hash,_levels[level].hash_domain);
+        }
+        minimal_hp = _levels[level].bitset.rank(non_minimal_hp);
+        //	printf("lookup %llu  level %i   --> %llu \n",elem,level,minimal_hp);
+
+        return minimal_hp;
+    }
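+
+    // Usage sketch (illustrative only): for the n keys passed at construction,
+    // lookup() returns a distinct index in [0, size()). For a key outside the
+    // original set the result is unspecified unless the miss is caught at the
+    // last level, in which case ULLONG_MAX is returned:
+    //   mphf<uint64_t, SomeHasherT> bphf(keys.size(), keys, /*threads*/ 4, /*gamma*/ 2.0);
+    //   uint64_t idx = bphf.lookup(keys[0]);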
+
+    uint64_t size() const {
+        return _nelem;
+    }
+
+    uint64_t mem_size() const {
+        uint64_t totalsizeBitset = 0;
+        for (int ii = 0; ii < _nb_levels; ii++)
+            totalsizeBitset += _levels[ii].bitset.bitSize();
+
+        uint64_t totalsize = totalsizeBitset +  _final_hash.size()*42*8 ;  // unordered map takes approx 42B per elem [personal test] (42B with uint64_t key, would be larger for other type of elem)
+
+        /*
+        printf("Bitarray    %12llu  bits (%.2f %%)   (array + ranks )\n",
+               totalsizeBitset, 100*(float)totalsizeBitset/totalsize);
+        printf("final hash  %12lu  bits (%.2f %%) (nb in final hash %lu)\n",
+               _final_hash.size()*42*8, 100*(float)(_final_hash.size()*42*8)/totalsize,
+               _final_hash.size() );
+        */
+
+        return totalsize / 8;
+    }
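+
+    // Rough worked example: if 1000 keys end up in the final hash, they add about
+    // 1000 * 42 * 8 bits = 42 KB to the total reported above.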
+
+    template  void fillBuffer(std::vector &buffer, std::shared_ptr shared_it, std::shared_ptr until_p,
+                                                 uint64_t &inbuff, bool & isRunning) {
+        auto until = *until_p;
+        pthread_mutex_lock(&_mutex);
+        for (; inbuff &buffer,
+                     std::shared_ptr shared_it, std::shared_ptr until_p, uint64_t & inbuff, bool & isRunning) {
+        fillBufferCommon128(buffer,shared_it,until_p,inbuff,isRunning);
+    }
+    void fillBuffer (std::vector  & buffer,std::shared_ptr shared_it, std::shared_ptr until_p,uint64_t & inbuff, bool & isRunning) {
+        fillBufferCommon128(buffer,shared_it,until_p,inbuff,isRunning);
+    }
+
+    template <typename Iterator> void fillBufferCommon128(std::vector<internal_hash_t> & buffer, std::shared_ptr<Iterator> shared_it, std::shared_ptr<Iterator> until_p,
+                                                          uint64_t &inbuff, bool &isRunning) {
+        auto until = *until_p;
+        pthread_mutex_lock(&_mutex);
+        for (; inbuff  //typename Range,
+    void pthread_processLevel(std::vector &buffer, std::shared_ptr shared_it, std::shared_ptr until_p, int i) {
+        uint64_t nb_done =0;
+        int tid =  __sync_fetch_and_add (&_nb_living, 1);
+        uint64_t inbuff =0;
+
+        uint64_t writebuff =0;
+        std::vector< internal_hash_t > & myWriteBuff = bufferperThread[tid];
+        for (bool isRunning=true;  isRunning ; ) {
+            //safely copy n items into buffer
+            //call to specialized function according to iterator type (may be iterator over keys (first 2 levels), or iterator over 128 bit hashes)
+            fillBuffer(buffer,shared_it,until_p,inbuff,isRunning);
+
+            //do work on the n elems of the buffer
+            for (uint64_t ii=0; ii= setLevelFastmode.size())
+                            _fastmode = false;
+                        else
+                            setLevelFastmode[idxl2] = val; // create set for fast mode
+                    }
+
+                    //insert to level i+1 : either next level of the cascade or final hash if last level reached
+                    if(i == _nb_levels-1) //stop cascade here, insert into exact hash
+                    {
+
+                        uint64_t hashidx =  __sync_fetch_and_add(&_hashidx, 1);
+
+                        pthread_mutex_lock(&_mutex); //see later if possible to avoid this, but few items get here
+                        // compute the end rank of the previous level somewhere, then init hashidx with that rank: directly minimal, no need to insert into bitset and rank
+
+                        if (_final_hash.count(val)) // key already in final hash
+                        {
+                            fprintf(stderr,"The impossible happened : collision on 128 bit hashes... please switch to safe branch, and play the lottery.");
+                            fprintf(stderr,"Another more likely explanation might be that you have duplicate keys in your input.\
+                                        If so, you can ignore this message, but be aware that too many duplicate keys will increase ram usage\n");
+                        }
+                        _final_hash[val] = hashidx;
+
+
+                        pthread_mutex_unlock(&_mutex);
+                    }
+                    else
+                    {
+
+                        //these elements reached this level
+                        //insert elem into current level on disk --> will be used at level+1 (but still needs filtering)
+
+                        if(_writeEachLevel && i > 0 && i < _nb_levels -1)
+                        {
+                            if(writebuff>=NBBUFF)
+                            {
+                                //flush buffer
+                                flockfile(_currlevelFile);
+                                fwrite(myWriteBuff.data(),sizeof(internal_hash_t),writebuff,_currlevelFile);
+                                funlockfile(_currlevelFile);
+                                writebuff = 0;
+
+                            }
+
+                            myWriteBuff[writebuff++] = val;
+
+                        }
+
+
+
+                        //#ifdef PDEBUG
+                        //							printf("inserting into level %i ",i);
+                        //#endif
+
+
+                        // computes next hash
+                        if (level == 0)
+                            level_hash = bbhash[0];
+                        else if (level == 1)
+                            level_hash = bbhash[1];
+                        else
+                            level_hash = _hasher.next(bbhash);
+                        insertIntoLevel(level_hash,i); //should be safe
+                    }
+                }
+
+                nb_done++;
+                if ((nb_done&1023) ==0  && _withprogress) {_progressBar.inc(nb_done,tid);nb_done=0; }
+
+            }
+
+            inbuff = 0;
+        }
+
+        if(_writeEachLevel && writebuff>0) {
+            //flush buffer
+            flockfile(_currlevelFile);
+            fwrite(myWriteBuff.data(),sizeof(internal_hash_t),writebuff,_currlevelFile);
+            funlockfile(_currlevelFile);
+            writebuff = 0;
+        }
+
+    }
+
+
+    void save(std::ostream& os) const {
+        os.write(reinterpret_cast<const char*>(&_gamma), sizeof(_gamma));
+        os.write(reinterpret_cast<const char*>(&_nb_levels), sizeof(_nb_levels));
+        os.write(reinterpret_cast<const char*>(&_lastbitsetrank), sizeof(_lastbitsetrank));
+        os.write(reinterpret_cast<const char*>(&_nelem), sizeof(_nelem));
+        for(int ii=0; ii<_nb_levels; ii++)
+        {
+            _levels[ii].bitset.save(os);
+        }
+
+        //save final hash
+        size_t final_hash_size = _final_hash.size();
+
+        os.write(reinterpret_cast<const char*>(&final_hash_size), sizeof(size_t));
+
+        for (auto it = _final_hash.begin(); it != _final_hash.end(); ++it )
+        {
+            os.write(reinterpret_cast<const char*>(&(it->first)), sizeof(internal_hash_t));
+            os.write(reinterpret_cast<const char*>(&(it->second)), sizeof(uint64_t));
+        }
+
+    }
+
+    void load(std::istream& is) {
+        is.read(reinterpret_cast<char*>(&_gamma), sizeof(_gamma));
+        is.read(reinterpret_cast<char*>(&_nb_levels), sizeof(_nb_levels));
+        is.read(reinterpret_cast<char*>(&_lastbitsetrank), sizeof(_lastbitsetrank));
+        is.read(reinterpret_cast<char*>(&_nelem), sizeof(_nelem));
+
+        _levels.resize(_nb_levels);
+
+
+        for(int ii=0; ii<_nb_levels; ii++)
+        {
+            //_levels[ii].bitset = new bitVector();
+            _levels[ii].bitset.load(is);
+        }
+
+
+
+        //mini setup, recompute size of each level
+        _proba_collision = 1.0 -  pow(((_gamma*(double)_nelem -1 ) / (_gamma*(double)_nelem)),_nelem-1);
+        uint64_t previous_idx =0;
+        _hash_domain = (size_t)  (ceil(double(_nelem) * _gamma)) ;
+        for(int ii=0; ii<_nb_levels; ii++)
+        {
+            //_levels[ii] = new level();
+            _levels[ii].idx_begin = previous_idx;
+            _levels[ii].hash_domain =  (( (uint64_t) (_hash_domain * pow(_proba_collision,ii)) + 63) / 64 ) * 64;
+            if(_levels[ii].hash_domain == 0 )
+                _levels[ii].hash_domain  = 64 ;
+            previous_idx += _levels[ii].hash_domain;
+        }
+
+        //restore final hash
+
+        _final_hash.clear();
+        size_t final_hash_size ;
+
+        is.read(reinterpret_cast<char*>(&final_hash_size), sizeof(size_t));
+
+        for (unsigned int ii=0; ii<final_hash_size; ii++) {
+            internal_hash_t key;
+            uint64_t value;
+
+            is.read(reinterpret_cast<char*>(&key), sizeof(internal_hash_t));
+            is.read(reinterpret_cast<char*>(&value), sizeof(uint64_t));
+
+            _final_hash[key] = value;
+        }
+        _built = true;
+    }
+
+
+  private :
+
+    void setup()
+    {
+        pthread_mutex_init(&_mutex, NULL);
+
+        _pid = getpid() + printPt(pthread_self()) ;// + pthread_self();
+        //printf("pt self %llu  pid %i \n",printPt(pthread_self()),_pid);
+
+        _cptTotalProcessed=0;
+
+
+        if(_fastmode)
+        {
+            setLevelFastmode.resize(_percent_elem_loaded_for_fastMode * (double)_nelem );
+        }
+
+
+        bufferperThread.resize(_num_thread);
+        if(_writeEachLevel)
+        {
+            for(int ii=0; ii<_num_thread; ii++)
+            {
+                bufferperThread[ii].resize(NBBUFF);
+            }
+        }
+
+        _proba_collision = 1.0 -  pow(((_gamma*(double)_nelem -1 ) / (_gamma*(double)_nelem)),_nelem-1);
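+        // i.e. the probability that a given key lands in a cell shared with at
+        // least one of the other _nelem-1 keys among _gamma*_nelem cells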
+
+        double sum_geom =_gamma * ( 1.0 +  _proba_collision / (1.0 - _proba_collision));
+        //printf("proba collision %f  sum_geom  %f   \n",_proba_collision,sum_geom);
+
+        _nb_levels = 25; // 25
+        _levels.resize(_nb_levels);
+
+        //build levels
+        uint64_t previous_idx =0;
+        for(int ii=0; ii<_nb_levels; ii++)
+        {
+
+            _levels[ii].idx_begin = previous_idx;
+
+            // round size to nearest superior multiple of 64, makes it easier to clear a level
+            _levels[ii].hash_domain =  (( (uint64_t) (_hash_domain * pow(_proba_collision,ii)) + 63) / 64 ) * 64;
+            if(_levels[ii].hash_domain == 0 ) _levels[ii].hash_domain  = 64 ;
+            previous_idx += _levels[ii].hash_domain;
+
+            //printf("build level %i bit array : start %12llu, size %12llu  ",ii,_levels[ii]->idx_begin,_levels[ii]->hash_domain );
+            //printf(" expected elems : %.2f %% total \n",100.0*pow(_proba_collision,ii));
+
+        }
+
+        for(int ii=0; ii<_nb_levels; ii++)
+        {
+            if(pow(_proba_collision,ii) < _percent_elem_loaded_for_fastMode)
+            {
+                _fastModeLevel = ii;
+                //printf("fast mode level :  %i \n",ii);
+                break;
+            }
+        }
+    }
+
+    //overload getLevel with either elem_t or internal_hash_t
+    template
+    uint64_t getLevel(hash_pair_t bbhash, elem_t val, int *res_level, int maxlevel = 100, int minlevel = 0) const {
+        int level = 0;
+        uint64_t hash_raw=0;
+
+        for (int ii = 0; ii < (_nb_levels-1) && ii < maxlevel ; ii++) {
+            //compute the next hash
+            if (ii == 0)
+                hash_raw = bbhash[0];
+            else if (ii == 1)
+                hash_raw = bbhash[1];
+            else
+                hash_raw = _hasher.next(bbhash);
+
+            if (ii >= minlevel && _levels[ii].get(hash_raw))
+                break;
+
+            level++;
+        }
+
+        *res_level = level;
+        return hash_raw;
+    }
+
+
+    // compute level and returns hash of last level reached
+    // FIXME: The usage of getLevel here is *super* confusing, really.
+    uint64_t getLevel(internal_hash_t &bbhash,int * res_level, int maxlevel = 100, int minlevel =0) const {
+        int level = 0;
+        uint64_t hash_raw=0;
+
+        for (int ii = 0; ii<(_nb_levels-1) &&  ii < maxlevel ; ii++) {
+            //compute the next hash
+            if (ii == 0)
+                hash_raw = bbhash[0];
+            else if (ii == 1)
+                hash_raw = bbhash[1];
+            else
+                hash_raw = _hasher.next(bbhash);
+
+            if (ii >= minlevel && _levels[ii].get(hash_raw))
+                break;
+
+            level++;
+        }
+
+        *res_level = level;
+        return hash_raw;
+    }
+
+
+    //insert into bitarray
+    void insertIntoLevel(uint64_t level_hash, int i) {
+        //	uint64_t hashl =  level_hash % _levels[i].hash_domain;
+        uint64_t hashl = fastrange64( level_hash,_levels[i].hash_domain);
+
+        //#ifdef PDEBUG
+        //			printf(" :  %llu / %llu \n ",hashl,_levels[i].hash_domain);
+        //#endif
+
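+        // atomic_test_and_set returns the previous bit value: if the cell was
+        // already taken by another key, mark it as a collision in _tempBitset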
+        if (_levels[i].bitset.atomic_test_and_set(hashl))
+            _tempBitset->atomic_test_and_set(hashl);
+    }
+
+    //loop to insert into level i
+    template 
+    void processLevel(Range const& input_range,int i) {
+        ////alloc the bitset for this level
+        _levels[i].bitset = bitVector(_levels[i].hash_domain);
+
+        //printf("---process level %i   wr %i fast %i ---\n",i,_writeEachLevel,_fastmode);
+
+        char fname_old[1000];
+        sprintf(fname_old,"temp_p%i_level_%i",_pid,i-2);
+
+        char fname_curr[1000];
+        sprintf(fname_curr,"temp_p%i_level_%i",_pid,i);
+
+        char fname_prev[1000];
+        sprintf(fname_prev,"temp_p%i_level_%i",_pid,i-1);
+
+        if (_writeEachLevel) {
+            //file management :
+            if(i>2) //delete previous file
+            {
+                unlink(fname_old);
+            }
+
+            if(i< _nb_levels-1 && i > 0 ) //create curr file
+            {
+                _currlevelFile = fopen(fname_curr,"w");
+            }
+        }
+
+
+        _cptLevel = 0;
+        _hashidx = 0;
+        _idxLevelsetLevelFastmode =0;
+        _nb_living =0;
+        //create  threads
+        pthread_t *tab_threads= new pthread_t [_num_thread];
+        typedef decltype(input_range.begin()) it_type;
+        thread_args<elem_t, Hasher_t, Range, it_type> t_arg; // same args for all threads
+        t_arg.boophf = this;
+        t_arg.range = &input_range;
+        t_arg.it_p = std::static_pointer_cast<void>(std::make_shared<it_type>(input_range.begin()));
+        t_arg.until_p = std::static_pointer_cast<void>(std::make_shared<it_type>(input_range.end()));
+
+        t_arg.level = i;
+
+        if (_writeEachLevel && (i > 1)) {
+
+            auto data_iterator_level = file_binary(fname_prev);
+
+            //typedef decltype(data_iterator_level.begin()) disklevel_it_type;
+            //diskit_hash128_t
+
+            //data_iterator_level.begin();
+
+            t_arg.it_p = std::static_pointer_cast<void>(std::make_shared<diskit_hash128_t>(data_iterator_level.begin()));
+            t_arg.until_p = std::static_pointer_cast<void>(std::make_shared<diskit_hash128_t>(data_iterator_level.end()));
+
+            for (int ii=0;ii<_num_thread;ii++)
+                pthread_create(&tab_threads[ii], NULL, thread_processLevel<elem_t, Hasher_t, Range, diskit_hash128_t>, &t_arg); //&t_arg[ii]
+
+
+            //must join here before the block is closed and file_binary is destroyed (and closes the file)
+            for(int ii=0;ii<_num_thread;ii++)
+            {
+                pthread_join(tab_threads[ii], NULL);
+            }
+
+        } else {
+            if (_fastmode && i >= (_fastModeLevel+1)) {
+                //   we'd like to do t_arg.it = data_iterator.begin() but types are different;
+                //   so, casting to (void*) because of that; and we remember the type in the template
+                //	typedef decltype(setLevelFastmode.begin()) fastmode_it_type; // vectorit_hash128_t
+                t_arg.it_p =  std::static_pointer_cast<void>(std::make_shared<vectorit_hash128_t>(setLevelFastmode.begin()));
+                t_arg.until_p =  std::static_pointer_cast<void>(std::make_shared<vectorit_hash128_t>(setLevelFastmode.end()));
+
+                //       we'd like to do t_arg.it = data_iterator.begin() but types are different;
+                //       so, casting to (void*) because of that; and we remember the type in the template
+
+                for (int ii=0;ii<_num_thread;ii++)
+                    pthread_create (&tab_threads[ii], NULL,  thread_processLevel<elem_t, Hasher_t, Range, vectorit_hash128_t>, &t_arg); //&t_arg[ii]
+
+            } else {
+                //printf(" _ _ basic mode \n");
+                for(int ii=0;ii<_num_thread;ii++)
+                    pthread_create (&tab_threads[ii], NULL,  thread_processLevel<elem_t, Hasher_t, Range, it_type>, &t_arg); //&t_arg[ii]
+            }
+            //joining
+            for(int ii=0;ii<_num_thread;ii++)
+            {
+                pthread_join(tab_threads[ii], NULL);
+            }
+        }
+
+#ifdef CCDEBUG
+        printf("\ngoing to level %i  : %llu elems  %.2f %%  expected : %.2f %% \n",i,_cptLevel,100.0* _cptLevel/(float)_nelem,100.0* pow(_proba_collision,i) );
+#endif
+        //printf("\ncpt total processed %llu \n",_cptTotalProcessed);
+        if(_fastmode && i == _fastModeLevel) //shrink to actual number of elements in set
+        {
+            //printf("\nresize setLevelFastmode to %lli \n",_idxLevelsetLevelFastmode);
+            setLevelFastmode.resize(_idxLevelsetLevelFastmode);
+        }
+        delete [] tab_threads;
+
+        if(_writeEachLevel)
+        {
+            if(i< _nb_levels-1 && i>0)
+            {
+                fflush(_currlevelFile);
+                fclose(_currlevelFile);
+            }
+
+            if(i== _nb_levels- 1) //delete last file
+            {
+                unlink(fname_prev);
+            }
+        }
+
+    }
+
+  private:
+    std::vector<level> _levels;
+    int _nb_levels;
+    MultiHasher_t _hasher;
+    bitVector * _tempBitset;
+
+    double _gamma;
+    uint64_t _hash_domain;
+    uint64_t _nelem;
+    std::unordered_map<internal_hash_t, uint64_t, internalHasher> _final_hash; // internalHasher   Hasher_t
+    Progress _progressBar;
+    int _nb_living;
+    int _num_thread;
+    uint64_t _hashidx;
+    double _proba_collision;
+    uint64_t _lastbitsetrank;
+    uint64_t _idxLevelsetLevelFastmode;
+    uint64_t _cptLevel;
+    uint64_t _cptTotalProcessed;
+
+    // fast build mode , requires  that _percent_elem_loaded_for_fastMode %   elems are loaded in ram
+    float _percent_elem_loaded_for_fastMode ;
+    bool _fastmode;
+    std::vector< internal_hash_t > setLevelFastmode;
+    //	std::vector< internal_hash_t > setLevelFastmode_next; // TODO: shrink the in-RAM set at each level?
+
+    std::vector< std::vector< internal_hash_t > > bufferperThread;
+
+    int _fastModeLevel;
+    bool _withprogress;
+    bool _built;
+    bool _writeEachLevel;
+    FILE * _currlevelFile;
+    int _pid;
+  public:
+    pthread_mutex_t _mutex;
+};
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark threading
+////////////////////////////////////////////////////////////////
+
+template <typename elem_t, typename Hasher_t, typename Range, typename it_type>
+void *thread_processLevel(void * args) {
+    if (args ==NULL) return NULL;
+
+    thread_args<elem_t, Hasher_t, Range, it_type> *targ = (thread_args<elem_t, Hasher_t, Range, it_type>*) args;
+    mphf<elem_t, Hasher_t> * obw = (mphf<elem_t, Hasher_t> *) targ->boophf;
+    int level = targ->level;
+    std::vector buffer;
+    buffer.resize(NBBUFF);
+
+    pthread_mutex_t * mutex =  & obw->_mutex;
+
+    pthread_mutex_lock(mutex); // from comment above: "//get starting iterator for this thread, must be protected (must not be currently used by other thread to copy elems in buff)"
+    std::shared_ptr<it_type> startit = std::static_pointer_cast<it_type>(targ->it_p);
+    std::shared_ptr<it_type> until_p = std::static_pointer_cast<it_type>(targ->until_p);
+    pthread_mutex_unlock(mutex);
+
+    obw->pthread_processLevel(buffer, startit, until_p, level);
+
+    return NULL;
+}
+}
diff -Nru spades-3.10.1+dfsg/ext/include/ConsensusCore/Feature.hpp spades-3.11.1+dfsg/ext/include/ConsensusCore/Feature.hpp
--- spades-3.10.1+dfsg/ext/include/ConsensusCore/Feature.hpp	2017-02-28 14:55:42.000000000 +0000
+++ spades-3.11.1+dfsg/ext/include/ConsensusCore/Feature.hpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,166 +0,0 @@
-// Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-//  * Redistributions of source code must retain the above copyright
-//    notice, this list of conditions and the following disclaimer.
-//
-//  * Redistributions in binary form must reproduce the above
-//    copyright notice, this list of conditions and the following
-//    disclaimer in the documentation and/or other materials provided
-//    with the distribution.
-//
-//  * Neither the name of Pacific Biosciences nor the names of its
-//    contributors may be used to endorse or promote products derived
-//    from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
-// Author: David Alexander
-
-#pragma once
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#include "Types.hpp"
-
-namespace ConsensusCore
-{
-    // Feature/Features object usage caveats:
-    //  - Feature and Features objects _must_ be stored by value, not reference
-    //  - The underlying array must be allocated using new[]
-    template <typename T>
-    class Feature : private boost::shared_array<T>
-    {
-    public:
-        // \brief Allocate a new feature object, copying content from ptr.
-        Feature(const T* ptr, int length)
-            : boost::shared_array<T>(new T[length]),
-              length_(length)
-        {
-            assert(length >= 0);
-            std::copy(ptr, ptr + length, get());
-        }
-
-        // \brief Allocate and zero-fill a new feature object of given length.
-        explicit Feature(int length)
-            : boost::shared_array<T>(new T[length]()),
-              length_(length)
-        {
-            assert(length >= 0);
-        }
-
-        int Length() const
-        {
-            return length_;
-        }
-
-        const T& operator[](int i) const
-        {
-            return this->boost::shared_array<T>::operator[](i);
-        }
-
-        T& operator[](int i)
-        {
-            return this->boost::shared_array<T>::operator[](i);
-        }
-
-        T ElementAt(int i) const
-        {
-            return (*this)[i];
-        }
-
-    private:
-        int length_;
-
-#ifndef SWIG
-    public:
-        T* get()
-        {
-            return this->boost::shared_array<T>::get();
-        }
-
-        const T* get() const
-        {
-            return this->boost::shared_array<T>::get();
-        }
-
-        operator std::string() const;
-#endif  // !SWIG
-    };
-
-
-#ifndef SWIG
-    //
-    // Support for boost::foreach
-    //
-    template<typename T>
-    inline const T* range_begin(const Feature<T>& f)
-    {
-        return f.get();
-    }
-
-    template<typename T>
-    inline const T* range_end(const Feature<T>& f)
-    {
-        return f.get() + f.Length();
-    }
-
-    template<typename T>
-    inline T* range_begin(Feature<T>& f) // NOLINT
-    {
-        return f.get();
-    }
-
-    template<typename T>
-    inline T* range_end(Feature<T>& f)  // NOLINT
-    {
-        return f.get() + f.Length();
-    }
-#endif  // !SWIG
-
-    typedef Feature<float> FloatFeature;
-    typedef Feature<char> CharFeature;
-    typedef Feature<int> IntFeature;
-}
-
-
-#ifndef SWIG
-namespace boost
-{
-    template<typename T>
-    struct range_const_iterator<ConsensusCore::Feature<T> >
-    {
-        typedef const T* type;
-    };
-
-    template<typename T>
-    struct range_mutable_iterator<ConsensusCore::Feature<T> >
-    {
-        typedef T* type;
-    };
-}
-#endif  // !SWIG
diff -Nru spades-3.10.1+dfsg/ext/include/ConsensusCore/Features.hpp spades-3.11.1+dfsg/ext/include/ConsensusCore/Features.hpp
--- spades-3.10.1+dfsg/ext/include/ConsensusCore/Features.hpp	2017-02-28 14:55:42.000000000 +0000
+++ spades-3.11.1+dfsg/ext/include/ConsensusCore/Features.hpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,68 +0,0 @@
-// Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-//  * Redistributions of source code must retain the above copyright
-//    notice, this list of conditions and the following disclaimer.
-//
-//  * Redistributions in binary form must reproduce the above
-//    copyright notice, this list of conditions and the following
-//    disclaimer in the documentation and/or other materials provided
-//    with the distribution.
-//
-//  * Neither the name of Pacific Biosciences nor the names of its
-//    contributors may be used to endorse or promote products derived
-//    from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
-// Author: David Alexander
-
-#pragma once
-
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#include "Feature.hpp"
-#include "Types.hpp"
-
-namespace ConsensusCore
-{
-    /// \brief An object containing observed features from a sequencing run.
-    struct SequenceFeatures
-    {
-    public:
-        explicit SequenceFeatures(const std::string& seq);
-        int Length() const             { return sequence_.Length(); }
-        Feature<char> Sequence() const { return sequence_; }
-
-        /// Access to the sequence bases
-        const char& operator[] (int i) const { return sequence_[i]; }
-        char ElementAt(int i) const          { return (*this)[i]; }
-
-
-    private:
-        Feature<char> sequence_;
-    };
-}
-
diff -Nru spades-3.10.1+dfsg/ext/include/ConsensusCore/Matrix/SparseMatrix-inl.hpp spades-3.11.1+dfsg/ext/include/ConsensusCore/Matrix/SparseMatrix-inl.hpp
--- spades-3.10.1+dfsg/ext/include/ConsensusCore/Matrix/SparseMatrix-inl.hpp	2017-02-28 14:55:42.000000000 +0000
+++ spades-3.11.1+dfsg/ext/include/ConsensusCore/Matrix/SparseMatrix-inl.hpp	2017-09-28 09:05:19.000000000 +0000
@@ -38,7 +38,6 @@
 #pragma once
 
 #include 
-#include 
 #include 
 #include 
 
diff -Nru spades-3.10.1+dfsg/ext/include/ConsensusCore/Poa/PoaGraph.hpp spades-3.11.1+dfsg/ext/include/ConsensusCore/Poa/PoaGraph.hpp
--- spades-3.10.1+dfsg/ext/include/ConsensusCore/Poa/PoaGraph.hpp	2017-02-28 14:55:42.000000000 +0000
+++ spades-3.11.1+dfsg/ext/include/ConsensusCore/Poa/PoaGraph.hpp	2017-09-28 09:05:19.000000000 +0000
@@ -37,10 +37,10 @@
 
 #pragma once
 
-#include 
 #include 
 #include 
 #include 
+#include 
 
 #include "Types.hpp"
 #include "Mutation.hpp"
@@ -65,7 +65,7 @@
         // TODO(dalexander): move this method to PoaConsensus so we don't have to use a tuple
         // interface here (which was done to avoid a circular dep on PoaConsensus).
 #ifndef SWIG
-        boost::tuple<std::string, float, std::vector<ScoredMutation>* >
+        std::tuple<std::string, float, std::vector<ScoredMutation>* >
         FindConsensus(const PoaConfig& config) const;
 #endif  // !SWIG
 
diff -Nru spades-3.10.1+dfsg/ext/include/ConsensusCore/Version.hpp spades-3.11.1+dfsg/ext/include/ConsensusCore/Version.hpp
--- spades-3.10.1+dfsg/ext/include/ConsensusCore/Version.hpp	2017-02-28 14:55:42.000000000 +0000
+++ spades-3.11.1+dfsg/ext/include/ConsensusCore/Version.hpp	2017-09-28 09:05:19.000000000 +0000
@@ -37,7 +37,6 @@
 
 #include 
 #include 
-#include 
 
 #pragma once
 
diff -Nru spades-3.10.1+dfsg/ext/include/llvm/ADT/IntrusiveRefCntPtr.h spades-3.11.1+dfsg/ext/include/llvm/ADT/IntrusiveRefCntPtr.h
--- spades-3.10.1+dfsg/ext/include/llvm/ADT/IntrusiveRefCntPtr.h	2017-02-28 14:55:41.000000000 +0000
+++ spades-3.11.1+dfsg/ext/include/llvm/ADT/IntrusiveRefCntPtr.h	2017-09-28 09:05:17.000000000 +0000
@@ -108,7 +108,7 @@
 
   void Release() const {
     int NewRefCount = --RefCount;
-    assert(NewRefCount >= 0 && "Reference count was already zero.");
+    // assert(NewRefCount >= 0 && "Reference count was already zero.");
     if (NewRefCount == 0)
       delete static_cast<const Derived*>(this);
   }
diff -Nru spades-3.10.1+dfsg/ext/include/llvm/Support/MathExtras.h spades-3.11.1+dfsg/ext/include/llvm/Support/MathExtras.h
--- spades-3.10.1+dfsg/ext/include/llvm/Support/MathExtras.h	2017-02-28 14:55:41.000000000 +0000
+++ spades-3.11.1+dfsg/ext/include/llvm/Support/MathExtras.h	2017-09-28 09:05:17.000000000 +0000
@@ -579,6 +579,52 @@
 ///
 /// Examples:
 /// \code
+///   alignTo(5, 8) = 8
+///   alignTo(17, 8) = 24
+///   alignTo(~0LL, 8) = 0
+///   alignTo(321, 255) = 510
+///
+///   alignTo(5, 8, 7) = 7
+///   alignTo(17, 8, 1) = 17
+///   alignTo(~0LL, 8, 3) = 3
+///   alignTo(321, 255, 42) = 552
+/// \endcode
+inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
+  assert(Align != 0u && "Align can't be 0.");
+  Skew %= Align;
+  return (Value + Align - 1 - Skew) / Align * Align + Skew;
+}
+
+/// Returns the next integer (mod 2**64) that is greater than or equal to
+/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
+template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
+  static_assert(Align != 0u, "Align must be non-zero");
+  return (Value + Align - 1) / Align * Align;
+}
+
+/// \c alignTo for contexts where a constant expression is required.
+/// \sa alignTo
+///
+/// \todo FIXME: remove when \c constexpr becomes really \c constexpr
+template <uint64_t Align>
+struct AlignTo {
+  static_assert(Align != 0u, "Align must be non-zero");
+  template <uint64_t Value>
+  struct from_value {
+    static const uint64_t value = (Value + Align - 1) / Align * Align;
+  };
+};
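+
+// For illustration: AlignTo<8>::from_value<17>::value == 24 at compile time,
+// matching the runtime alignTo(17, 8) above.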
+
+/// Returns the next integer (mod 2**64) that is greater than or equal to
+/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
+///
+/// If non-zero \p Skew is specified, the return value will be a minimal
+/// integer that is greater than or equal to \p Value and equal to
+/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
+/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
+///
+/// Examples:
+/// \code
 ///   RoundUpToAlignment(5, 8) = 8
 ///   RoundUpToAlignment(17, 8) = 24
 ///   RoundUpToAlignment(~0LL, 8) = 0
diff -Nru spades-3.10.1+dfsg/ext/include/llvm/Support/TrailingObjects.h spades-3.11.1+dfsg/ext/include/llvm/Support/TrailingObjects.h
--- spades-3.10.1+dfsg/ext/include/llvm/Support/TrailingObjects.h	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/ext/include/llvm/Support/TrailingObjects.h	2017-09-28 09:05:18.000000000 +0000
@@ -0,0 +1,401 @@
+//===--- TrailingObjects.h - Variable-length classes ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This header defines support for implementing classes that have
+/// some trailing object (or arrays of objects) appended to them. The
+/// main purpose is to make it obvious where this idiom is being used,
+/// and to make the usage more idiomatic and more difficult to get
+/// wrong.
+///
+/// The TrailingObject template abstracts away the reinterpret_cast,
+/// pointer arithmetic, and size calculations used for the allocation
+/// and access of appended arrays of objects, and takes care that they
+/// are all allocated at their required alignment. Additionally, it
+/// ensures that the base type is final -- deriving from a class that
+/// expects data appended immediately after it is typically not safe.
+///
+/// Users are expected to derive from this template, and provide
+/// numTrailingObjects implementations for each trailing type except
+/// the last, e.g. like this sample:
+///
+/// \code
+/// class VarLengthObj : private TrailingObjects<VarLengthObj, int, double> {
+///   friend TrailingObjects;
+///
+///   unsigned NumInts, NumDoubles;
+///   size_t numTrailingObjects(OverloadToken<int>) const { return NumInts; }
+///  };
+/// \endcode
+///
+/// You can access the appended arrays via 'getTrailingObjects', and
+/// determine the size needed for allocation via
+/// 'additionalSizeToAlloc' and 'totalSizeToAlloc'.
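+///
+/// For example, a hypothetical VarLengthObj::create() (not shown above) could
+/// size and place its allocation like this:
+///
+/// \code
+/// void *Mem = ::operator new(totalSizeToAlloc<int, double>(NumInts, NumDoubles));
+/// VarLengthObj *Obj = new (Mem) VarLengthObj(NumInts, NumDoubles);
+/// \endcode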
+///
+/// All the methods implemented by this class are intended for use
+/// by the implementation of the class, not as part of its interface
+/// (thus, private inheritance is suggested).
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_TRAILINGOBJECTS_H
+#define LLVM_SUPPORT_TRAILINGOBJECTS_H
+
+#include "llvm/Support/AlignOf.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/type_traits.h"
+#include 
+#include 
+
+namespace llvm {
+
+namespace trailing_objects_internal {
+/// Helper template to calculate the max alignment requirement for a set of
+/// objects.
+template <typename First, typename... Rest> class AlignmentCalcHelper {
+private:
+  enum {
+    FirstAlignment = alignof(First),
+    RestAlignment = AlignmentCalcHelper<Rest...>::Alignment,
+  };
+
+public:
+  enum {
+    Alignment = FirstAlignment > RestAlignment ? FirstAlignment : RestAlignment
+  };
+};
+
+template <typename First> class AlignmentCalcHelper<First> {
+public:
+  enum { Alignment = alignof(First) };
+};
+
+/// The base class for TrailingObjects* classes.
+class TrailingObjectsBase {
+protected:
+  /// OverloadToken's purpose is to allow specifying function overloads
+  /// for different types, without actually taking the types as
+  /// parameters. (Necessary because member function templates cannot
+  /// be specialized, so overloads must be used instead of
+  /// specialization.)
+  template <typename T> struct OverloadToken {};
+};
+
+/// This helper template works-around MSVC 2013's lack of useful
+/// alignas() support. The argument to LLVM_ALIGNAS(), in MSVC, is
+/// required to be a literal integer. But, you *can* use template
+/// specialization to select between a bunch of different LLVM_ALIGNAS
+/// expressions...
+template <int Align>
+class TrailingObjectsAligner : public TrailingObjectsBase {};
+template <>
+class LLVM_ALIGNAS(1) TrailingObjectsAligner<1> : public TrailingObjectsBase {};
+template <>
+class LLVM_ALIGNAS(2) TrailingObjectsAligner<2> : public TrailingObjectsBase {};
+template <>
+class LLVM_ALIGNAS(4) TrailingObjectsAligner<4> : public TrailingObjectsBase {};
+template <>
+class LLVM_ALIGNAS(8) TrailingObjectsAligner<8> : public TrailingObjectsBase {};
+template <>
+class LLVM_ALIGNAS(16) TrailingObjectsAligner<16> : public TrailingObjectsBase {
+};
+template <>
+class LLVM_ALIGNAS(32) TrailingObjectsAligner<32> : public TrailingObjectsBase {
+};
+
+// Just a little helper for transforming a type pack into the same
+// number of a different type. e.g.:
+//   ExtractSecondType<TrailingTys, size_t>::type
+template <typename Ty1, typename Ty2> struct ExtractSecondType {
+  typedef Ty2 type;
+};
+
+// TrailingObjectsImpl is somewhat complicated, because it is a
+// recursively inheriting template, in order to handle the template
+// varargs. Each level of inheritance picks off a single trailing type
+// then recurses on the rest. The "Align", "BaseTy", and
+// "TopTrailingObj" arguments are passed through unchanged through the
+// recursion. "PrevTy" is, at each level, the type handled by the
+// level right above it.
+
+template <int Align, typename BaseTy, typename TopTrailingObj, typename PrevTy, typename... MoreTys>
+class TrailingObjectsImpl {
+  // The main template definition is never used -- the two
+  // specializations cover all possibilities.
+};
+
+template <int Align, typename BaseTy, typename TopTrailingObj, typename PrevTy,
+          typename NextTy, typename... MoreTys>
+class TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, PrevTy, NextTy,
+                          MoreTys...>
+    : public TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy,
+                                 MoreTys...> {
+
+  typedef TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...>
+      ParentType;
+
+  struct RequiresRealignment {
+    static const bool value = alignof(PrevTy) < alignof(NextTy);
+  };
+
+  static constexpr bool requiresRealignment() {
+    return RequiresRealignment::value;
+  }
+
+protected:
+  // Ensure the inherited getTrailingObjectsImpl is not hidden.
+  using ParentType::getTrailingObjectsImpl;
+
+  // These two functions are helper functions for
+  // TrailingObjects::getTrailingObjects. They recurse to the left --
+  // the result for each type in the list of trailing types depends on
+  // the result of calling the function on the type to the
+  // left. However, the function for the type to the left is
+  // implemented by a *subclass* of this class, so we invoke it via
+  // the TopTrailingObj, which is, via the
+  // curiously-recurring-template-pattern, the most-derived type in
+  // this recursion, and thus, contains all the overloads.
+  static const NextTy *
+  getTrailingObjectsImpl(const BaseTy *Obj,
+                         TrailingObjectsBase::OverloadToken<NextTy>) {
+    auto *Ptr = TopTrailingObj::getTrailingObjectsImpl(
+                    Obj, TrailingObjectsBase::OverloadToken<PrevTy>()) +
+                TopTrailingObj::callNumTrailingObjects(
+                    Obj, TrailingObjectsBase::OverloadToken<PrevTy>());
+
+    if (requiresRealignment())
+      return reinterpret_cast<const NextTy *>(
+          llvm::alignAddr(Ptr, alignof(NextTy)));
+    else
+      return reinterpret_cast<const NextTy *>(Ptr);
+  }
+
+  static NextTy *
+  getTrailingObjectsImpl(BaseTy *Obj,
+                         TrailingObjectsBase::OverloadToken<NextTy>) {
+    auto *Ptr = TopTrailingObj::getTrailingObjectsImpl(
+                    Obj, TrailingObjectsBase::OverloadToken<PrevTy>()) +
+                TopTrailingObj::callNumTrailingObjects(
+                    Obj, TrailingObjectsBase::OverloadToken<PrevTy>());
+
+    if (requiresRealignment())
+      return reinterpret_cast<NextTy *>(llvm::alignAddr(Ptr, alignof(NextTy)));
+    else
+      return reinterpret_cast<NextTy *>(Ptr);
+  }
+
+  // Helper function for TrailingObjects::additionalSizeToAlloc: this
+  // function recurses to superclasses, each of which requires one
+  // fewer size_t argument, and adds its own size.
+  static constexpr size_t additionalSizeToAllocImpl(
+      size_t SizeSoFar, size_t Count1,
+      typename ExtractSecondType<MoreTys, size_t>::type... MoreCounts) {
+    return ParentType::additionalSizeToAllocImpl(
+        (requiresRealignment() ? llvm::alignTo<alignof(NextTy)>(SizeSoFar)
+                               : SizeSoFar) +
+            sizeof(NextTy) * Count1,
+        MoreCounts...);
+  }
+};
+
+// The base case of the TrailingObjectsImpl inheritance recursion,
+// when there's no more trailing types.
+template <int Align, typename BaseTy, typename TopTrailingObj, typename PrevTy>
+class TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, PrevTy>
+    : public TrailingObjectsAligner<Align> {
+protected:
+  // This is a dummy method, only here so the "using" doesn't fail --
+  // it will never be called, because this function recurses backwards
+  // up the inheritance chain to subclasses.
+  static void getTrailingObjectsImpl();
+
+  static constexpr size_t additionalSizeToAllocImpl(size_t SizeSoFar) {
+    return SizeSoFar;
+  }
+
+  template <typename... Tys> static void verifyTrailingObjectsAlignment() {}
+};
+
+} // end namespace trailing_objects_internal
+
+// Finally, the main type defined in this file, the one intended for users...
+
+/// See the file comment for details on the usage of the
+/// TrailingObjects type.
+template <typename BaseTy, typename... TrailingTys>
+class TrailingObjects : private trailing_objects_internal::TrailingObjectsImpl<
+                            trailing_objects_internal::AlignmentCalcHelper<
+                                TrailingTys...>::Alignment,
+                            BaseTy, TrailingObjects<BaseTy, TrailingTys...>,
+                            BaseTy, TrailingTys...> {
+
+  template <int A, typename B, typename T, typename P, typename... M>
+  friend class trailing_objects_internal::TrailingObjectsImpl;
+
+  template <typename... Tys> class Foo {};
+
+  typedef trailing_objects_internal::TrailingObjectsImpl<
+      trailing_objects_internal::AlignmentCalcHelper<TrailingTys...>::Alignment,
+      BaseTy, TrailingObjects<BaseTy, TrailingTys...>, BaseTy, TrailingTys...>
+      ParentType;
+  using TrailingObjectsBase = trailing_objects_internal::TrailingObjectsBase;
+
+  using ParentType::getTrailingObjectsImpl;
+
+  // This function contains only a static_assert BaseTy is final. The
+  // static_assert must be in a function, and not at class-level
+  // because BaseTy isn't complete at class instantiation time, but
+  // will be by the time this function is instantiated.
+  static void verifyTrailingObjectsAssertions() {
+#ifdef LLVM_IS_FINAL
+    static_assert(LLVM_IS_FINAL(BaseTy), "BaseTy must be final.");
+#endif
+  }
+
+  // These two methods are the base of the recursion for this method.
+  static const BaseTy *
+  getTrailingObjectsImpl(const BaseTy *Obj,
+                         TrailingObjectsBase::OverloadToken<BaseTy>) {
+    return Obj;
+  }
+
+  static BaseTy *
+  getTrailingObjectsImpl(BaseTy *Obj,
+                         TrailingObjectsBase::OverloadToken<BaseTy>) {
+    return Obj;
+  }
+
+  // callNumTrailingObjects simply calls numTrailingObjects on the
+  // provided Obj -- except when the type being queried is BaseTy
+  // itself. There is always only one of the base object, so that case
+  // is handled here. (An additional benefit of indirecting through
+  // this function is that consumers only say "friend
+  // TrailingObjects", and thus, only this class itself can call the
+  // numTrailingObjects function.)
+  static size_t
+  callNumTrailingObjects(const BaseTy *Obj,
+                         TrailingObjectsBase::OverloadToken<BaseTy>) {
+    return 1;
+  }
+
+  template <typename T>
+  static size_t callNumTrailingObjects(const BaseTy *Obj,
+                                       TrailingObjectsBase::OverloadToken<T>) {
+    return Obj->numTrailingObjects(TrailingObjectsBase::OverloadToken<T>());
+  }
+
+public:
+  // Make this (privately inherited) member public.
+#ifndef _MSC_VER
+  using ParentType::OverloadToken;
+#else
+  // MSVC bug prevents the above from working, at least up through CL
+  // 19.10.24629.
+  template <typename T>
+  using OverloadToken = typename ParentType::template OverloadToken<T>;
+#endif
+
+  /// Returns a pointer to the trailing object array of the given type
+  /// (which must be one of those specified in the class template). The
+  /// array may have zero or more elements in it.
+  template <typename T> const T *getTrailingObjects() const {
+    verifyTrailingObjectsAssertions();
+    // Forwards to an impl function with overloads, since member
+    // function templates can't be specialized.
+    return this->getTrailingObjectsImpl(
+        static_cast<const BaseTy *>(this),
+        TrailingObjectsBase::OverloadToken<T>());
+  }
+
+  /// Returns a pointer to the trailing object array of the given type
+  /// (which must be one of those specified in the class template). The
+  /// array may have zero or more elements in it.
+  template <typename T> T *getTrailingObjects() {
+    verifyTrailingObjectsAssertions();
+    // Forwards to an impl function with overloads, since member
+    // function templates can't be specialized.
+    return this->getTrailingObjectsImpl(
+        static_cast<BaseTy *>(this), TrailingObjectsBase::OverloadToken<T>());
+  }
+
+  /// Returns the size of the trailing data, if an object were
+  /// allocated with the given counts (The counts are in the same order
+  /// as the template arguments). This does not include the size of the
+  /// base object.  The template arguments must be the same as those
+  /// used in the class; they are supplied here redundantly only so
+  /// that it's clear what the counts are counting in callers.
+  template <typename... Tys>
+  static constexpr typename std::enable_if<
+      std::is_same<Foo<TrailingTys...>, Foo<Tys...>>::value, size_t>::type
+  additionalSizeToAlloc(typename trailing_objects_internal::ExtractSecondType<
+                        TrailingTys, size_t>::type... Counts) {
+    return ParentType::additionalSizeToAllocImpl(0, Counts...);
+  }
+
+  /// Returns the total size of an object if it were allocated with the
+  /// given trailing object counts. This is the same as
+  /// additionalSizeToAlloc, except it *does* include the size of the base
+  /// object.
+  template <typename... Tys>
+  static constexpr typename std::enable_if<
+      std::is_same<Foo<TrailingTys...>, Foo<Tys...>>::value, size_t>::type
+  totalSizeToAlloc(typename trailing_objects_internal::ExtractSecondType<
+                   TrailingTys, size_t>::type... Counts) {
+    return sizeof(BaseTy) + ParentType::additionalSizeToAllocImpl(0, Counts...);
+  }
+
+  /// A type where its ::with_counts template member has a ::type member
+  /// suitable for use as uninitialized storage for an object with the given
+  /// trailing object counts. The template arguments are similar to those
+  /// of additionalSizeToAlloc.
+  ///
+  /// Use with FixedSizeStorageOwner, e.g.:
+  ///
+  /// \code{.cpp}
+  ///
+  /// MyObj::FixedSizeStorage<void *>::with_counts<1u>::type myStackObjStorage;
+  /// MyObj::FixedSizeStorageOwner
+  ///     myStackObjOwner(new ((void *)&myStackObjStorage) MyObj);
+  /// MyObj *const myStackObjPtr = myStackObjOwner.get();
+  ///
+  /// \endcode
+  template <typename... Tys> struct FixedSizeStorage {
+    template <size_t... Counts> struct with_counts {
+      enum { Size = totalSizeToAlloc<Tys...>(Counts...) };
+      typedef llvm::AlignedCharArray<alignof(BaseTy), Size> type;
+    };
+  };
+
+  /// A type that acts as the owner for an object placed into fixed storage.
+  class FixedSizeStorageOwner {
+  public:
+    FixedSizeStorageOwner(BaseTy *p) : p(p) {}
+    ~FixedSizeStorageOwner() {
+      assert(p && "FixedSizeStorageOwner owns null?");
+      p->~BaseTy();
+    }
+
+    BaseTy *get() { return p; }
+    const BaseTy *get() const { return p; }
+
+  private:
+    FixedSizeStorageOwner(const FixedSizeStorageOwner &) = delete;
+    FixedSizeStorageOwner(FixedSizeStorageOwner &&) = delete;
+    FixedSizeStorageOwner &operator=(const FixedSizeStorageOwner &) = delete;
+    FixedSizeStorageOwner &operator=(FixedSizeStorageOwner &&) = delete;
+
+    BaseTy *const p;
+  };
+};
+
+} // end namespace llvm
+
+#endif
diff -Nru spades-3.10.1+dfsg/ext/src/ConsensusCore/CMakeLists.txt spades-3.11.1+dfsg/ext/src/ConsensusCore/CMakeLists.txt
--- spades-3.10.1+dfsg/ext/src/ConsensusCore/CMakeLists.txt	2017-02-28 14:55:40.000000000 +0000
+++ spades-3.11.1+dfsg/ext/src/ConsensusCore/CMakeLists.txt	2017-09-28 09:05:16.000000000 +0000
@@ -6,8 +6,6 @@
 add_library(ConsensusCore STATIC 
             AffineAlignment.cpp
             Coverage.cpp
-            Feature.cpp
-            Features.cpp
             Mutation.cpp
             PairwiseAlignment.cpp
             Sequence.cpp
diff -Nru spades-3.10.1+dfsg/ext/src/ConsensusCore/Feature.cpp spades-3.11.1+dfsg/ext/src/ConsensusCore/Feature.cpp
--- spades-3.10.1+dfsg/ext/src/ConsensusCore/Feature.cpp	2017-02-28 14:55:40.000000000 +0000
+++ spades-3.11.1+dfsg/ext/src/ConsensusCore/Feature.cpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,64 +0,0 @@
-// Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-//  * Redistributions of source code must retain the above copyright
-//    notice, this list of conditions and the following disclaimer.
-//
-//  * Redistributions in binary form must reproduce the above
-//    copyright notice, this list of conditions and the following
-//    disclaimer in the documentation and/or other materials provided
-//    with the distribution.
-//
-//  * Neither the name of Pacific Biosciences nor the names of its
-//    contributors may be used to endorse or promote products derived
-//    from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
-// Author: David Alexander
-
-#include "Feature.hpp"
-
-#include 
-
-namespace ConsensusCore {
-
-#ifndef SWIG
-    template<>
-    Feature<float>::operator std::string() const
-    {
-        return "";
-    }
-
-    template<>
-    Feature<char>::operator std::string() const
-    {
-        return std::string(get(), Length());
-    }
-#endif  // !SWIG
-
-
-    template class ConsensusCore::Feature<float>;
-    template class ConsensusCore::Feature<char>;
-    template class ConsensusCore::Feature<int>;
-}
-
-
diff -Nru spades-3.10.1+dfsg/ext/src/ConsensusCore/Features.cpp spades-3.11.1+dfsg/ext/src/ConsensusCore/Features.cpp
--- spades-3.10.1+dfsg/ext/src/ConsensusCore/Features.cpp	2017-02-28 14:55:40.000000000 +0000
+++ spades-3.11.1+dfsg/ext/src/ConsensusCore/Features.cpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,51 +0,0 @@
-// Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
-//
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted (subject to the limitations in the
-// disclaimer below) provided that the following conditions are met:
-//
-//  * Redistributions of source code must retain the above copyright
-//    notice, this list of conditions and the following disclaimer.
-//
-//  * Redistributions in binary form must reproduce the above
-//    copyright notice, this list of conditions and the following
-//    disclaimer in the documentation and/or other materials provided
-//    with the distribution.
-//
-//  * Neither the name of Pacific Biosciences nor the names of its
-//    contributors may be used to endorse or promote products derived
-//    from this software without specific prior written permission.
-//
-// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-
-// Author: David Alexander
-
-#include "Features.hpp"
-
-#include 
-#include 
-
-#include "Feature.hpp"
-#include "Types.hpp"
-
-
-ConsensusCore::SequenceFeatures::SequenceFeatures(const std::string& seq)
-    : sequence_(seq.c_str(), seq.length())
-{}
-
-
diff -Nru spades-3.10.1+dfsg/ext/src/ConsensusCore/Matrix/DenseMatrix.cpp spades-3.11.1+dfsg/ext/src/ConsensusCore/Matrix/DenseMatrix.cpp
--- spades-3.10.1+dfsg/ext/src/ConsensusCore/Matrix/DenseMatrix.cpp	2017-02-28 14:55:40.000000000 +0000
+++ spades-3.11.1+dfsg/ext/src/ConsensusCore/Matrix/DenseMatrix.cpp	2017-09-28 09:05:16.000000000 +0000
@@ -39,7 +39,7 @@
 #include "Matrix/DenseMatrix.hpp"
 
 #include 
-#include 
+#include 
 #include 
 
 #include "LFloat.hpp"
@@ -73,7 +73,7 @@
         for (int col = 0; col < Columns(); ++col)
         {
             int start, end;
-            boost::tie(start, end) = UsedRowRange(col);
+            std::tie(start, end) = UsedRowRange(col);
             filledEntries += (end - start);
         }
         return filledEntries;
@@ -101,7 +101,7 @@
     {
         // make sure no used entries are outside of the bands
         int start, end;
-        boost::tie(start, end) = UsedRowRange(column);
+        std::tie(start, end) = UsedRowRange(column);
         assert(0 <= start && start <= end && end <= Rows());
         for (int i = 0; i < Rows(); i++)
         {
diff -Nru spades-3.10.1+dfsg/ext/src/ConsensusCore/Matrix/SparseMatrix.cpp spades-3.11.1+dfsg/ext/src/ConsensusCore/Matrix/SparseMatrix.cpp
--- spades-3.10.1+dfsg/ext/src/ConsensusCore/Matrix/SparseMatrix.cpp	2017-02-28 14:55:40.000000000 +0000
+++ spades-3.11.1+dfsg/ext/src/ConsensusCore/Matrix/SparseMatrix.cpp	2017-09-28 09:05:16.000000000 +0000
@@ -38,7 +38,7 @@
 #include "Matrix/SparseMatrix.hpp"
 
 #include 
-#include <boost/tuple/tuple.hpp>
+#include <tuple>
 
 namespace ConsensusCore {
     // Performance insensitive routines are not inlined
@@ -85,7 +85,7 @@
         for (int col = 0; col < Columns(); ++col)
         {
             int start, end;
-            boost::tie(start, end) = UsedRowRange(col);
+            std::tie(start, end) = UsedRowRange(col);
             filledEntries += (end - start);
         }
         return filledEntries;
diff -Nru spades-3.10.1+dfsg/ext/src/ConsensusCore/Poa/PoaConsensus.cpp spades-3.11.1+dfsg/ext/src/ConsensusCore/Poa/PoaConsensus.cpp
--- spades-3.10.1+dfsg/ext/src/ConsensusCore/Poa/PoaConsensus.cpp	2017-02-28 14:55:40.000000000 +0000
+++ spades-3.11.1+dfsg/ext/src/ConsensusCore/Poa/PoaConsensus.cpp	2017-09-28 09:05:16.000000000 +0000
@@ -37,9 +37,6 @@
 
 #include "Poa/PoaConsensus.hpp"
 
-#include 
-#include 
-#include 
 #include 
 #include 
 #include 
@@ -47,8 +44,6 @@
 #include "Poa/PoaConfig.hpp"
 #include "Utils.hpp"
 
-using boost::tie;
-
 namespace ConsensusCore
 {
     PoaConsensus::PoaConsensus(const PoaConfig& config)
@@ -80,7 +75,7 @@
             }
             pc->poaGraph_->AddSequence(read, config);
         }
-        boost::tie(pc->consensusSequence_, pc->score_, pc->variants_) =
+        std::tie(pc->consensusSequence_, pc->score_, pc->variants_) =
             pc->poaGraph_->FindConsensus(config);
         return pc;
     }
diff -Nru spades-3.10.1+dfsg/ext/src/ConsensusCore/Poa/PoaGraph.cpp spades-3.11.1+dfsg/ext/src/ConsensusCore/Poa/PoaGraph.cpp
--- spades-3.10.1+dfsg/ext/src/ConsensusCore/Poa/PoaGraph.cpp	2017-02-28 14:55:40.000000000 +0000
+++ spades-3.11.1+dfsg/ext/src/ConsensusCore/Poa/PoaGraph.cpp	2017-09-28 09:05:16.000000000 +0000
@@ -45,7 +45,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -55,6 +54,8 @@
 #include 
 #include 
 #include 
+#include <unordered_map>
+#include <unordered_set>
 
 #include "Poa/PoaConfig.hpp"
 #include "Types.hpp"
@@ -201,7 +202,7 @@
         {}
     };
 
-    typedef unordered_map AlignmentColumnMap;
+    typedef std::unordered_map AlignmentColumnMap;
 
     //
     // Graph::Impl methods
@@ -238,7 +239,7 @@
         void AddSequence(const std::string& sequence, const PoaConfig& config);
 
         // TODO(dalexander): make this const
-        tuple >*>
+        std::tuple >*>
         FindConsensus(const PoaConfig& config);
 
         int NumSequences() const;
@@ -497,7 +498,7 @@
         VertexInfoMap vertexInfoMap = get(vertex_info, g);
         std::list sortedVertices(num_vertices(g));
         topological_sort(g, sortedVertices.rbegin());
-        unordered_map bestPrevVertex;
+        std::unordered_map bestPrevVertex;
 
         // ignore ^ and $
         // TODO(dalexander): find a cleaner way to do this
@@ -696,11 +697,11 @@
     }
 
 
-    static boost::unordered_set
+    static std::unordered_set
     childVertices(Vertex v,
                   BoostGraph& g)
     {
-        boost::unordered_set result;
+        std::unordered_set result;
         foreach (Edge e, out_edges(v, g))
         {
             result.insert(target(e, g));
@@ -708,11 +709,11 @@
         return result;
     }
 
-    static boost::unordered_set
+    static std::unordered_set
     parentVertices(Vertex v,
                    BoostGraph& g)
     {
-        boost::unordered_set result;
+        std::unordered_set result;
         foreach (Edge e, in_edges(v, g))
         {
             result.insert(source(e, g));
@@ -721,7 +722,7 @@
     }
 
 
-    tuple >* >
+    std::tuple >* >
     PoaGraph::Impl::FindConsensus(const PoaConfig& config)
     {
         std::stringstream ss;
@@ -736,14 +737,14 @@
         // if requested, identify likely sequence variants
 
         // will be deallocated by PoaConsensus destructor.
-        vector< pair >* variants = new vector< pair >();
+        vector< std::pair >* variants = new vector< pair >();
 
         if (true)  // TODO(dalexander): Add a flag to PoaConfig
         {
             for (int i = 2; i < (int)bestPath.size() - 2; i++) // NOLINT
             {
                 Vertex v = bestPath[i];
-                boost::unordered_set children = childVertices(v, g_);
+                std::unordered_set children = childVertices(v, g_);
 
                 // Look for a direct edge from the current node to the node
                 // two spaces down---suggesting a deletion with respect to
@@ -758,7 +759,7 @@
                 // This indicates we should try inserting the base at i + 1.
 
                 // Parents of (i + 1)
-                boost::unordered_set lookBack = parentVertices(bestPath[i + 1], g_);
+                std::unordered_set lookBack = parentVertices(bestPath[i + 1], g_);
 
                 // (We could do this in STL using std::set sorted on score, which would then
                 // provide an intersection mechanism (in ) but that actually ends
@@ -768,7 +769,7 @@
 
                 foreach (Vertex v, children)
                 {
-                    boost::unordered_set::iterator found = lookBack.find(v);
+                    std::unordered_set::iterator found = lookBack.find(v);
                     if (found != lookBack.end())
                     {
                         float score = vertexInfoMap_[*found]->Score;
@@ -800,7 +801,7 @@
                 {
                     if (v == bestPath[i + 1]) continue;
 
-                    boost::unordered_set::iterator found = lookBack.find(v);
+                    std::unordered_set::iterator found = lookBack.find(v);
                     if (found != lookBack.end())
                     {
                         float score = vertexInfoMap_[*found]->Score;
@@ -824,7 +825,7 @@
             }
         }
 
-        return boost::make_tuple(ss.str(), 0.0f, variants);  // TODO(dalexander): where do we get scores?
+        return std::tuple >* >{ ss.str(), 0.0f, variants };  // TODO(dalexander): where do we get scores?
     }
 
     inline int
@@ -865,7 +866,7 @@
         return impl->NumSequences();
     }
 
-    tuple >* >
+    std::tuple >* >
     PoaGraph::FindConsensus(const PoaConfig& config) const
     {
         return impl->FindConsensus(config);
diff -Nru spades-3.10.1+dfsg/ext/src/ConsensusCore/Version.cpp spades-3.11.1+dfsg/ext/src/ConsensusCore/Version.cpp
--- spades-3.10.1+dfsg/ext/src/ConsensusCore/Version.cpp	2017-02-28 14:55:40.000000000 +0000
+++ spades-3.11.1+dfsg/ext/src/ConsensusCore/Version.cpp	2017-09-28 09:05:16.000000000 +0000
@@ -38,8 +38,6 @@
 #include 
 #include 
 #include 
-#include 
-#include 
 
 #include "ConsensusCore/Version.hpp"
 
diff -Nru spades-3.10.1+dfsg/LICENSE spades-3.11.1+dfsg/LICENSE
--- spades-3.10.1+dfsg/LICENSE	2017-02-28 14:55:56.000000000 +0000
+++ spades-3.11.1+dfsg/LICENSE	2017-09-28 09:05:59.000000000 +0000
@@ -21,13 +21,14 @@
 
     Dmitry Antipov,
     Anton Bankevich,
+    Elena Bushmanova,
+    Vasiliy Ershov,
     Yuriy Gorshkov,
     Alexey Gurevich,
     Anton Korobeynikov,
-    Dmitriy Meleshko,
     Sergey Nurk,
     Andrey Prjibelski,
-    Yana Safonova,
+    Ivan Tolstoganov,
     Alla Lapidus and
     Pavel Pevzner
 
@@ -37,9 +38,11 @@
     Mikhail Dvorkin,
     Alexander Kulikov,
     Valery Lesin,
+    Dmitriy Meleshko,
     Sergey Nikolenko,
     Son Pham,
     Alexey Pyshkin,
+    Yana Safonova,
     Vladislav Saveliev,
     Alexander Sirotkin,
     Yakov Sirotkin,
diff -Nru spades-3.10.1+dfsg/manual.html spades-3.11.1+dfsg/manual.html
--- spades-3.10.1+dfsg/manual.html	2017-02-28 14:55:56.000000000 +0000
+++ spades-3.11.1+dfsg/manual.html	2017-09-28 09:05:59.000000000 +0000
@@ -1,6 +1,6 @@
 
 
-    SPAdes 3.10.1 Manual
+    SPAdes 3.11.1 Manual
     
 
 
-SPAdes 3.10.1 Manual
+SPAdes 3.11.1 Manual
1. About SPAdes
    1.1. Supported data types
@@ -35,16 +35,16 @@

1. About SPAdes

 SPAdes – St. Petersburg genome assembler – is an assembly toolkit containing various assembly pipelines. This manual will help you to install and run SPAdes.
-SPAdes version 3.10.1 was released under GPLv2 on March 1, 2017 and can be downloaded from http://cab.spbu.ru/software/spades/.
+SPAdes version 3.11.1 was released under GPLv2 on October 1, 2017 and can be downloaded from http://cab.spbu.ru/software/spades/.

1.1 Supported data types

The current version of SPAdes works with Illumina or IonTorrent reads and is capable of providing hybrid assemblies using PacBio, Oxford Nanopore and Sanger reads. You can also provide additional contigs that will be used as long reads.

-Version 3.10.1 of SPAdes supports paired-end reads, mate-pairs and unpaired reads. SPAdes can take as input several paired-end and mate-pair libraries simultaneously. Note, that SPAdes was initially designed for small genomes. It was tested on bacterial (both single-cell MDA and standard isolates), fungal and other small genomes. SPAdes is not intended for larger genomes (e.g. mammalian size genomes). For such purposes you can use it at your own risk.
+Version 3.11.1 of SPAdes supports paired-end reads, mate-pairs and unpaired reads. SPAdes can take as input several paired-end and mate-pair libraries simultaneously. Note, that SPAdes was initially designed for small genomes. It was tested on bacterial (both single-cell MDA and standard isolates), fungal and other small genomes. SPAdes is not intended for larger genomes (e.g. mammalian size genomes). For such purposes you can use it at your own risk.

-SPAdes 3.10.1 includes the following additional pipelines:
+SPAdes 3.11.1 includes the following additional pipelines:

  • dipSPAdes – a module for assembling highly polymorphic diploid genomes (see dipSPAdes manual).
  • metaSPAdes – a pipeline for metagenomic data sets (see metaSPAdes options).
@@ -98,42 +98,42 @@
 (Benchmark table; each cell shows the 3.10.1 value → 3.11.1 value, unchanged cells once: time, peak RAM (Gb) and additional disk (Gb) for the manual's two benchmark datasets.)
 BayesHammer          29m→26m     7.1        11          34m→31m      7.6→6.9   8.8→11.3
 SPAdes               11m→8m      8.4→8.1    1.6→1.5     17m→12m      8→7.9     3.0→2.6
 MismatchCorrector    13m→20m     1.8        27.1→27.7   16m→25m      1.8       25.5→28.3
 Whole pipeline       53m→54m     8.4→8.1    29.6→30.2   1h 7m→1h 8m  8→7.9     28.3→31.1
@@ -143,7 +143,7 @@
  • Running SPAdes without preliminary read error correction (e.g. without BayesHammer or IonHammer) will likely require more time and memory.
  • Each module removes its temporary files as soon as it finishes.
  • SPAdes uses 512 Mb per thread for buffers, which results in higher memory consumption. If you set memory limit manually, SPAdes will use smaller buffers and thus less RAM.
-  • Performance statistics is given for SPAdes version 3.10.1.
+  • Performance statistics is given for SPAdes version 3.11.1.
@@ -151,19 +151,19 @@

2. Installation

-SPAdes requires a 64-bit Linux system or Mac OS and Python (supported versions are 2.4, 2.5, 2.6, 2.7, 3.2, 3.3, 3.4 and 3.5) to be pre-installed on it. To obtain SPAdes you can either download binaries or download source code and compile it yourself.
+SPAdes requires a 64-bit Linux system or Mac OS and Python (supported versions are Python2: 2.4–2.7, and Python3: 3.2 and higher) to be pre-installed on it. To obtain SPAdes you can either download binaries or download source code and compile it yourself.
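The reworded requirement amounts to a simple gate on the interpreter version; below is a minimal sketch of the stated rule (illustration only, not code from spades.py):

    import sys

    def python_version_supported():
        # Per the manual: Python2 2.4-2.7, or Python3 3.2 and higher.
        major, minor = sys.version_info[0], sys.version_info[1]
        if major == 2:
            return 4 <= minor <= 7
        if major == 3:
            return minor >= 2
        return False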

2.1 Downloading SPAdes Linux binaries

-To download SPAdes Linux binaries and extract them, go to the directory in which you wish SPAdes to be installed and run:
+To download SPAdes Linux binaries and extract them, go to the directory in which you wish SPAdes to be installed and run:

 
-    wget http://cab.spbu.ru/files/release3.10.1/SPAdes-3.10.1-Linux.tar.gz
-    tar -xzf SPAdes-3.10.1-Linux.tar.gz
-    cd SPAdes-3.10.1-Linux/bin/
+    wget http://cab.spbu.ru/files/release3.11.1/SPAdes-3.11.1-Linux.tar.gz
+    tar -xzf SPAdes-3.11.1-Linux.tar.gz
+    cd SPAdes-3.11.1-Linux/bin/
 
 
@@ -192,13 +192,13 @@

2.2 Downloading SPAdes binaries for Mac

-To obtain SPAdes binaries for Mac, go to the directory in which you wish SPAdes to be installed and run:
+To obtain SPAdes binaries for Mac, go to the directory in which you wish SPAdes to be installed and run:

 
-    curl http://cab.spbu.ru/files/release3.10.1/SPAdes-3.10.1-Darwin.tar.gz -o SPAdes-3.10.1-Darwin.tar.gz
-    tar -zxf SPAdes-3.10.1-Darwin.tar.gz
-    cd SPAdes-3.10.1-Darwin/bin/
+    curl http://cab.spbu.ru/files/release3.11.1/SPAdes-3.11.1-Darwin.tar.gz -o SPAdes-3.11.1-Darwin.tar.gz
+    tar -zxf SPAdes-3.11.1-Darwin.tar.gz
+    cd SPAdes-3.11.1-Darwin/bin/
 
 
@@ -236,13 +236,13 @@

-If you meet these requirements, you can download the SPAdes source code:
+If you meet these requirements, you can download the SPAdes source code:

 
-    wget http://cab.spbu.ru/files/release3.10.1/SPAdes-3.10.1.tar.gz
-    tar -xzf SPAdes-3.10.1.tar.gz
-    cd SPAdes-3.10.1
+    wget http://cab.spbu.ru/files/release3.11.1/SPAdes-3.11.1.tar.gz
+    tar -xzf SPAdes-3.11.1.tar.gz
+    cd SPAdes-3.11.1
 
 
@@ -354,7 +354,7 @@
 SPAdes takes as input paired-end reads, mate-pairs and single (unpaired) reads in FASTA and FASTQ. For IonTorrent data SPAdes also supports unpaired reads in unmapped BAM format (like the one produced by Torrent Server). However, in order to run read error correction, reads should be in FASTQ or BAM format. Sanger, Oxford Nanopore and PacBio CLR reads can be provided in both formats since SPAdes does not run error correction for these types of data.

-To run SPAdes 3.10.1 you need at least one library of the following types:
+To run SPAdes 3.11.1 you need at least one library of the following types:

  • Illumina paired-end/high-quality mate-pairs/unpaired reads
  • IonTorrent paired-end/high-quality mate-pairs/unpaired reads
@@ -382,6 +382,7 @@

   Notes:

+  • It is strongly suggested to provide multiple paired-end and mate-pair libraries according to their insert size (from smallest to longest).
    • It is not recommended to run SPAdes on PacBio reads with low coverage (less than 5).
    • We suggest not to run SPAdes on PacBio reads for large genomes.
    • SPAdes accepts gzip-compressed files.
@@ -1209,13 +1210,10 @@

      5. Feedback and bug reports

-Your comments, bug reports, and suggestions are very welcomed. They will help us to further improve SPAdes.
-
-If you have any troubles running SPAdes, please send us params.txt and spades.log from the directory <output_dir>.
+Your comments, bug reports, and suggestions are very welcome. They will help us to further improve SPAdes. If you have any troubles running SPAdes, please send us params.txt and spades.log from the directory <output_dir>.

-Address for communications: spades.support@cab.spbu.ru.
+You can leave your comments and bug reports at our GitHub repository tracker or send them via e-mail: spades.support@cab.spbu.ru.




      diff -Nru spades-3.10.1+dfsg/metaspades.py spades-3.11.1+dfsg/metaspades.py --- spades-3.10.1+dfsg/metaspades.py 2017-02-28 14:55:56.000000000 +0000 +++ spades-3.11.1+dfsg/metaspades.py 2017-09-28 09:05:59.000000000 +0000 @@ -158,7 +158,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False): - skip_output_dir=secondary_filling + skip_output_dir = secondary_filling skip_stop_after = secondary_filling load_processed_dataset=secondary_filling @@ -211,6 +211,7 @@ options_storage.configs_dir = support.check_dir_existence(arg) elif opt == "--reference": options_storage.reference = support.check_file_existence(arg, 'reference', log) + options_storage.developer_mode = True elif opt == "--series-analysis": options_storage.series_analysis = support.check_file_existence(arg, 'series-analysis', log) elif opt == "--dataset": @@ -239,8 +240,19 @@ options_storage.large_genome = True elif opt == "--plasmid": options_storage.plasmid = True + elif opt == "--rna": options_storage.rna = True + elif opt.startswith("--ss-"): # strand specificity, RNA-Seq only + if opt == "--ss-rf": + options_storage.strand_specific = True + elif opt == "--ss-fr": + options_storage.strand_specific = False + elif opt == "--fast": # fast run, RNA-Seq only + options_storage.fast = True + elif opt == "--fast:false": + options_storage.fast = False + elif opt == "--iontorrent": options_storage.iontorrent = True elif opt == "--disable-gzip-output": @@ -295,6 +307,8 @@ else: support.error('wrong PHRED quality offset value: ' + arg + ' (should be either 33, 64, or \'auto\')', log) + elif opt == "--save-gp": + options_storage.save_gp = True elif opt == "--cov-cutoff": if arg == 'auto' or arg == 'off': options_storage.cov_cutoff = arg @@ -303,6 +317,12 @@ else: support.error('wrong value for --cov-cutoff option: ' + arg + ' (should be a positive float number, or \'auto\', or \'off\')', log) + elif opt == "--hidden-cov-cutoff": + if support.is_float(arg) and float(arg) > 0.0: + options_storage.lcer_cutoff = float(arg) + else: + support.error('wrong value for --hidden-cov-cutoff option: ' + arg + + ' (should be a positive float number)', log) elif opt == '-i' or opt == "--iterations": options_storage.iterations = int(arg) @@ -356,7 +376,7 @@ support.error("the output_dir should exist for --continue and for --restart-from!", log) os.makedirs(options_storage.output_dir) if options_storage.restart_from: - if options_storage.continue_mode: # saving parameters specified with --restart-from + if options_storage.continue_mode: # saving parameters specified with --restart-from if not support.dataset_is_empty(dataset_data): support.error("you cannot specify reads with --restart-from option!", log) options_storage.save_restart_options(log) @@ -370,6 +390,10 @@ support.error("you cannot specify --careful in RNA-Seq mode!", log) if options_storage.k_mers and options_storage.k_mers != 'auto' and len(options_storage.k_mers) > 1: support.error("you cannot specify multiple k-mer sizes in RNA-Seq mode!", log) + if [options_storage.meta, options_storage.large_genome, options_storage.truseq_mode, + options_storage.rna, options_storage.plasmid, options_storage.single_cell].count(True) > 1: + support.error("you cannot simultaneously use more than one mode out of " + "Metagenomic, Large genome, Illumina TruSeq, RNA-Seq, Plasmid, and Single-cell!", log) if options_storage.continue_mode: return None, None @@ -403,11 +427,12 @@ if len(dataset_data) != len(support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_RNA_SEQ)): 
support.error('you cannot specify any data types except ' + ', '.join(spades_logic.READS_TYPES_USED_IN_RNA_SEQ) + ' in RNA-Seq mode!') - if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1: - support.error('you cannot specify more than one paired-end library in RNA-Seq mode!') + #if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1: + # support.error('you cannot specify more than one paired-end library in RNA-Seq mode!') if existing_dataset_data is None: - pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w')) + pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'), + default_flow_style=False, default_style='"', width=float("inf")) options_storage.set_default_values() ### FILLING cfg @@ -454,12 +479,27 @@ if options_storage.k_mers: cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers elif options_storage.rna: - cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_RNA + k_value = options_storage.K_MERS_RNA[0] + if not options_storage.iontorrent: + k_value = int(support.get_reads_length(dataset_data, log) / 2) - 1 + if k_value % 2 == 0: + k_value -= 1 + if k_value < options_storage.MIN_K: + log.info("\n" + 'Default k value (' + str(k_value) + ') is too small, all k values should be between %d and %d. Setting k=%d.\n' + % (options_storage.MIN_K, options_storage.MAX_K, options_storage.MIN_K)) + k_value = options_storage.MIN_K + if k_value > options_storage.MAX_K: + log.info("\n" + 'Default k value (' + str(k_value) + ') is too large, all k values should be between %d and %d. Setting k=%d.\n' + % (options_storage.MIN_K, options_storage.MAX_K, options_storage.MAX_K)) + k_value = options_storage.MAX_K + cfg["assembly"].__dict__["iterative_K"] = k_value else: cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_SHORT cfg["assembly"].__dict__["disable_rr"] = options_storage.disable_rr cfg["assembly"].__dict__["diploid_mode"] = options_storage.diploid_mode cfg["assembly"].__dict__["cov_cutoff"] = options_storage.cov_cutoff + cfg["assembly"].__dict__["lcer_cutoff"] = options_storage.lcer_cutoff + cfg["assembly"].__dict__["save_gp"] = options_storage.save_gp if options_storage.spades_heap_check: cfg["assembly"].__dict__["heap_check"] = options_storage.spades_heap_check if options_storage.read_buffer_size: @@ -658,7 +698,8 @@ if support.dataset_has_additional_contigs(dataset_data): dataset_data = support.process_Ns_in_additional_contigs(dataset_data, dir_for_split_reads, log) options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml") - pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w')) + pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'), + default_flow_style=False, default_style='"', width=float("inf")) cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename try: @@ -894,29 +935,37 @@ if "assembly" in cfg and os.path.isfile(result_contigs_filename): message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename) log.info(message) - if options_storage.rna: - if "assembly" in cfg and os.path.isfile(result_transcripts_filename): + if options_storage.rna and "assembly" in cfg: + if os.path.isfile(result_transcripts_filename): message = " * Assembled transcripts are in " + support.process_spaces(result_transcripts_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_transcripts_paths_filename): + if 
os.path.isfile(result_transcripts_paths_filename): message = " * Paths in the assembly graph corresponding to the transcripts are in " + \ support.process_spaces(result_transcripts_paths_filename) log.info(message) - else: - if "assembly" in cfg and os.path.isfile(result_scaffolds_filename): + for filtering_type in options_storage.filtering_types: + result_filtered_transcripts_filename = os.path.join(cfg["common"].output_dir, + filtering_type + "_filtered_" + + options_storage.transcripts_name) + if os.path.isfile(result_filtered_transcripts_filename): + message = " * " + filtering_type.capitalize() + " filtered transcripts are in " + \ + support.process_spaces(result_filtered_transcripts_filename) + log.info(message) + elif "assembly" in cfg: + if os.path.isfile(result_scaffolds_filename): message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename): + if os.path.isfile(result_assembly_graph_filename): message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename_gfa): + if os.path.isfile(result_assembly_graph_filename_gfa): message = " * Assembly graph in GFA format is in " + support.process_spaces(result_assembly_graph_filename_gfa) log.info(message) - if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename): + if os.path.isfile(result_contigs_paths_filename): message = " * Paths in the assembly graph corresponding to the contigs are in " + \ support.process_spaces(result_contigs_paths_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_scaffolds_paths_filename): + if os.path.isfile(result_scaffolds_paths_filename): message = " * Paths in the assembly graph corresponding to the scaffolds are in " + \ support.process_spaces(result_scaffolds_paths_filename) log.info(message) diff -Nru spades-3.10.1+dfsg/plasmidspades.py spades-3.11.1+dfsg/plasmidspades.py --- spades-3.10.1+dfsg/plasmidspades.py 2017-02-28 14:55:56.000000000 +0000 +++ spades-3.11.1+dfsg/plasmidspades.py 2017-09-28 09:05:59.000000000 +0000 @@ -158,7 +158,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False): - skip_output_dir=secondary_filling + skip_output_dir = secondary_filling skip_stop_after = secondary_filling load_processed_dataset=secondary_filling @@ -211,6 +211,7 @@ options_storage.configs_dir = support.check_dir_existence(arg) elif opt == "--reference": options_storage.reference = support.check_file_existence(arg, 'reference', log) + options_storage.developer_mode = True elif opt == "--series-analysis": options_storage.series_analysis = support.check_file_existence(arg, 'series-analysis', log) elif opt == "--dataset": @@ -239,8 +240,19 @@ options_storage.large_genome = True elif opt == "--plasmid": options_storage.plasmid = True + elif opt == "--rna": options_storage.rna = True + elif opt.startswith("--ss-"): # strand specificity, RNA-Seq only + if opt == "--ss-rf": + options_storage.strand_specific = True + elif opt == "--ss-fr": + options_storage.strand_specific = False + elif opt == "--fast": # fast run, RNA-Seq only + options_storage.fast = True + elif opt == "--fast:false": + options_storage.fast = False + elif opt == "--iontorrent": options_storage.iontorrent = True elif opt == "--disable-gzip-output": @@ -295,6 +307,8 @@ else: support.error('wrong PHRED quality offset value: ' + arg + ' (should be either 33, 64, 
or \'auto\')', log) + elif opt == "--save-gp": + options_storage.save_gp = True elif opt == "--cov-cutoff": if arg == 'auto' or arg == 'off': options_storage.cov_cutoff = arg @@ -303,6 +317,12 @@ else: support.error('wrong value for --cov-cutoff option: ' + arg + ' (should be a positive float number, or \'auto\', or \'off\')', log) + elif opt == "--hidden-cov-cutoff": + if support.is_float(arg) and float(arg) > 0.0: + options_storage.lcer_cutoff = float(arg) + else: + support.error('wrong value for --hidden-cov-cutoff option: ' + arg + + ' (should be a positive float number)', log) elif opt == '-i' or opt == "--iterations": options_storage.iterations = int(arg) @@ -356,7 +376,7 @@ support.error("the output_dir should exist for --continue and for --restart-from!", log) os.makedirs(options_storage.output_dir) if options_storage.restart_from: - if options_storage.continue_mode: # saving parameters specified with --restart-from + if options_storage.continue_mode: # saving parameters specified with --restart-from if not support.dataset_is_empty(dataset_data): support.error("you cannot specify reads with --restart-from option!", log) options_storage.save_restart_options(log) @@ -370,6 +390,10 @@ support.error("you cannot specify --careful in RNA-Seq mode!", log) if options_storage.k_mers and options_storage.k_mers != 'auto' and len(options_storage.k_mers) > 1: support.error("you cannot specify multiple k-mer sizes in RNA-Seq mode!", log) + if [options_storage.meta, options_storage.large_genome, options_storage.truseq_mode, + options_storage.rna, options_storage.plasmid, options_storage.single_cell].count(True) > 1: + support.error("you cannot simultaneously use more than one mode out of " + "Metagenomic, Large genome, Illumina TruSeq, RNA-Seq, Plasmid, and Single-cell!", log) if options_storage.continue_mode: return None, None @@ -403,11 +427,12 @@ if len(dataset_data) != len(support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_RNA_SEQ)): support.error('you cannot specify any data types except ' + ', '.join(spades_logic.READS_TYPES_USED_IN_RNA_SEQ) + ' in RNA-Seq mode!') - if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1: - support.error('you cannot specify more than one paired-end library in RNA-Seq mode!') + #if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1: + # support.error('you cannot specify more than one paired-end library in RNA-Seq mode!') if existing_dataset_data is None: - pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w')) + pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'), + default_flow_style=False, default_style='"', width=float("inf")) options_storage.set_default_values() ### FILLING cfg @@ -454,12 +479,27 @@ if options_storage.k_mers: cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers elif options_storage.rna: - cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_RNA + k_value = options_storage.K_MERS_RNA[0] + if not options_storage.iontorrent: + k_value = int(support.get_reads_length(dataset_data, log) / 2) - 1 + if k_value % 2 == 0: + k_value -= 1 + if k_value < options_storage.MIN_K: + log.info("\n" + 'Default k value (' + str(k_value) + ') is too small, all k values should be between %d and %d. 
Setting k=%d.\n' + % (options_storage.MIN_K, options_storage.MAX_K, options_storage.MIN_K)) + k_value = options_storage.MIN_K + if k_value > options_storage.MAX_K: + log.info("\n" + 'Default k value (' + str(k_value) + ') is too large, all k values should be between %d and %d. Setting k=%d.\n' + % (options_storage.MIN_K, options_storage.MAX_K, options_storage.MAX_K)) + k_value = options_storage.MAX_K + cfg["assembly"].__dict__["iterative_K"] = k_value else: cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_SHORT cfg["assembly"].__dict__["disable_rr"] = options_storage.disable_rr cfg["assembly"].__dict__["diploid_mode"] = options_storage.diploid_mode cfg["assembly"].__dict__["cov_cutoff"] = options_storage.cov_cutoff + cfg["assembly"].__dict__["lcer_cutoff"] = options_storage.lcer_cutoff + cfg["assembly"].__dict__["save_gp"] = options_storage.save_gp if options_storage.spades_heap_check: cfg["assembly"].__dict__["heap_check"] = options_storage.spades_heap_check if options_storage.read_buffer_size: @@ -658,7 +698,8 @@ if support.dataset_has_additional_contigs(dataset_data): dataset_data = support.process_Ns_in_additional_contigs(dataset_data, dir_for_split_reads, log) options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml") - pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w')) + pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'), + default_flow_style=False, default_style='"', width=float("inf")) cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename try: @@ -894,29 +935,37 @@ if "assembly" in cfg and os.path.isfile(result_contigs_filename): message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename) log.info(message) - if options_storage.rna: - if "assembly" in cfg and os.path.isfile(result_transcripts_filename): + if options_storage.rna and "assembly" in cfg: + if os.path.isfile(result_transcripts_filename): message = " * Assembled transcripts are in " + support.process_spaces(result_transcripts_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_transcripts_paths_filename): + if os.path.isfile(result_transcripts_paths_filename): message = " * Paths in the assembly graph corresponding to the transcripts are in " + \ support.process_spaces(result_transcripts_paths_filename) log.info(message) - else: - if "assembly" in cfg and os.path.isfile(result_scaffolds_filename): + for filtering_type in options_storage.filtering_types: + result_filtered_transcripts_filename = os.path.join(cfg["common"].output_dir, + filtering_type + "_filtered_" + + options_storage.transcripts_name) + if os.path.isfile(result_filtered_transcripts_filename): + message = " * " + filtering_type.capitalize() + " filtered transcripts are in " + \ + support.process_spaces(result_filtered_transcripts_filename) + log.info(message) + elif "assembly" in cfg: + if os.path.isfile(result_scaffolds_filename): message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename): + if os.path.isfile(result_assembly_graph_filename): message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename_gfa): + if os.path.isfile(result_assembly_graph_filename_gfa): message = " * Assembly graph in GFA format is in " + 
support.process_spaces(result_assembly_graph_filename_gfa) log.info(message) - if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename): + if os.path.isfile(result_contigs_paths_filename): message = " * Paths in the assembly graph corresponding to the contigs are in " + \ support.process_spaces(result_contigs_paths_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_scaffolds_paths_filename): + if os.path.isfile(result_scaffolds_paths_filename): message = " * Paths in the assembly graph corresponding to the scaffolds are in " + \ support.process_spaces(result_scaffolds_paths_filename) log.info(message) diff -Nru spades-3.10.1+dfsg/README spades-3.11.1+dfsg/README --- spades-3.10.1+dfsg/README 2017-02-28 14:55:56.000000000 +0000 +++ spades-3.11.1+dfsg/README 2017-09-28 09:05:59.000000000 +0000 @@ -9,13 +9,14 @@ Dmitry Antipov, Anton Bankevich, + Elena Bushmanova, + Vasisliy Ershov, Yuriy Gorshkov, Alexey Gurevich, Anton Korobeynikov, - Dmitriy Meleshko, Sergey Nurk, Andrey Prjibelski, - Yana Safonova, + Ivan Tolstoganov, Alla Lapidus and Pavel Pevzner @@ -25,9 +26,11 @@ Mikhail Dvorkin, Alexander Kulikov, Valery Lesin, + Dmitriy Meleshko, Sergey Nikolenko, Son Pham, Alexey Pyshkin, + Yana Safonova, Vladislav Saveliev, Alexander Sirotkin, Yakov Sirotkin, @@ -39,6 +42,8 @@ Installation instructions and manual can be found on the website: http://cab.spbu.ru/software/spades/ +Address for communication: spades.support@cab.spbu.ru + References: Anton Bankevich, Sergey Nurk, Dmitry Antipov, Alexey A. Gurevich, Mikhail Dvorkin, Alexander S. Kulikov, Valery M. Lesin, diff -Nru spades-3.10.1+dfsg/rnaspades_manual.html spades-3.11.1+dfsg/rnaspades_manual.html --- spades-3.10.1+dfsg/rnaspades_manual.html 2017-02-28 14:55:56.000000000 +0000 +++ spades-3.11.1+dfsg/rnaspades_manual.html 2017-09-28 09:05:59.000000000 +0000 @@ -13,7 +13,9 @@ 1. About rnaSPAdes
      2. rnaSPAdes specifics
          2.1. Running rnaSPAdes
      -    2.2. rnaSPAdes output
      +    2.2. rnaSPAdes-specific options
+    2.3. Assembling strand-specific RNA-Seq
+    2.4. rnaSPAdes output
      3. Assembly evaluation
      4. Citation
      5. Feedback and bug reports
 Note that we assume that SPAdes installation directory is added to the PATH variable (provide full path to rnaSPAdes executable otherwise: <rnaspades installation dir>/rnaspades.py).

-Here are several notes regarding options:
+Here are several notes regarding rnaSPAdes options:

-  • rnaSPAdes can take as an input only one paired-end library and multiple single-end libraries.
+  • rnaSPAdes can take as an input only paired-end and single-end libraries.
   • rnaSPAdes does not support --careful and --cov-cutoff options.
-  • rnaSPAdes is not compatible with other pipeline options such as --meta, --sc and --plasmid.
-  • rnaSPAdes works using only a single k-mer size (55 by the default). We strongly recommend not to change this parameter. In case your RNA-Seq data set contains long Illumina reads (150 bp and longer) you may try to use longer k-mer size (approximately half of the read length). In case you have any doubts about your run, do not hesitate to contact us using e-mail given below.
+  • rnaSPAdes is not compatible with other pipeline options such as --meta, --sc and --plasmid. If you wish to assemble metatranscriptomic data, just run rnaSPAdes as it is.
+  • rnaSPAdes works using only a single k-mer size (automatically detected from read length by default; a sketch of this selection follows this list). We strongly recommend not to change this parameter. In case you have any doubts about your run, do not hesitate to contact us using the e-mail given below.
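The automatic k detection mentioned in the last note mirrors the logic added to spades.py in this release (present, though line-wrapped, in the Python hunks later in this diff). A minimal standalone sketch; the MIN_K/MAX_K bounds and the 55 IonTorrent fallback stand in for options_storage values and are assumptions here:

    # Sketch of rnaSPAdes' automatic k selection (mirrors the spades.py
    # hunk in this diff). MIN_K/MAX_K are placeholder bounds; the real
    # values live in options_storage and are not shown in this diff.
    MIN_K, MAX_K = 21, 127

    def choose_rna_k(read_length, iontorrent=False):
        if iontorrent:
            return 55                   # preset K_MERS_RNA[0] is kept
        k = int(read_length / 2) - 1    # roughly half the read length
        if k % 2 == 0:                  # SPAdes requires an odd k
            k -= 1
        return min(MAX_K, max(MIN_K, k))

    print(choose_rna_k(100))  # 49
    print(choose_rna_k(150))  # 73

The real code additionally logs a message when the computed k has to be clamped to the allowed bounds.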
-2.2 rnaSPAdes output
+2.2 rnaSPAdes-specific options

-rnaSPAdes outputs only one FASTA file named transcripts.fasta. The corresponding file with paths in the assembly_graph.fastg is transcripts.paths.
+--fast
+    Speeds up the isoform detection stage by removing short, low-covered isolated edges from the graph. Note that short low-expressed transcripts may be missing when this option is used.
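In the option-parsing hunks later in this diff, the flag is stored as a plain boolean with an explicit opt-out spelling. A condensed sketch (options_storage here stands in for the real module; any object with a fast attribute works):

    def parse_fast_flag(opt, options_storage):
        # Condensed from the spades.py 3.11 option parser
        # (fast run, RNA-Seq only).
        if opt == "--fast":
            options_storage.fast = True
        elif opt == "--fast:false":   # explicit way to disable the mode
            options_storage.fast = False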

+2.3 Assembling strand-specific RNA-Seq
+
+rnaSPAdes now supports strand-specific RNA-Seq datasets. You can indicate that the dataset is strand-specific using one of the following options:
+
+--ss-fr
+    The data set is strand-specific and the first read in a pair corresponds to the actual gene strand.
+
+--ss-rf
+    The data set is strand-specific and the first read in a pair corresponds to the reverse gene strand.
+
+If the data set is single-end, use the --ss-fr option when reads correspond to the gene strand and --ss-rf otherwise.
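Per the option-parsing hunks later in this diff, both flags collapse into a single boolean, with --ss-rf mapping to True. A condensed sketch (options_storage again stands in for the real module):

    def parse_strand_specificity(opt, options_storage):
        # Condensed from the spades.py 3.11 option parser
        # (strand specificity, RNA-Seq only).
        if opt.startswith("--ss-"):
            if opt == "--ss-rf":
                options_storage.strand_specific = True
            elif opt == "--ss-fr":
                options_storage.strand_specific = False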

+2.4 rnaSPAdes output
+
+rnaSPAdes outputs one main FASTA file named transcripts.fasta. The corresponding file with paths in the assembly_graph.fastg is transcripts.paths.
+
+In addition, rnaSPAdes outputs transcripts with different levels of filtration into <output_dir>/:
+
+  • hard_filtered_transcripts.fasta – includes only long and reliable transcripts with rather high expression.
+  • soft_filtered_transcripts.fasta – includes short and low-expressed transcripts, likely to contain junk sequences.
+
+We recommend using the main transcripts.fasta file unless you have specific needs for your project. Do not hesitate to contact us using the e-mail given below.
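These filtered files are reported by spades.py with a simple naming scheme, visible in the result-reporting hunks later in this diff. A sketch; the filtering_types list, base name and output directory below are assumptions consistent with the filenames above, not values read from this diff:

    import os

    filtering_types = ["hard", "soft"]       # assumed contents
    transcripts_name = "transcripts.fasta"   # assumed base name
    output_dir = "rnaspades_output"          # hypothetical directory

    for filtering_type in filtering_types:
        path = os.path.join(output_dir,
                            filtering_type + "_filtered_" + transcripts_name)
        if os.path.isfile(path):
            print(" * %s filtered transcripts are in %s"
                  % (filtering_type.capitalize(), path))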

      Contigs/scaffolds names in rnaSPAdes output FASTA files have the following format:
      >NODE_97_length_6237_cov_11.9819_g8_i2
      Similarly to SPAdes, 97 is the number of the transcript, 6237 is its sequence length in nucleotides and 11.9819 is the k-mer coverage. Note that the k-mer coverage is always lower than the read (per-base) coverage. g8_i2 correspond to the gene number 8 and isoform number 2 within this gene. Transcripts with the same gene number are presumably received from same or somewhat similar (e.g. paralogous) genes. Note, that the prediction is based on the presence of shared sequences in the transcripts and is very approximate. diff -Nru spades-3.10.1+dfsg/rnaspades.py spades-3.11.1+dfsg/rnaspades.py --- spades-3.10.1+dfsg/rnaspades.py 2017-02-28 14:55:56.000000000 +0000 +++ spades-3.11.1+dfsg/rnaspades.py 2017-09-28 09:05:59.000000000 +0000 @@ -158,7 +158,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False): - skip_output_dir=secondary_filling + skip_output_dir = secondary_filling skip_stop_after = secondary_filling load_processed_dataset=secondary_filling @@ -211,6 +211,7 @@ options_storage.configs_dir = support.check_dir_existence(arg) elif opt == "--reference": options_storage.reference = support.check_file_existence(arg, 'reference', log) + options_storage.developer_mode = True elif opt == "--series-analysis": options_storage.series_analysis = support.check_file_existence(arg, 'series-analysis', log) elif opt == "--dataset": @@ -239,8 +240,19 @@ options_storage.large_genome = True elif opt == "--plasmid": options_storage.plasmid = True + elif opt == "--rna": options_storage.rna = True + elif opt.startswith("--ss-"): # strand specificity, RNA-Seq only + if opt == "--ss-rf": + options_storage.strand_specific = True + elif opt == "--ss-fr": + options_storage.strand_specific = False + elif opt == "--fast": # fast run, RNA-Seq only + options_storage.fast = True + elif opt == "--fast:false": + options_storage.fast = False + elif opt == "--iontorrent": options_storage.iontorrent = True elif opt == "--disable-gzip-output": @@ -295,6 +307,8 @@ else: support.error('wrong PHRED quality offset value: ' + arg + ' (should be either 33, 64, or \'auto\')', log) + elif opt == "--save-gp": + options_storage.save_gp = True elif opt == "--cov-cutoff": if arg == 'auto' or arg == 'off': options_storage.cov_cutoff = arg @@ -303,6 +317,12 @@ else: support.error('wrong value for --cov-cutoff option: ' + arg + ' (should be a positive float number, or \'auto\', or \'off\')', log) + elif opt == "--hidden-cov-cutoff": + if support.is_float(arg) and float(arg) > 0.0: + options_storage.lcer_cutoff = float(arg) + else: + support.error('wrong value for --hidden-cov-cutoff option: ' + arg + + ' (should be a positive float number)', log) elif opt == '-i' or opt == "--iterations": options_storage.iterations = int(arg) @@ -356,7 +376,7 @@ support.error("the output_dir should exist for --continue and for --restart-from!", log) os.makedirs(options_storage.output_dir) if options_storage.restart_from: - if options_storage.continue_mode: # saving parameters specified with --restart-from + if options_storage.continue_mode: # saving parameters specified with --restart-from if not support.dataset_is_empty(dataset_data): support.error("you cannot specify reads with --restart-from option!", log) options_storage.save_restart_options(log) @@ -370,6 +390,10 @@ support.error("you cannot specify --careful in RNA-Seq mode!", log) if options_storage.k_mers and options_storage.k_mers != 'auto' and len(options_storage.k_mers) > 1: support.error("you cannot specify multiple k-mer sizes in RNA-Seq mode!", log) + if 
[options_storage.meta, options_storage.large_genome, options_storage.truseq_mode, + options_storage.rna, options_storage.plasmid, options_storage.single_cell].count(True) > 1: + support.error("you cannot simultaneously use more than one mode out of " + "Metagenomic, Large genome, Illumina TruSeq, RNA-Seq, Plasmid, and Single-cell!", log) if options_storage.continue_mode: return None, None @@ -403,11 +427,12 @@ if len(dataset_data) != len(support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_RNA_SEQ)): support.error('you cannot specify any data types except ' + ', '.join(spades_logic.READS_TYPES_USED_IN_RNA_SEQ) + ' in RNA-Seq mode!') - if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1: - support.error('you cannot specify more than one paired-end library in RNA-Seq mode!') + #if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1: + # support.error('you cannot specify more than one paired-end library in RNA-Seq mode!') if existing_dataset_data is None: - pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w')) + pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'), + default_flow_style=False, default_style='"', width=float("inf")) options_storage.set_default_values() ### FILLING cfg @@ -454,12 +479,27 @@ if options_storage.k_mers: cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers elif options_storage.rna: - cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_RNA + k_value = options_storage.K_MERS_RNA[0] + if not options_storage.iontorrent: + k_value = int(support.get_reads_length(dataset_data, log) / 2) - 1 + if k_value % 2 == 0: + k_value -= 1 + if k_value < options_storage.MIN_K: + log.info("\n" + 'Default k value (' + str(k_value) + ') is too small, all k values should be between %d and %d. Setting k=%d.\n' + % (options_storage.MIN_K, options_storage.MAX_K, options_storage.MIN_K)) + k_value = options_storage.MIN_K + if k_value > options_storage.MAX_K: + log.info("\n" + 'Default k value (' + str(k_value) + ') is too large, all k values should be between %d and %d. 
Setting k=%d.\n' + % (options_storage.MIN_K, options_storage.MAX_K, options_storage.MAX_K)) + k_value = options_storage.MAX_K + cfg["assembly"].__dict__["iterative_K"] = k_value else: cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_SHORT cfg["assembly"].__dict__["disable_rr"] = options_storage.disable_rr cfg["assembly"].__dict__["diploid_mode"] = options_storage.diploid_mode cfg["assembly"].__dict__["cov_cutoff"] = options_storage.cov_cutoff + cfg["assembly"].__dict__["lcer_cutoff"] = options_storage.lcer_cutoff + cfg["assembly"].__dict__["save_gp"] = options_storage.save_gp if options_storage.spades_heap_check: cfg["assembly"].__dict__["heap_check"] = options_storage.spades_heap_check if options_storage.read_buffer_size: @@ -658,7 +698,8 @@ if support.dataset_has_additional_contigs(dataset_data): dataset_data = support.process_Ns_in_additional_contigs(dataset_data, dir_for_split_reads, log) options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml") - pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w')) + pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'), + default_flow_style=False, default_style='"', width=float("inf")) cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename try: @@ -894,29 +935,37 @@ if "assembly" in cfg and os.path.isfile(result_contigs_filename): message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename) log.info(message) - if options_storage.rna: - if "assembly" in cfg and os.path.isfile(result_transcripts_filename): + if options_storage.rna and "assembly" in cfg: + if os.path.isfile(result_transcripts_filename): message = " * Assembled transcripts are in " + support.process_spaces(result_transcripts_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_transcripts_paths_filename): + if os.path.isfile(result_transcripts_paths_filename): message = " * Paths in the assembly graph corresponding to the transcripts are in " + \ support.process_spaces(result_transcripts_paths_filename) log.info(message) - else: - if "assembly" in cfg and os.path.isfile(result_scaffolds_filename): + for filtering_type in options_storage.filtering_types: + result_filtered_transcripts_filename = os.path.join(cfg["common"].output_dir, + filtering_type + "_filtered_" + + options_storage.transcripts_name) + if os.path.isfile(result_filtered_transcripts_filename): + message = " * " + filtering_type.capitalize() + " filtered transcripts are in " + \ + support.process_spaces(result_filtered_transcripts_filename) + log.info(message) + elif "assembly" in cfg: + if os.path.isfile(result_scaffolds_filename): message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename): + if os.path.isfile(result_assembly_graph_filename): message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename_gfa): + if os.path.isfile(result_assembly_graph_filename_gfa): message = " * Assembly graph in GFA format is in " + support.process_spaces(result_assembly_graph_filename_gfa) log.info(message) - if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename): + if os.path.isfile(result_contigs_paths_filename): message = " * Paths in the assembly graph corresponding to the contigs are in " + \ 
support.process_spaces(result_contigs_paths_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_scaffolds_paths_filename): + if os.path.isfile(result_scaffolds_paths_filename): message = " * Paths in the assembly graph corresponding to the scaffolds are in " + \ support.process_spaces(result_scaffolds_paths_filename) log.info(message) diff -Nru spades-3.10.1+dfsg/spades.py spades-3.11.1+dfsg/spades.py --- spades-3.10.1+dfsg/spades.py 2017-02-28 14:55:56.000000000 +0000 +++ spades-3.11.1+dfsg/spades.py 2017-09-28 09:05:59.000000000 +0000 @@ -158,7 +158,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False): - skip_output_dir=secondary_filling + skip_output_dir = secondary_filling skip_stop_after = secondary_filling load_processed_dataset=secondary_filling @@ -211,6 +211,7 @@ options_storage.configs_dir = support.check_dir_existence(arg) elif opt == "--reference": options_storage.reference = support.check_file_existence(arg, 'reference', log) + options_storage.developer_mode = True elif opt == "--series-analysis": options_storage.series_analysis = support.check_file_existence(arg, 'series-analysis', log) elif opt == "--dataset": @@ -239,8 +240,19 @@ options_storage.large_genome = True elif opt == "--plasmid": options_storage.plasmid = True + elif opt == "--rna": options_storage.rna = True + elif opt.startswith("--ss-"): # strand specificity, RNA-Seq only + if opt == "--ss-rf": + options_storage.strand_specific = True + elif opt == "--ss-fr": + options_storage.strand_specific = False + elif opt == "--fast": # fast run, RNA-Seq only + options_storage.fast = True + elif opt == "--fast:false": + options_storage.fast = False + elif opt == "--iontorrent": options_storage.iontorrent = True elif opt == "--disable-gzip-output": @@ -295,6 +307,8 @@ else: support.error('wrong PHRED quality offset value: ' + arg + ' (should be either 33, 64, or \'auto\')', log) + elif opt == "--save-gp": + options_storage.save_gp = True elif opt == "--cov-cutoff": if arg == 'auto' or arg == 'off': options_storage.cov_cutoff = arg @@ -303,6 +317,12 @@ else: support.error('wrong value for --cov-cutoff option: ' + arg + ' (should be a positive float number, or \'auto\', or \'off\')', log) + elif opt == "--hidden-cov-cutoff": + if support.is_float(arg) and float(arg) > 0.0: + options_storage.lcer_cutoff = float(arg) + else: + support.error('wrong value for --hidden-cov-cutoff option: ' + arg + + ' (should be a positive float number)', log) elif opt == '-i' or opt == "--iterations": options_storage.iterations = int(arg) @@ -356,7 +376,7 @@ support.error("the output_dir should exist for --continue and for --restart-from!", log) os.makedirs(options_storage.output_dir) if options_storage.restart_from: - if options_storage.continue_mode: # saving parameters specified with --restart-from + if options_storage.continue_mode: # saving parameters specified with --restart-from if not support.dataset_is_empty(dataset_data): support.error("you cannot specify reads with --restart-from option!", log) options_storage.save_restart_options(log) @@ -370,6 +390,10 @@ support.error("you cannot specify --careful in RNA-Seq mode!", log) if options_storage.k_mers and options_storage.k_mers != 'auto' and len(options_storage.k_mers) > 1: support.error("you cannot specify multiple k-mer sizes in RNA-Seq mode!", log) + if [options_storage.meta, options_storage.large_genome, options_storage.truseq_mode, + options_storage.rna, options_storage.plasmid, options_storage.single_cell].count(True) > 1: + support.error("you 
cannot simultaneously use more than one mode out of " + "Metagenomic, Large genome, Illumina TruSeq, RNA-Seq, Plasmid, and Single-cell!", log) if options_storage.continue_mode: return None, None @@ -403,11 +427,12 @@ if len(dataset_data) != len(support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_RNA_SEQ)): support.error('you cannot specify any data types except ' + ', '.join(spades_logic.READS_TYPES_USED_IN_RNA_SEQ) + ' in RNA-Seq mode!') - if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1: - support.error('you cannot specify more than one paired-end library in RNA-Seq mode!') + #if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1: + # support.error('you cannot specify more than one paired-end library in RNA-Seq mode!') if existing_dataset_data is None: - pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w')) + pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'), + default_flow_style=False, default_style='"', width=float("inf")) options_storage.set_default_values() ### FILLING cfg @@ -454,12 +479,27 @@ if options_storage.k_mers: cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers elif options_storage.rna: - cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_RNA + k_value = options_storage.K_MERS_RNA[0] + if not options_storage.iontorrent: + k_value = int(support.get_reads_length(dataset_data, log) / 2) - 1 + if k_value % 2 == 0: + k_value -= 1 + if k_value < options_storage.MIN_K: + log.info("\n" + 'Default k value (' + str(k_value) + ') is too small, all k values should be between %d and %d. Setting k=%d.\n' + % (options_storage.MIN_K, options_storage.MAX_K, options_storage.MIN_K)) + k_value = options_storage.MIN_K + if k_value > options_storage.MAX_K: + log.info("\n" + 'Default k value (' + str(k_value) + ') is too large, all k values should be between %d and %d. 
Setting k=%d.\n' + % (options_storage.MIN_K, options_storage.MAX_K, options_storage.MAX_K)) + k_value = options_storage.MAX_K + cfg["assembly"].__dict__["iterative_K"] = k_value else: cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_SHORT cfg["assembly"].__dict__["disable_rr"] = options_storage.disable_rr cfg["assembly"].__dict__["diploid_mode"] = options_storage.diploid_mode cfg["assembly"].__dict__["cov_cutoff"] = options_storage.cov_cutoff + cfg["assembly"].__dict__["lcer_cutoff"] = options_storage.lcer_cutoff + cfg["assembly"].__dict__["save_gp"] = options_storage.save_gp if options_storage.spades_heap_check: cfg["assembly"].__dict__["heap_check"] = options_storage.spades_heap_check if options_storage.read_buffer_size: @@ -658,7 +698,8 @@ if support.dataset_has_additional_contigs(dataset_data): dataset_data = support.process_Ns_in_additional_contigs(dataset_data, dir_for_split_reads, log) options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml") - pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w')) + pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'), + default_flow_style=False, default_style='"', width=float("inf")) cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename try: @@ -894,29 +935,37 @@ if "assembly" in cfg and os.path.isfile(result_contigs_filename): message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename) log.info(message) - if options_storage.rna: - if "assembly" in cfg and os.path.isfile(result_transcripts_filename): + if options_storage.rna and "assembly" in cfg: + if os.path.isfile(result_transcripts_filename): message = " * Assembled transcripts are in " + support.process_spaces(result_transcripts_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_transcripts_paths_filename): + if os.path.isfile(result_transcripts_paths_filename): message = " * Paths in the assembly graph corresponding to the transcripts are in " + \ support.process_spaces(result_transcripts_paths_filename) log.info(message) - else: - if "assembly" in cfg and os.path.isfile(result_scaffolds_filename): + for filtering_type in options_storage.filtering_types: + result_filtered_transcripts_filename = os.path.join(cfg["common"].output_dir, + filtering_type + "_filtered_" + + options_storage.transcripts_name) + if os.path.isfile(result_filtered_transcripts_filename): + message = " * " + filtering_type.capitalize() + " filtered transcripts are in " + \ + support.process_spaces(result_filtered_transcripts_filename) + log.info(message) + elif "assembly" in cfg: + if os.path.isfile(result_scaffolds_filename): message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename): + if os.path.isfile(result_assembly_graph_filename): message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename_gfa): + if os.path.isfile(result_assembly_graph_filename_gfa): message = " * Assembly graph in GFA format is in " + support.process_spaces(result_assembly_graph_filename_gfa) log.info(message) - if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename): + if os.path.isfile(result_contigs_paths_filename): message = " * Paths in the assembly graph corresponding to the contigs are in " + \ 
support.process_spaces(result_contigs_paths_filename) log.info(message) - if "assembly" in cfg and os.path.isfile(result_scaffolds_paths_filename): + if os.path.isfile(result_scaffolds_paths_filename): message = " * Paths in the assembly graph corresponding to the scaffolds are in " + \ support.process_spaces(result_scaffolds_paths_filename) log.info(message) diff -Nru spades-3.10.1+dfsg/src/cmake/options.cmake spades-3.11.1+dfsg/src/cmake/options.cmake --- spades-3.10.1+dfsg/src/cmake/options.cmake 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/cmake/options.cmake 2017-09-28 09:05:14.000000000 +0000 @@ -41,6 +41,15 @@ set(Boost_USE_STATIC_RUNTIME ON) endif() +option(SPADES_USE_GPROF "gprof profiler" OFF) + +if (SPADES_USE_GPROF) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pg") + SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -pg") + SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -pg") + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pg") +endif() + # Define minimum and maximum K set(SPADES_MIN_K 1 CACHE INTEGER "Minimum k-mer length") set(SPADES_MAX_K 128 CACHE INTEGER "Maximum k-mer length") diff -Nru spades-3.10.1+dfsg/src/cmake/pack.cmake spades-3.11.1+dfsg/src/cmake/pack.cmake --- spades-3.10.1+dfsg/src/cmake/pack.cmake 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/cmake/pack.cmake 2017-09-28 09:05:14.000000000 +0000 @@ -12,9 +12,9 @@ set(CPACK_PACKAGE_VENDOR "Saint Petersburg State University") set(CPACK_PACKAGE_DESCRIPTION_FILE "${SPADES_MAIN_SRC_DIR}/../README") set(CPACK_RESOURCE_FILE_LICENSE "${SPADES_MAIN_SRC_DIR}/../LICENSE") -set(CPACK_PACKAGE_VERSION "3.10.1") +set(CPACK_PACKAGE_VERSION "3.11.1") set(CPACK_PACKAGE_VERSION_MAJOR "3") -set(CPACK_PACKAGE_VERSION_MINOR "10") +set(CPACK_PACKAGE_VERSION_MINOR "11") set(CPACK_PACKAGE_VERSION_PATCH "1") set(CPACK_STRIP_FILES bin/spades bin/hammer bin/ionhammer bin/dipspades bin/spades-bwa bin/corrector bin/scaffold_correction) diff -Nru spades-3.10.1+dfsg/src/CMakeLists.txt spades-3.11.1+dfsg/src/CMakeLists.txt --- spades-3.10.1+dfsg/src/CMakeLists.txt 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/CMakeLists.txt 2017-09-28 09:05:14.000000000 +0000 @@ -29,6 +29,11 @@ set(EXT_DIR "${CMAKE_SOURCE_DIR}/../ext") set(SPADES_CFG_DIR "${CMAKE_SOURCE_DIR}/../configs") +# Uncomment for gprof profiling +#SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pg") +#SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -pg") +#SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -pg") + # Everything option-dependent include(options) diff -Nru spades-3.10.1+dfsg/src/common/adt/array_vector.hpp spades-3.11.1+dfsg/src/common/adt/array_vector.hpp --- spades-3.10.1+dfsg/src/common/adt/array_vector.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/array_vector.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -15,45 +15,48 @@ #include #include +namespace adt { + template -class __array_vector_iterator; +class array_vector_iterator; template -class __array_reference; +class array_reference; template -class __array_const_reference; +class array_const_reference; template struct array_equal_to; + template -class __array { - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__storage_pointer __storage_pointer; - typedef typename _Cp::__const_storage_pointer __const_storage_pointer; - typedef typename _Cp::size_type __size_type; +class array { + typedef typename _Cp::storage_type storage_type; + typedef typename _Cp::storage_pointer storage_pointer; + typedef 
typename _Cp::const_storage_pointer const_storage_pointer; + typedef typename _Cp::size_type size_type; #if defined(__clang__) - friend typename _Cp::__self; + friend typename _Cp::self; #else - friend class _Cp::__self; + friend class _Cp::self; #endif - friend class __array_vector_iterator<_Cp, false>; + friend class array_vector_iterator<_Cp, false>; - friend class __array_reference<_Cp>; + friend class array_reference<_Cp>; - friend class __array_const_reference<_Cp>; + friend class array_const_reference<_Cp>; - __storage_pointer ptr_; - __size_type size_; + storage_pointer ptr_; + size_type size_; bool allocated; public: - ~__array() { + ~array() { if (allocated) delete[] ptr_; } @@ -63,53 +66,53 @@ } size_t data_size() const { - return size_ * sizeof(__storage_type); + return size_ * sizeof(storage_type); } - __storage_pointer data() const { + storage_pointer data() const { return ptr_; } - __array(const __array &that) { + array(const array &that) { size_ = that.size_; - ptr_ = new __storage_type[size_]; + ptr_ = new storage_type[size_]; allocated = true; memcpy(ptr_, that.ptr_, data_size()); } - __array(const __array_reference<_Cp> that) { + array(const array_reference<_Cp> that) { size_ = that.size(); - ptr_ = new __storage_type[size_]; + ptr_ = new storage_type[size_]; allocated = true; memcpy(ptr_, that.data(), data_size()); } - __array &operator=(const __array &that) { - __storage_pointer this_ptr = data(), that_ptr = that.data(); + array &operator=(const array &that) { + storage_pointer this_ptr = data(), that_ptr = that.data(); if (this_ptr != that_ptr) memcpy(this_ptr, that_ptr, data_size()); return *this; } - __array &operator=(const __array_reference<_Cp> that) { - __storage_pointer this_ptr = data(), that_ptr = that.data(); + array &operator=(const array_reference<_Cp> that) { + storage_pointer this_ptr = data(), that_ptr = that.data(); if (this_ptr != that_ptr) memcpy(this_ptr, that_ptr, data_size()); return *this; } - __array &operator=(__const_storage_pointer that_ptr) { - __storage_pointer this_ptr = data(); + array &operator=(const_storage_pointer that_ptr) { + storage_pointer this_ptr = data(); if (this_ptr != that_ptr) memcpy(this_ptr, that_ptr, data_size()); return *this; } - bool operator<(const __array &that) const { - __storage_pointer this_ptr = data(), that_ptr = that.data(); + bool operator<(const array &that) const { + storage_pointer this_ptr = data(), that_ptr = that.data(); for (size_t i = 0; i < size(); ++i) { if (this_ptr[i] != that_ptr[i]) @@ -119,8 +122,8 @@ return false; } - bool operator<(const __array_reference<_Cp> that) const { - __storage_pointer this_ptr = data(), that_ptr = that.data(); + bool operator<(const array_reference<_Cp> that) const { + storage_pointer this_ptr = data(), that_ptr = that.data(); for (size_t i = 0; i < size(); ++i) { if (this_ptr[i] != that_ptr[i]) @@ -130,8 +133,8 @@ return false; } - bool operator==(const __array &that) const { - __storage_pointer this_ptr = data(), that_ptr = that.data(); + bool operator==(const array &that) const { + storage_pointer this_ptr = data(), that_ptr = that.data(); for (size_t i = 0; i < size(); ++i) { if (this_ptr[i] != that_ptr[i]) @@ -141,8 +144,8 @@ return true; } - bool operator==(const __array_reference<_Cp> that) const { - __storage_pointer this_ptr = data(), that_ptr = that.data(); + bool operator==(const array_reference<_Cp> that) const { + storage_pointer this_ptr = data(), that_ptr = that.data(); for (size_t i = 0; i < size(); ++i) { if (this_ptr[i] != that_ptr[i]) @@ -152,42 
+155,43 @@ return true; } - bool operator!=(const __array &that) const { + bool operator!=(const array &that) const { return !operator==(that); } - bool operator!=(const __array_reference<_Cp> that) const { + bool operator!=(const array_reference<_Cp> that) const { return !operator==(that); } private: - __array(__storage_pointer p, __size_type sz) : + array(storage_pointer p, size_type sz) : ptr_(p), size_(sz), allocated(false) { } }; + template -class __array_reference { - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__storage_pointer __storage_pointer; - typedef typename _Cp::__const_storage_pointer __const_storage_pointer; - typedef typename _Cp::size_type __size_type; +class array_reference { + typedef typename _Cp::storage_type storage_type; + typedef typename _Cp::storage_pointer storage_pointer; + typedef typename _Cp::const_storage_pointer const_storage_pointer; + typedef typename _Cp::size_type size_type; #if defined(__clang__) - friend typename _Cp::__self; + friend typename _Cp::self; #else - friend class _Cp::__self; + friend class _Cp::self; #endif - friend class __array_vector_iterator<_Cp, false>; + friend class array_vector_iterator<_Cp, false>; - friend class __array<_Cp>; + friend class array<_Cp>; - friend struct array_equal_to<__storage_type>; + friend struct adt::array_equal_to; - __storage_pointer ptr_; - __size_type size_; + storage_pointer ptr_; + size_type size_; public: size_t size() const { @@ -195,39 +199,39 @@ } size_t data_size() const { - return size() * sizeof(__storage_type); + return size() * sizeof(storage_type); } - __storage_pointer data() const { + storage_pointer data() const { return ptr_; } - __array_reference &operator=(const __array<_Cp> &that) { - __storage_pointer this_ptr = data(), that_ptr = that.data(); + array_reference &operator=(const array<_Cp> &that) { + storage_pointer this_ptr = data(), that_ptr = that.data(); if (this_ptr != that_ptr) memcpy(this_ptr, that_ptr, data_size()); return *this; } - __array_reference &operator=(__const_storage_pointer that_ptr) { - __storage_pointer this_ptr = data(); + array_reference &operator=(const_storage_pointer that_ptr) { + storage_pointer this_ptr = data(); if (this_ptr != that_ptr) memcpy(this_ptr, that_ptr, data_size()); return *this; } - __array_reference &operator=(const __array_reference that) { - __storage_pointer this_ptr = data(), that_ptr = that.data(); + array_reference &operator=(const array_reference that) { + storage_pointer this_ptr = data(), that_ptr = that.data(); if (this_ptr != that_ptr) memcpy(this_ptr, that_ptr, data_size()); return *this; } - bool operator<(const __array<_Cp> &that) const { - __storage_pointer this_ptr = data(), that_ptr = that.data(); + bool operator<(const array<_Cp> &that) const { + storage_pointer this_ptr = data(), that_ptr = that.data(); for (size_t i = 0; i < size(); ++i) { if (this_ptr[i] != that_ptr[i]) @@ -237,8 +241,8 @@ return false; } - bool operator<(const __array_reference that) const { - __storage_pointer this_ptr = data(), that_ptr = that.data(); + bool operator<(const array_reference that) const { + storage_pointer this_ptr = data(), that_ptr = that.data(); for (size_t i = 0; i < size(); ++i) { if (this_ptr[i] != that_ptr[i]) @@ -248,8 +252,8 @@ return false; } - bool operator==(const __array<_Cp> &that) const { - __storage_pointer this_ptr = data(), that_ptr = that.data(); + bool operator==(const array<_Cp> &that) const { + storage_pointer this_ptr = data(), that_ptr = that.data(); for (size_t i = 0; i < size(); 
++i) { if (this_ptr[i] != that_ptr[i]) @@ -259,8 +263,8 @@ return true; } - bool operator==(const __array_reference that) const { - __storage_pointer this_ptr = data(), that_ptr = that.data(); + bool operator==(const array_reference that) const { + storage_pointer this_ptr = data(), that_ptr = that.data(); for (size_t i = 0; i < size(); ++i) { if (this_ptr[i] != that_ptr[i]) @@ -270,40 +274,40 @@ return true; } - bool operator!=(const __array_reference that) const { + bool operator!=(const array_reference that) const { return !operator==(that); } - bool operator!=(const __array<_Cp> &that) const { + bool operator!=(const array<_Cp> &that) const { return !operator==(that); } private: - __array_reference(__storage_pointer p, __size_type sz) : + array_reference(storage_pointer p, size_type sz) : ptr_(p), size_(sz) { } }; template -class __array_const_reference { - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__storage_pointer __storage_pointer; - typedef typename _Cp::__const_storage_pointer __const_storage_pointer; - typedef typename _Cp::size_type __size_type; +class array_const_reference { + typedef typename _Cp::storage_type storage_type; + typedef typename _Cp::storage_pointer storage_pointer; + typedef typename _Cp::const_storage_pointer const_storage_pointer; + typedef typename _Cp::size_type size_type; #if defined(__clang__) - friend typename _Cp::__self; + friend typename _Cp::self; #else - friend class _Cp::__self; + friend class _Cp::self; #endif - friend class __array_vector_iterator<_Cp, true>; + friend class array_vector_iterator<_Cp, true>; - friend struct array_equal_to<__storage_type>; + friend struct adt::array_equal_to; - __const_storage_pointer ptr_; - __size_type size_; + const_storage_pointer ptr_; + size_type size_; public: size_t size() const { @@ -311,18 +315,18 @@ } size_t data_size() const { - return size() * sizeof(__storage_type); + return size() * sizeof(storage_type); } - __const_storage_pointer data() const { + const_storage_pointer data() const { return ptr_; } - __array_const_reference(const __array_const_reference &that) + array_const_reference(const array_const_reference &that) : ptr_(that.ptr_), size_(that.size_) { } - bool operator<(__array_const_reference that) const { - const __storage_pointer this_ptr = data(), that_ptr = that.data(); + bool operator<(array_const_reference that) const { + const storage_pointer this_ptr = data(), that_ptr = that.data(); for (size_t i = 0; i < size(); ++i) { if (this_ptr[i] != that_ptr[i]) @@ -332,8 +336,8 @@ return false; } - bool operator==(__array_const_reference that) const { - const __storage_pointer this_ptr = data(), that_ptr = that.data(); + bool operator==(array_const_reference that) const { + const storage_pointer this_ptr = data(), that_ptr = that.data(); for (size_t i = 0; i < size(); ++i) { if (this_ptr[i] != that_ptr[i]) @@ -343,9 +347,9 @@ return true; } - bool operator==(const __array_reference<_Cp> that) const { - __const_storage_pointer this_ptr = data(); - const __storage_pointer that_ptr = that.data(); + bool operator==(const array_reference<_Cp> that) const { + const_storage_pointer this_ptr = data(); + const storage_pointer that_ptr = that.data(); for (size_t i = 0; i < size(); ++i) { if (this_ptr[i] != that_ptr[i]) @@ -355,26 +359,27 @@ return true; } - bool operator!=(const __array_const_reference that) const { + bool operator!=(const array_const_reference that) const { return !operator==(that); } - bool operator!=(const __array_reference<_Cp> that) const { + bool 
operator!=(const array_reference<_Cp> that) const { return !operator==(that); } private: - __array_const_reference(__const_storage_pointer p, __size_type sz) : + array_const_reference(const_storage_pointer p, size_type sz) : ptr_(p), size_(sz) { } - __array_const_reference &operator=(const __array_const_reference &that); + array_const_reference &operator=(const array_const_reference &that); }; +} // This is hack. Never do this again! #ifdef __GLIBCXX__ namespace std { template - struct __are_same<__array_reference<_Cp>, __array<_Cp> &> { + struct __are_same, adt::array<_Cp> &> { enum { __value = 1 }; @@ -382,7 +387,7 @@ }; template - struct __are_same<__array<_Cp> &, __array_reference<_Cp> > { + struct __are_same &, adt::array_reference<_Cp> > { enum { __value = 1 }; @@ -390,50 +395,51 @@ }; } #endif +namespace adt { template -void swap(__array_reference<_Cp> lhs, __array_reference<_Cp> rhs) { +void swap(array_reference<_Cp> lhs, array_reference<_Cp> rhs) { std::swap_ranges(lhs.data(), lhs.data() + lhs.size(), rhs.data()); } template -void swap(__array<_Cp> &lhs, __array_reference<_Cp> rhs) { +void swap(array<_Cp> &lhs, array_reference<_Cp> rhs) { std::swap_ranges(lhs.data(), lhs.data() + lhs.size(), rhs.data()); } template -void swap(__array_reference<_Cp> lhs, __array<_Cp> &rhs) { +void swap(array_reference<_Cp> lhs, array<_Cp> &rhs) { std::swap_ranges(lhs.data(), lhs.data() + lhs.size(), rhs.data()); } template -class __array_vector_iterator { +class array_vector_iterator { public: typedef typename _Cp::difference_type difference_type; - typedef __array_vector_iterator pointer; - typedef typename std::conditional<_IsConst, __array_const_reference<_Cp>, __array_reference<_Cp> >::type reference; - typedef __array<_Cp> value_type; + typedef array_vector_iterator pointer; + typedef typename std::conditional<_IsConst, array_const_reference<_Cp>, array_reference<_Cp> >::type reference; + typedef array<_Cp> value_type; typedef std::random_access_iterator_tag iterator_category; private: - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__storage_pointer __storage_pointer; - typedef typename _Cp::size_type __size_type; + typedef typename _Cp::storage_type storage_type; + typedef typename _Cp::storage_pointer storage_pointer; + typedef typename _Cp::size_type size_type; #if defined(__clang__) - friend typename _Cp::__self; + friend typename _Cp::self; #else - friend class _Cp::__self; + friend class _Cp::self; #endif - __storage_pointer data_; - __size_type el_sz_; + storage_pointer data_; + size_type el_sz_; public: - __array_vector_iterator(__storage_pointer data, __size_type el_sz) + array_vector_iterator(storage_pointer data, size_type el_sz) : data_(data), el_sz_(el_sz) { } size_t size() const { @@ -441,10 +447,10 @@ } size_t data_size() const { - return el_sz_ * sizeof(__storage_type); + return el_sz_ * sizeof(storage_type); } - __storage_pointer data() const { + storage_pointer data() const { return data_; } @@ -456,86 +462,86 @@ return *(*this + n); } - __array_vector_iterator &operator++() { + array_vector_iterator &operator++() { data_ += el_sz_; return *this; } - __array_vector_iterator &operator--() { + array_vector_iterator &operator--() { data_ -= el_sz_; return *this; } - __array_vector_iterator operator++(int) { - __array_vector_iterator res = *this; + array_vector_iterator operator++(int) { + array_vector_iterator res = *this; data_ += el_sz_; return res; } - __array_vector_iterator operator--(int) { - __array_vector_iterator res = *this; + 
array_vector_iterator operator--(int) { + array_vector_iterator res = *this; data_ -= el_sz_; return res; } - __array_vector_iterator operator+(const difference_type &n) const { - return __array_vector_iterator(data_ + n * el_sz_, el_sz_); + array_vector_iterator operator+(const difference_type &n) const { + return array_vector_iterator(data_ + n * el_sz_, el_sz_); } - __array_vector_iterator &operator+=(const difference_type &n) { + array_vector_iterator &operator+=(const difference_type &n) { data_ += n * el_sz_; return *this; } - __array_vector_iterator operator-(const difference_type &n) const { - return __array_vector_iterator(data_ - n * el_sz_, el_sz_); + array_vector_iterator operator-(const difference_type &n) const { + return array_vector_iterator(data_ - n * el_sz_, el_sz_); } - __array_vector_iterator &operator-=(const difference_type &n) { + array_vector_iterator &operator-=(const difference_type &n) { data_ -= n * el_sz_; return *this; } - friend bool operator==(const __array_vector_iterator &r1, - const __array_vector_iterator &r2) { + friend bool operator==(const array_vector_iterator &r1, + const array_vector_iterator &r2) { return r1.data_ == r2.data_; } - friend bool operator!=(const __array_vector_iterator &r1, - const __array_vector_iterator &r2) { + friend bool operator!=(const array_vector_iterator &r1, + const array_vector_iterator &r2) { return r1.data_ != r2.data_; } - friend bool operator<(const __array_vector_iterator &r1, - const __array_vector_iterator &r2) { + friend bool operator<(const array_vector_iterator &r1, + const array_vector_iterator &r2) { return r1.data_ < r2.data_; } - friend bool operator<=(const __array_vector_iterator &r1, - const __array_vector_iterator &r2) { + friend bool operator<=(const array_vector_iterator &r1, + const array_vector_iterator &r2) { return r1.data_ <= r2.data_; } - friend bool operator>(const __array_vector_iterator &r1, - const __array_vector_iterator &r2) { + friend bool operator>(const array_vector_iterator &r1, + const array_vector_iterator &r2) { return r1.data_ > r2.data_; } - friend bool operator>=(const __array_vector_iterator &r1, - const __array_vector_iterator &r2) { + friend bool operator>=(const array_vector_iterator &r1, + const array_vector_iterator &r2) { return r1.data_ >= r2.data_; } - friend __array_vector_iterator + friend array_vector_iterator operator+(difference_type n, - const __array_vector_iterator &r2) { + const array_vector_iterator &r2) { return r2 + n; } friend difference_type - operator-(const __array_vector_iterator &r1, - const __array_vector_iterator &r2) { + operator-(const array_vector_iterator &r1, + const array_vector_iterator &r2) { return (r1.data_ - r2.data_) / r1.el_sz_; } }; @@ -546,34 +552,34 @@ typedef size_t size_type; typedef ptrdiff_t difference_type; - typedef __array_reference reference; - typedef __array_const_reference const_reference; - typedef __array value_type; - typedef __array_vector_iterator iterator; - typedef __array_vector_iterator const_iterator; + typedef array_reference reference; + typedef array_const_reference const_reference; + typedef array value_type; + typedef array_vector_iterator iterator; + typedef array_vector_iterator const_iterator; private: - typedef ElTy __storage_type; - typedef array_vector __self; - typedef __storage_type *__storage_pointer; - typedef const __storage_type *__const_storage_pointer; + typedef ElTy storage_type; + typedef array_vector self; + typedef storage_type *storage_pointer; + typedef const storage_type 
*const_storage_pointer; - friend class __array<__self>; + friend class array; - friend class __array_reference<__self>; + friend class array_reference; - friend class __array_const_reference<__self>; + friend class array_const_reference; - friend class __array_vector_iterator<__self, true>; + friend class array_vector_iterator; - friend class __array_vector_iterator<__self, false>; + friend class array_vector_iterator; - __storage_pointer data_; + storage_pointer data_; size_type size_; size_type el_sz_; public: - array_vector(__storage_pointer data, size_type sz, size_type el_sz) + array_vector(storage_pointer data, size_type sz, size_type el_sz) : data_(data), size_(sz), el_sz_(el_sz) { } reference operator[](size_t pos) { @@ -610,13 +616,13 @@ size_t size() const { return size_; } - __storage_pointer data() const { return data_; } + storage_pointer data() const { return data_; } void set_size(size_t size) { size_ = size; } - void set_data(__storage_pointer data) { + void set_data(storage_pointer data) { data_ = data; } }; @@ -674,4 +680,5 @@ } }; +} //adt #endif diff -Nru spades-3.10.1+dfsg/src/common/adt/bag.hpp spades-3.11.1+dfsg/src/common/adt/bag.hpp --- spades-3.10.1+dfsg/src/common/adt/bag.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/bag.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -9,29 +9,31 @@ #include "utils/verify.hpp" +namespace adt { + template> class bag { typedef std::unordered_map Data; Data data_; size_t size_; public: - + bag() : size_(0) { } typedef typename Data::const_iterator const_iterator; - void put(const T& t, size_t mult) { + void put(const T &t, size_t mult) { VERIFY(mult > 0); data_[t] += mult; size_ += mult; } - void put(const T& t) { + void put(const T &t) { put(t, 1); } - bool take(const T& t, size_t mult) { + bool take(const T &t, size_t mult) { VERIFY(mult > 0); /*typename map::iterator*/auto it = data_.find(t); if (it == data_.end()) { @@ -54,11 +56,11 @@ } } - bool take(const T& t) { + bool take(const T &t) { return take(t, 1); } - size_t mult(const T& t) const { + size_t mult(const T &t) const { auto it = data_.find(t); if (it == data_.end()) { return 0; @@ -85,3 +87,5 @@ } }; + +} //adt \ No newline at end of file diff -Nru spades-3.10.1+dfsg/src/common/adt/bf.hpp spades-3.11.1+dfsg/src/common/adt/bf.hpp --- spades-3.10.1+dfsg/src/common/adt/bf.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/bf.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -11,164 +11,167 @@ /// The counting Bloom filter. template class counting_bloom_filter { - counting_bloom_filter(const counting_bloom_filter&) = delete; - counting_bloom_filter& operator=(const counting_bloom_filter&) = delete; + counting_bloom_filter(const counting_bloom_filter &) = delete; + counting_bloom_filter &operator=(const counting_bloom_filter &) = delete; protected: - static constexpr uint64_t cell_mask_ = (1ull << width_) - 1; - static constexpr size_t cells_per_entry_ = 8 * sizeof(uint64_t) / width_; + static constexpr uint64_t cell_mask_ = (1ull << width_) - 1; + static constexpr size_t cells_per_entry_ = 8 * sizeof(uint64_t) / width_; public: - /// The hash digest type. - typedef size_t digest; + /// The hash digest type.F + typedef size_t digest; - /// The hash function type. - typedef std::function hasher; + /// The hash function type. + typedef std::function hasher; - counting_bloom_filter() = default; - ~counting_bloom_filter() = default; + counting_bloom_filter() = default; - /// Constructs a counting Bloom filter. 
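Note on array_vector.hpp (whose diff concludes above): the change is a mechanical rename away from the reserved `__`-prefixed identifiers plus a move into `namespace adt`; behaviour is unchanged. A minimal usage sketch of the renamed container, assuming the class template is `adt::array_vector<ElTy>` as the `typedef ElTy storage_type` suggests; the include path is an assumption, not part of the patch:

    #include "common/adt/array_vector.hpp"  // include path is an assumption
    #include <cassert>
    #include <cstdint>

    int main() {
        // array_vector is a non-owning view: 4 logical elements,
        // each el_sz = 2 uint64_t words wide, over external storage.
        uint64_t buf[4 * 2] = {};
        adt::array_vector<uint64_t> v(buf, /*sz=*/4, /*el_sz=*/2);

        uint64_t words[2] = {0xDEADBEEFull, 1};
        v[0] = words;           // array_reference::operator=(const_storage_pointer): memcpy of el_sz words
        v[1] = v[0];            // reference-to-reference assignment, also a memcpy
        assert(v[0] == v[1]);   // word-wise comparison
        assert(v.size() == 4);
        return 0;
    }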
- /// @param h The hasher. - /// @param cells The number of cells. - /// @param num_hashes The number of hash functions to use - /// The memory consumption will be cells * width bits - counting_bloom_filter(hasher h, - size_t cells, size_t num_hashes = 3) - : hasher_(std::move(h)), - num_hashes_(num_hashes), - cells_(cells), - data_((cells * width_ + 8 * sizeof(uint64_t) - 1)/ 8 / sizeof(uint64_t)) { - static_assert((width_ & (width_ - 1)) == 0, "Width must be power of two"); - } - - /// Move-constructs a counting Bloom filter. - counting_bloom_filter(counting_bloom_filter&&) = default; - - /// Adds an element to the Bloom filter. - /// @tparam T The type of the element to insert. - /// @param x An instance of type `T`. - void add(const T &o) { - for (size_t i = 0; i < num_hashes_; ++i) { - digest d = hasher_(o, i); - size_t cell_id = d - cells_ * (d / cells_); // Use division here in order to test stuff like libidivide - size_t pos = cell_id / cells_per_entry_; - size_t epos = cell_id - pos * cells_per_entry_; - auto &entry = data_[pos]; - uint64_t mask = cell_mask_ << (width_ * epos); - - // Add counter - while (true) { - uint64_t val = entry.load(); - - // Overflow, do nothing - if ((val & mask) == mask) - break; - - uint64_t newval = val + (1ull << (width_ * epos)); - if (!entry.compare_exchange_strong(val, newval)) - continue; - - break; - } - - } - } - - /// Retrieves the count of an element. - /// @tparam T The type of the element to query. - /// @param x An instance of type `T`. - /// @return A frequency estimate for *x*. - size_t lookup(const T &o) const { - size_t val = (1ull << width_) - 1; - for (size_t i = 0; i < num_hashes_; ++i) { - digest d = hasher_(o, i); - size_t cell_id = d - cells_ * (d / cells_); // Use division here in order to test stuff like libidivide - size_t pos = cell_id / cells_per_entry_; - size_t epos = cell_id - pos * cells_per_entry_; - size_t cval = (data_[pos] >> (width_ * epos)) & cell_mask_; - if (val > cval) - val = cval; - } - - return val; - } - - /// Removes all items from the Bloom filter. - void clear() { - std::fill(data_.begin(), data_.end(), 0); - } + ~counting_bloom_filter() = default; + + /// Constructs a counting Bloom filter. + /// @param h The hasher. + /// @param cells The number of cells. + /// @param num_hashes The number of hash functions to use + /// The memory consumption will be cells * width bits + counting_bloom_filter(hasher h, + size_t cells, size_t num_hashes = 3) + : hasher_(std::move(h)), + num_hashes_(num_hashes), + cells_(cells), + data_((cells * width_ + 8 * sizeof(uint64_t) - 1) / 8 / sizeof(uint64_t)) { + static_assert((width_ & (width_ - 1)) == 0, "Width must be power of two"); + } + + /// Move-constructs a counting Bloom filter. + counting_bloom_filter(counting_bloom_filter &&) = default; + + /// Adds an element to the Bloom filter. + /// @tparam T The type of the element to insert. + /// @param x An instance of type `T`. 
+ void add(const T &o) { + for (size_t i = 0; i < num_hashes_; ++i) { + digest d = hasher_(o, i); + size_t cell_id = d - cells_ * (d / cells_); // Use division here in order to test stuff like libidivide + size_t pos = cell_id / cells_per_entry_; + size_t epos = cell_id - pos * cells_per_entry_; + auto &entry = data_[pos]; + uint64_t mask = cell_mask_ << (width_ * epos); + + // Add counter + while (true) { + uint64_t val = entry.load(); + + // Overflow, do nothing + if ((val & mask) == mask) + break; + + uint64_t newval = val + (1ull << (width_ * epos)); + if (!entry.compare_exchange_strong(val, newval)) + continue; + + break; + } + + } + } + + /// Retrieves the count of an element. + /// @tparam T The type of the element to query. + /// @param x An instance of type `T`. + /// @return A frequency estimate for *x*. + size_t lookup(const T &o) const { + size_t val = (1ull << width_) - 1; + for (size_t i = 0; i < num_hashes_; ++i) { + digest d = hasher_(o, i); + size_t cell_id = d - cells_ * (d / cells_); // Use division here in order to test stuff like libidivide + size_t pos = cell_id / cells_per_entry_; + size_t epos = cell_id - pos * cells_per_entry_; + size_t cval = (data_[pos] >> (width_ * epos)) & cell_mask_; + if (val > cval) + val = cval; + } + + return val; + } + + /// Removes all items from the Bloom filter. + void clear() { + std::fill(data_.begin(), data_.end(), 0); + } protected: - hasher hasher_; - size_t num_hashes_; - size_t cells_; - std::vector> data_; + hasher hasher_; + size_t num_hashes_; + size_t cells_; + std::vector> data_; }; /// The counting Bloom filter. template class bitcounting_bloom_filter : public counting_bloom_filter { - using typename counting_bloom_filter::digest; - using typename counting_bloom_filter::hasher; - - public: - bitcounting_bloom_filter(hasher h, - size_t cells, size_t num_hashes = 3) - : counting_bloom_filter(h, cells, num_hashes) {} - - /// Adds an element to the Bloom filter. - /// @tparam T The type of the element to insert. - /// @param x An instance of type `T`. - void add(const T &o) { - for (size_t i = 0; i < this->num_hashes_; ++i) { - digest d = this->hasher_(o, i); - size_t cell_id = d - this->cells_ * (d / this->cells_); // Use division here in order to test stuff like libidivide - size_t pos = cell_id / this->cells_per_entry_; - size_t epos = cell_id - pos * this->cells_per_entry_; - auto &entry = this->data_[pos]; - uint64_t mask = this->cell_mask_ << (width_ * epos); - - // Add counter - while (true) { - uint64_t val = entry.load() & mask; - - // Overflow, do nothing - if (val == mask) - break; - - uint64_t cellval = val >> width_ * epos; - size_t cnt = (cellval == 0 ? 0 : 64 - __builtin_clzll(cellval)) + width_ * epos; - - if ((std::atomic_fetch_or(&entry, uint64_t(1) << cnt) & mask) != val) - continue; - - break; - } - } - } - - /// Retrieves the count of an element. - /// @tparam T The type of the element to query. - /// @param x An instance of type `T`. - /// @return A frequency estimate for *x*. - size_t lookup(const T &o) const { - size_t val = (1ull << width_) - 1; - for (size_t i = 0; i < this->num_hashes_; ++i) { - digest d = this->hasher_(o, i); - size_t cell_id = d - this->cells_ * (d / this->cells_); // Use division here in order to test stuff like libidivide - size_t pos = cell_id / this->cells_per_entry_; - size_t epos = cell_id - pos * this->cells_per_entry_; - uint64_t entry = (this->data_[pos] >> (width_ * epos)) & this->cell_mask_; - size_t cval = (entry == 0 ? 
0 : 64 - __builtin_clzll(entry)); - - if (val > cval) - val = cval; + using typename counting_bloom_filter::digest; + using typename counting_bloom_filter::hasher; + +public: + bitcounting_bloom_filter(hasher h, + size_t cells, size_t num_hashes = 3) + : counting_bloom_filter(h, cells, num_hashes) { } + + /// Adds an element to the Bloom filter. + /// @tparam T The type of the element to insert. + /// @param x An instance of type `T`. + void add(const T &o) { + for (size_t i = 0; i < this->num_hashes_; ++i) { + digest d = this->hasher_(o, i); + size_t cell_id = d - this->cells_ * + (d / this->cells_); // Use division here in order to test stuff like libidivide + size_t pos = cell_id / this->cells_per_entry_; + size_t epos = cell_id - pos * this->cells_per_entry_; + auto &entry = this->data_[pos]; + uint64_t mask = this->cell_mask_ << (width_ * epos); + + // Add counter + while (true) { + uint64_t val = entry.load() & mask; + + // Overflow, do nothing + if (val == mask) + break; + + uint64_t cellval = val >> width_ * epos; + size_t cnt = (cellval == 0 ? 0 : 64 - __builtin_clzll(cellval)) + width_ * epos; + + if ((std::atomic_fetch_or(&entry, uint64_t(1) << cnt) & mask) != val) + continue; + + break; + } + } } - return val; - } + /// Retrieves the count of an element. + /// @tparam T The type of the element to query. + /// @param x An instance of type `T`. + /// @return A frequency estimate for *x*. + size_t lookup(const T &o) const { + size_t val = (1ull << width_) - 1; + for (size_t i = 0; i < this->num_hashes_; ++i) { + digest d = this->hasher_(o, i); + size_t cell_id = d - this->cells_ * + (d / this->cells_); // Use division here in order to test stuff like libidivide + size_t pos = cell_id / this->cells_per_entry_; + size_t epos = cell_id - pos * this->cells_per_entry_; + uint64_t entry = (this->data_[pos] >> (width_ * epos)) & this->cell_mask_; + size_t cval = (entry == 0 ? 
0 : 64 - __builtin_clzll(entry)); + + if (val > cval) + val = cval; + } + + return val; + } }; -} // namespace bf +} // namespace bf \ No newline at end of file diff -Nru spades-3.10.1+dfsg/src/common/adt/chained_iterator.hpp spades-3.11.1+dfsg/src/common/adt/chained_iterator.hpp --- spades-3.10.1+dfsg/src/common/adt/chained_iterator.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/chained_iterator.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -13,6 +13,8 @@ #include #include +namespace adt { + template class chained_iterator : public boost::iterator_facade, @@ -72,5 +74,5 @@ std::vector ends_; }; - +} //adt #endif diff -Nru spades-3.10.1+dfsg/src/common/adt/concurrent_dsu.hpp spades-3.11.1+dfsg/src/common/adt/concurrent_dsu.hpp --- spades-3.10.1+dfsg/src/common/adt/concurrent_dsu.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/concurrent_dsu.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -26,6 +26,8 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" +namespace dsu { + class ConcurrentDSU { struct atomic_set_t { uint64_t data : 61; @@ -209,12 +211,12 @@ std::unordered_map sizes; #if 0 - for (size_t x = 0; x < size; ++x) { - if (data_[x].parent != x) { - size_t t = data_[x].parent; - VERIFY(data_[t].parent == t) + for (size_t x = 0; x < size; ++x) { + if (data_[x].parent != x) { + size_t t = data_[x].parent; + VERIFY(data_[t].parent == t) + } } - } #endif // Insert all the root elements into the map @@ -256,7 +258,7 @@ os.close(); // Write down the sizes - MMappedRecordWriter index(Prefix + ".idx"); + MMappedRecordWriter index(Prefix + ".idx"); index.reserve(sizes.size()); size_t *idx = index.data(); for (size_t x = 0, i = 0, sz = 0; x < data_.size(); ++x) { @@ -292,6 +294,7 @@ mutable std::vector > data_; }; +} //dsu #pragma GCC diagnostic pop #endif /* CONCURRENTDSU_HPP_ */ diff -Nru spades-3.10.1+dfsg/src/common/adt/filter_iterator.hpp spades-3.11.1+dfsg/src/common/adt/filter_iterator.hpp --- spades-3.10.1+dfsg/src/common/adt/filter_iterator.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/filter_iterator.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -8,42 +8,44 @@ #ifndef FILTER_ITERATOR_H_ #define FILTER_ITERATOR_H_ +namespace adt { + /** - * Iterator with some predicate -- iterates only on elements with predicate(item) == true - */ +* Iterator with some predicate -- iterates only on elements with predicate(item) == true +*/ template class filter_iterator { public: typedef typename iterator_type::value_type value_type; - filter_iterator(const iterator_type& begin, const iterator_type& end, const predicate_type& pred): - current_(begin), end_(end), pred_(pred) - { - while((current_ != end_) && (!pred_(*current_))) // why do we need here? DRY, see method advance() below. + filter_iterator(const iterator_type &begin, const iterator_type &end, const predicate_type &pred) : + current_(begin), end_(end), pred_(pred) { + while ((current_ != end_) && (!pred_(*current_))) // why do we need here? DRY, see method advance() below. 
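Note on bf.hpp (whose diff concludes above): the hunks reindent the counting Bloom filter without changing its logic; `lookup` still returns the minimum cell value over the `num_hashes_` probe positions, so it can only overestimate a count. A usage sketch; the template parameter list (element type plus cell width `width_`) and the hasher signature `digest(const T &, size_t seed)` are inferred from the body, and the include path is an assumption:

    #include "common/adt/bf.hpp"  // include path is an assumption
    #include <functional>
    #include <iostream>

    int main() {
        // The seed argument must make the num_hashes_ digests independent.
        auto h = [](const int &x, size_t seed) -> size_t {
            return std::hash<int>()(x) ^ (seed * 0x9e3779b97f4a7c15ull);
        };
        // 1024 cells of 4 bits each (width must be a power of two), 3 hashes.
        bf::counting_bloom_filter<int, 4> filter(h, 1024, 3);

        filter.add(42);
        filter.add(42);
        std::cout << filter.lookup(42) << "\n";  // at least 2; counters saturate at 15
        std::cout << filter.lookup(7) << "\n";   // most likely 0
        return 0;
    }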
++current_; } // filter_iterator value_type operator*() const { return *current_; } value_type operator->() const { return *current_; } - filter_iterator& operator++() { advance(); return *this; } + filter_iterator &operator++() { + advance(); + return *this; + } - bool operator==(const filter_iterator& rhs) const { return current_ == rhs.current_; } - bool operator!=(const filter_iterator& rhs) const { return !(operator==(rhs)); } + bool operator==(const filter_iterator &rhs) const { return current_ == rhs.current_; } + bool operator!=(const filter_iterator &rhs) const { return !(operator==(rhs)); } private: - void advance() - { - do - { + void advance() { + do { ++current_; } - while((current_ != end_) && (!pred_(*current_))); + while ((current_ != end_) && (!pred_(*current_))); } // advance iterator_type current_; iterator_type end_; predicate_type pred_; }; - +} //adt #endif /* FILTER_ITERATOR_H_ */ diff -Nru spades-3.10.1+dfsg/src/common/adt/flat_map.hpp spades-3.11.1+dfsg/src/common/adt/flat_map.hpp --- spades-3.10.1+dfsg/src/common/adt/flat_map.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/flat_map.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -15,14 +15,16 @@ typedef V mapped_type; typedef std::pair value_type; typedef Comp key_compare; + struct value_compare : std::binary_function { - bool operator()(const value_type & lhs, const value_type & rhs) const { + bool operator()(const value_type &lhs, const value_type &rhs) const { return key_compare()(lhs.first, rhs.first); } }; + typedef Allocator allocator_type; - typedef V& reference; - typedef const V& const_reference; + typedef V &reference; + typedef const V &const_reference; typedef typename std::allocator_traits::pointer pointer; typedef typename std::allocator_traits::const_pointer const_pointer; typedef std::vector container_type; @@ -34,10 +36,12 @@ typedef typename container_type::size_type size_type; flat_map() = default; + template flat_map(It begin, It end) { insert(begin, end); } + flat_map(std::initializer_list init) - : flat_map(init.begin(), init.end()) {} + : flat_map(init.begin(), init.end()) { } iterator begin() { return data_.begin(); } iterator end() { return data_.end(); } @@ -56,11 +60,11 @@ size_type size() const { return data_.size(); } size_type max_size() const { return data_.max_size(); } size_type capacity() const { return data_.capacity(); } - void reserve(size_type size) {data_.reserve(size); } + void reserve(size_type size) { data_.reserve(size); } void shrink_to_fit() { data_.shrink_to_fit(); } size_type bytes_used() const { return capacity() * sizeof(value_type) + sizeof(data_); } - mapped_type & operator[](const key_type &key) { + mapped_type &operator[](const key_type &key) { KeyOrValueCompare comp; auto lower = lower_bound(key); if (lower == end() || comp(key, *lower)) @@ -68,7 +72,8 @@ else return lower->second; } - mapped_type & operator[](key_type &&key) { + + mapped_type &operator[](key_type &&key) { KeyOrValueCompare comp; auto lower = lower_bound(key); if (lower == end() || comp(key, *lower)) @@ -80,12 +85,15 @@ std::pair insert(value_type &&value) { return emplace(std::move(value)); } + std::pair insert(const value_type &value) { return emplace(value); } + iterator insert(const_iterator hint, value_type &&value) { return emplace_hint(hint, std::move(value)); } + iterator insert(const_iterator hint, const value_type &value) { return emplace_hint(hint, value); } @@ -99,7 +107,7 @@ } if (begin == end) return; - + // If we don't need to increase capacity, then we 
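Note on filter_iterator.hpp (whose diff concludes above): the rewrite touches whitespace and brace style only; the constructor still fast-forwards to the first element satisfying the predicate, which is what the "DRY" comment refers to. A sketch, assuming the template parameter list `<class iterator_type, class predicate_type>` implied by the typedefs (the list itself was lost in extraction) and an assumed include path:

    #include "common/adt/filter_iterator.hpp"  // include path is an assumption
    #include <iostream>
    #include <vector>

    int main() {
        std::vector<int> v = {1, 2, 3, 4, 5, 6};
        auto is_even = [](int x) { return x % 2 == 0; };
        using It = std::vector<int>::iterator;

        // The end iterator is built with begin == end, so no fast-forward happens.
        adt::filter_iterator<It, decltype(is_even)> it(v.begin(), v.end(), is_even);
        adt::filter_iterator<It, decltype(is_even)> end(v.end(), v.end(), is_even);
        for (; it != end; ++it)
            std::cout << *it << ' ';  // prints: 2 4 6
        std::cout << '\n';
        return 0;
    }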
can use a more efficient // insert method where everything is just put in the same vector // and then merge in place. @@ -108,7 +116,7 @@ for (size_t i = capacity(); i > size_before && begin != end; --i, ++begin) { data_.emplace_back(*begin); } - } catch(...) { + } catch (...) { // If emplace_back throws an exception, the easiest way to make sure // that our invariants are still in place is to resize to the state // we were in before @@ -139,15 +147,19 @@ // Insert the remaining elements that didn't fit by calling this function recursively. return insert(begin, end); } + void insert(std::initializer_list il) { insert(il.begin(), il.end()); } + iterator erase(iterator it) { return data_.erase(it); } + iterator erase(const_iterator it) { return erase(iterator_const_cast(it)); } + size_type erase(const key_type &key) { auto found = find(key); if (found == end()) @@ -155,33 +167,40 @@ erase(found); return 1; } + iterator erase(const_iterator first, const_iterator last) { return data_.erase(iterator_const_cast(first), iterator_const_cast(last)); } - void swap(flat_map & other) { + + void swap(flat_map &other) { data_.swap(other.data); } + void clear() { data_.clear(); } + template std::pair emplace(First &&first, Args &&... args) { KeyOrValueCompare comp; auto lower_bound = std::lower_bound(data_.begin(), data_.end(), first, comp); if (lower_bound == data_.end() || comp(first, *lower_bound)) - return { data_.emplace(lower_bound, std::forward(first), std::forward(args)...), true }; + return {data_.emplace(lower_bound, std::forward(first), std::forward(args)...), true}; else - return { lower_bound, false }; + return {lower_bound, false}; } + std::pair emplace() { return emplace(value_type()); } + template iterator emplace_hint(const_iterator hint, First &&first, Args &&... 
args) { KeyOrValueCompare comp; if (hint == cend() || comp(first, *hint)) { if (hint == cbegin() || comp(*(hint - 1), first)) - return data_.emplace(iterator_const_cast(hint), std::forward(first), std::forward(args)...); + return data_.emplace(iterator_const_cast(hint), std::forward(first), + std::forward(args)...); else return emplace(std::forward(first), std::forward(args)...).first; } else if (!comp(*hint, first)) { @@ -190,6 +209,7 @@ return emplace(std::forward(first), std::forward(args)...).first; } } + iterator emplace_hint(const_iterator hint) { return emplace_hint(hint, value_type()); } @@ -218,11 +238,11 @@ return std::lower_bound(begin(), end(), key, KeyOrValueCompare()); } template - const_iterator lower_bound(const T & key) const { + const_iterator lower_bound(const T &key) const { return std::lower_bound(begin(), end(), key, KeyOrValueCompare()); } template - iterator upper_bound(const T & key) { + iterator upper_bound(const T &key) { return std::upper_bound(begin(), end(), key, KeyOrValueCompare()); } template @@ -260,7 +280,7 @@ return !(*this < other); } - private: +private: container_type data_; iterator iterator_const_cast(const_iterator it) { @@ -301,7 +321,7 @@ // like std::binary_search, but returns the iterator to the element // if it was found, and returns end otherwise template - static It binary_find(It begin, It end, const T & value, const Compare & cmp) { + static It binary_find(It begin, It end, const T &value, const Compare &cmp) { auto lower_bound = std::lower_bound(begin, end, value, cmp); if (lower_bound == end || cmp(value, *lower_bound)) return end; @@ -311,10 +331,9 @@ }; template -void swap(flat_map & lhs, flat_map & rhs) { +void swap(flat_map &lhs, flat_map &rhs) { lhs.swap(rhs); } -} - +} //adt #endif diff -Nru spades-3.10.1+dfsg/src/common/adt/flat_set.hpp spades-3.11.1+dfsg/src/common/adt/flat_set.hpp --- spades-3.10.1+dfsg/src/common/adt/flat_set.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/flat_set.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -7,17 +7,16 @@ #include #include #include - namespace adt { -template, template class Container = std::vector > +template, template class Container = std::vector> struct flat_set { typedef T key_type; typedef T value_type; typedef Comp key_compare; typedef Comp value_compare; - typedef value_type& reference; - typedef const value_type& const_reference; + typedef value_type &reference; + typedef const value_type &const_reference; typedef Container container_type; typedef typename container_type::pointer pointer; typedef typename container_type::const_pointer const_pointer; @@ -29,10 +28,12 @@ typedef typename container_type::size_type size_type; flat_set() = default; + template flat_set(It begin, It end) { insert(begin, end); } + flat_set(std::initializer_list init) : flat_set(init.begin(), init.end()) { } @@ -57,8 +58,8 @@ void shrink_to_fit() { data_.shrink_to_fit(); } size_type bytes_used() const { return capacity() * sizeof(value_type) + sizeof(data_); } - std::pair insert(value_type && value) { return emplace(std::move(value)); } - std::pair insert(const value_type & value) { return emplace(value); } + std::pair insert(value_type &&value) { return emplace(std::move(value)); } + std::pair insert(const value_type &value) { return emplace(value); } iterator insert(const_iterator hint, value_type && value) { return emplace_hint(hint, std::move(value)); } iterator insert(const_iterator hint, const value_type & value) { return emplace_hint(hint, value); } void 
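Note on flat_map.hpp (whose diff concludes above): again only brace/reference placement and vertical spacing change. The container is a sorted `std::vector` of key/value pairs with binary-search lookup, which is why the in-place bulk `insert` above is worth its complexity. A usage sketch, assuming the template parameters run `<K, V, Comp, Allocator>` with sensible defaults and an assumed include path:

    #include "common/adt/flat_map.hpp"  // include path is an assumption
    #include <iostream>
    #include <string>

    int main() {
        adt::flat_map<std::string, int> m;
        m["scaffolds"] = 10;                  // operator[] inserts at the lower bound
        m["contigs"] = 25;
        m.insert({std::string("paths"), 3});

        for (const auto &kv : m)              // iteration is in key order
            std::cout << kv.first << " = " << kv.second << '\n';
        return 0;
    }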
insert(std::initializer_list il) { insert(il.begin(), il.end()); } @@ -80,7 +81,7 @@ for (size_t i = capacity(); i > size_before && begin != end; --i, ++begin) { data_.emplace_back(*begin); } - } catch(...) { + } catch (...) { // If emplace_back throws an exception, the easiest way to make sure // that our invariants are still in place is to resize to the state // we were in before @@ -110,6 +111,7 @@ // this will recurse log(n) times where n is std::distance(begin, end) return insert(begin, end); } + iterator erase(iterator it) { return data_.erase(it); } iterator erase(const_iterator it) { return erase(iterator_const_cast(it)); } size_type erase(const value_type &val) { @@ -122,25 +124,30 @@ return data_.erase(iterator_const_cast(first), iterator_const_cast(last)); } - void swap(flat_set & other) { data_.swap(other.data); } + void swap(flat_set &other) { data_.swap(other.data); } + void clear() { data_.clear(); } template - std::pair emplace(First && first, Args &&... args) { + std::pair emplace(First &&first, Args &&... args) { Comp comp; auto lower_bound = std::lower_bound(data_.begin(), data_.end(), first, comp); if (lower_bound == data_.end() || comp(first, *lower_bound)) - return { data_.emplace(lower_bound, std::forward(first), std::forward(args)...), true }; + return {data_.emplace(lower_bound, std::forward(first), std::forward(args)...), + true}; else - return { lower_bound, false }; + return {lower_bound, false}; } + std::pair emplace() { return emplace(value_type()); } + template - iterator emplace_hint(const_iterator hint, First && first, Args &&... args) { + iterator emplace_hint(const_iterator hint, First &&first, Args &&... args) { Comp comp; if (hint == cend() || comp(first, *hint)) { if (hint == cbegin() || comp(*(hint - 1), first)) - return data_.emplace(iterator_const_cast(hint), std::forward(first), std::forward(args)...); + return data_.emplace(iterator_const_cast(hint), std::forward(first), + std::forward(args)...); else return emplace(std::forward(first), std::forward(args)...).first; } else if (!comp(*hint, first)) { @@ -201,7 +208,7 @@ return !(*this < other); } - private: +private: container_type data_; iterator iterator_const_cast(const_iterator it) { @@ -221,10 +228,9 @@ }; template class Container> -void swap(flat_set & lhs, flat_set & rhs) { +void swap(flat_set &lhs, flat_set &rhs) { lhs.swap(rhs); } -} - +} //adt #endif // __ADT_FLAT_SET_HPP__ diff -Nru spades-3.10.1+dfsg/src/common/adt/hll.hpp spades-3.11.1+dfsg/src/common/adt/hll.hpp --- spades-3.10.1+dfsg/src/common/adt/hll.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/hll.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -4,21 +4,21 @@ #include #include #include - namespace hll { - template - class hll { + +template +class hll { static constexpr uint64_t m_ = 1ull << precision; static constexpr uint64_t mask_ = (m_ - 1) << (64 - precision); - + constexpr double alpha(unsigned p) const { // constexpr switches are C++14 only :( return (p > 6 ? 0.7213 / (1.0 + 1.079 / double(1ull << p)) : p == 6 ? 0.709 : p == 5 ? 0.697 : 0.673); } - - public: + +public: /// The hash digest type. typedef uint64_t digest; @@ -26,8 +26,7 @@ typedef std::function hasher; hll(hasher h) - : hasher_(std::move(h)), data_(1ull << precision, 0) - { } + : hasher_(std::move(h)), data_(1ull << precision, 0) { } /// @tparam T The type of the element to insert. 
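Note on flat_set.hpp (whose diff concludes above): same sorted-vector design as flat_map, minus the mapped values. A sketch of the bulk-insert path touched by the hunks above (include path assumed):

    #include "common/adt/flat_set.hpp"  // include path is an assumption
    #include <iostream>

    int main() {
        adt::flat_set<int> s = {5, 1, 3};
        s.insert({2, 4});  // bulk insert: append, then merge in place when capacity allows
        for (int x : s)
            std::cout << x << ' ';  // prints: 1 2 3 4 5
        std::cout << '\n' << (s.find(4) != s.end()) << '\n';  // 1
        return 0;
    }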
@@ -46,24 +45,24 @@
for (size_t i = 0; i < data_.size(); ++i)
data_[i] = std::max(data_[i], other.data_[i]);
}
-
+
std::pair<double, bool> cardinality() const {
// FIXME: Precision loss?
// FIXME: Bias correction!
double res = alpha(precision) * m_ * m_;
double E = std::accumulate(data_.begin(), data_.end(),
- 0.0, [](double a, uint8_t b) { return a + exp2(-(double)b); });
+ 0.0, [](double a, uint8_t b) { return a + exp2(-(double) b); });
res /= E;
- return { res, res > 5.0 * m_/2 };
+ return {res, res > 5.0 * m_ / 2};
}
void clear() {
- std::fill(data_.begin(), data_.end(), 0);
+ std::fill(data_.begin(), data_.end(), 0);
}
-
- private:
+
+private:
hasher hasher_;
std::vector<uint8_t> data_;
- };
+};
-} //namespace hll
+} // hll
\ No newline at end of file
diff -Nru spades-3.10.1+dfsg/src/common/adt/iterator_range.hpp spades-3.11.1+dfsg/src/common/adt/iterator_range.hpp
--- spades-3.10.1+dfsg/src/common/adt/iterator_range.hpp 2017-02-28 14:55:35.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/adt/iterator_range.hpp 2017-09-28 09:05:08.000000000 +0000
@@ -9,7 +9,6 @@
#include
#include
-
namespace adt {
@@ -26,9 +25,10 @@
: begin_iterator(std::move(begin_iterator)), end_iterator(std::move(end_iterator)) { }
- IteratorT begin() const { return begin_iterator; }
-
- IteratorT end() const { return end_iterator; }
+ const IteratorT& begin() const { return begin_iterator; }
+ const IteratorT& end() const { return end_iterator; }
+ IteratorT& begin() { return begin_iterator; }
+ IteratorT& end() { return end_iterator; }
};
template
@@ -45,6 +45,7 @@
iterator_range<decltype(begin(std::declval<T>()))> drop_begin(T &&t, int n) {
return make_range(std::next(begin(t), n), end(t));
}
-}
+
+} //adt
#endif
diff -Nru spades-3.10.1+dfsg/src/common/adt/kmer_hash_vector.hpp spades-3.11.1+dfsg/src/common/adt/kmer_hash_vector.hpp
--- spades-3.10.1+dfsg/src/common/adt/kmer_hash_vector.hpp 2017-02-28 14:55:35.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/adt/kmer_hash_vector.hpp 1970-01-01 00:00:00.000000000 +0000
@@ -1,370 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
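Note on iterator_range.hpp (whose diff concludes above): `begin()`/`end()` now return references, and non-const overloads were added, so a range's bounds can be adjusted in place instead of rebuilding the range. A sketch (include path assumed):

    #include "common/adt/iterator_range.hpp"  // include path is an assumption
    #include <iostream>
    #include <numeric>
    #include <vector>

    int main() {
        std::vector<int> v = {1, 2, 3, 4, 5};
        auto r = adt::make_range(v.begin(), v.end());
        ++r.begin();  // legal since 3.11: begin() yields a mutable reference
        std::cout << std::accumulate(r.begin(), r.end(), 0) << '\n';  // 14

        auto tail = adt::drop_begin(v, 3);   // range over {4, 5}
        std::cout << *tail.begin() << '\n';  // 4
        return 0;
    }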
-//*************************************************************************** - -/* - * kmer_hash_vector.hpp - * - * Created on: Jul 19, 2012 - * Author: alex - */ - -#ifndef KMER_HASH_VECTOR_HPP_ -#define KMER_HASH_VECTOR_HPP_ - - -#include "sequence/runtime_k.hpp" -#include "kmer_map.hpp" - - -namespace runtime_k { - -class IKmerHashVector { - -protected: - static const size_t LOAD_OVERHEAD = 1000; - - size_t nthreads_; - - size_t cell_size_; - -public: - typedef RtSeq input_value_type; - - IKmerHashVector(size_t nthreads) - : nthreads_ (nthreads) - , cell_size_ (LOAD_OVERHEAD) { - } - - virtual ~IKmerHashVector() { - - } - - virtual IKmerHashVector * copy() const = 0; - - virtual void clear() = 0; - - virtual void clear(size_t i) = 0; - - virtual bool is_full() const = 0; - - virtual bool is_presisely_full() const = 0; - - virtual size_t capacity(size_t i) const = 0; - - virtual size_t size(size_t i) const = 0; - - - virtual void insert(const input_value_type& value) = 0; - - virtual void reserve(size_t cell_size) = 0; - - - virtual size_t get_k() const = 0; - - size_t get_threads_num() const - { - return nthreads_; - } - - virtual void dump (KmerMap& destination, size_t bucketNum) = 0; -}; - - - -class KmerHashVector { - -public: - - typedef IKmerHashVector base_vector_type; - -private: - - base_vector_type * data_; - -public: - - typedef KmerHashVector vector_type; - - typedef base_vector_type::input_value_type input_value_type; - - - KmerHashVector(size_t k, size_t nthreads); - - KmerHashVector(base_vector_type * vec): data_(vec) { - } - - KmerHashVector(const vector_type& vec) { - data_ = vec.data_->copy(); - } - - vector_type& operator=(const vector_type& vec) { - if (vec.data_ != data_) { - delete data_; - data_ = vec.data_->copy(); - } - - return *this; - } - - ~KmerHashVector() { - delete data_; - } - - - - bool is_full() const { - return data_->is_full(); - } - - bool is_presisely_full() const { - return data_->is_presisely_full(); - } - - size_t get_threads_num() const - { - return data_->get_threads_num(); - } - - - void insert(const input_value_type& value) { - data_->insert(value); - } - - void clear() { - data_->clear(); - } - - - void clear(size_t i) { - data_->clear(i); - } - - size_t get_k() const { - return data_->get_k(); - } - - size_t capacity(size_t i) const { - return data_->capacity(i); - } - - void reserve(size_t cell_size) { - data_->reserve(cell_size); - } - - base_vector_type * get_data() const { - return data_; - } - - void print_sizes() { - for (size_t i = 0; i < data_->get_threads_num(); ++i) { - INFO("Size " << i << ": " << data_->size(i)); - } - } - - void dump (KmerMap& destination, size_t bucketNum) { - data_->dump(destination, bucketNum); - } -}; - - -// ================================= VECTOR IMPLEMENTATION ================================= - -template -class KmerHashVectorImpl: public IKmerHashVector { - -public: - - typedef TypeContainerImpl type_container; - - typedef typename type_container::Kmer Kmer; - - typedef typename type_container::vector_type vector_type; - - typedef std::vector data_type; - - typedef IKmerHashVector base_type; - - typedef typename base_type::input_value_type input_value_type; - -private: - - data_type data_; - - size_t k_; - -public: - - KmerHashVectorImpl(size_t k, size_t nthreads): - IKmerHashVector(nthreads) - , data_ (nthreads) - , k_ (k) { - } - - virtual base_type * copy() const { - return new KmerHashVectorImpl(*this); - } - - virtual bool is_full() const { - return data_[0].size() >= cell_size_; - } - - virtual 
bool is_presisely_full() const { - for (size_t i = 0; i < nthreads_; ++i) { - if (data_[i].size() >= cell_size_) - return true; - } - return false; - } - - virtual void insert(const input_value_type& value) { - Kmer kmer = type_container::from_sequence(value); - data_[kmer.GetHash() % nthreads_].push_back(kmer); - } - - virtual void clear() { - for (size_t i = 0; i < nthreads_; ++i) { - data_[i].clear(); - } - } - - virtual void clear(size_t i) { - data_[i].clear(); - } - - virtual size_t get_k() const { - return k_; - } - - virtual size_t capacity(size_t i) const { - return data_[i].capacity(); - } - - virtual size_t size(size_t i) const { - return data_[i].size(); - } - - virtual void reserve(size_t cell_size) { - cell_size_ = cell_size; - for (size_t i = 0; i < nthreads_; ++i) { - data_[i].reserve(cell_size_ + LOAD_OVERHEAD); - } - } - - const data_type& get_data() const { - return data_; - } - - virtual void dump (KmerMap& destination, size_t bucketNum) { - KmerMapImpl& destImpl = dynamic_cast&>(destination.get_data()); - - for (auto it = data_[bucketNum].begin(), end = data_[bucketNum].end(); it != end; ++it) { - ++destImpl[*it]; - } - } -}; - - -// ================================= VECTOR FACTORIES ================================= -// Single factory interface -class SingleKmerHashVectorFactory { - -public: - - virtual IKmerHashVector * GetHashVector(size_t k, size_t nthreads) const = 0; - - virtual ~SingleKmerHashVectorFactory() { - - } -}; - - -// Single factory for specific k and value -template -class SingleKmerHashVectorFactoryImpl: public SingleKmerHashVectorFactory { - -public: - - virtual IKmerHashVector * GetHashVector(size_t k, size_t nthreads) const { - VERIFY_MSG(GET_UPPER_BOUND(k) == GET_K_BY_TS(ts_), k << " -> " << GET_UPPER_BOUND(k) << ", " << ts_ << " -> " << GET_K_BY_TS(ts_)); - //INFO(k << " -> " << GET_UPPER_BOUND(k) << ", " << ts_ << " -> " << GET_K_BY_TS(ts_)); - - return new KmerHashVectorImpl< GET_K_BY_TS(ts_) >(k, nthreads); - } - -}; - -//Factory genetator -template -class HashVectorGenerator { - -public: - - static void GenerateHashVectors(std::vector< SingleKmerHashVectorFactory* > & factories) { - factories[ts_] = new SingleKmerHashVectorFactoryImpl(); - HashVectorGenerator :: GenerateHashVectors (factories); - } -}; - -//Terminating factory generator -template<> -class HashVectorGenerator { - -public: - - static void GenerateHashVectors(std::vector< SingleKmerHashVectorFactory* > & factories) { - factories[MIN_TS] = new SingleKmerHashVectorFactoryImpl; - } -}; - - -//Lazy singleton for factory for every required value -class KmerHashVectorFactory { - -private: - - std::vector < SingleKmerHashVectorFactory* > single_factories_; - - KmerHashVectorFactory() { - VERIFY_MSG(MIN_K <= MAX_K, "Invalid K value range"); - - single_factories_ = std::vector < SingleKmerHashVectorFactory* >(MAX_TS + 1); - HashVectorGenerator::GenerateHashVectors(single_factories_); - } - -public: - - static KmerHashVectorFactory& GetInstance() { - static KmerHashVectorFactory instance; - - return instance; - } - - KmerHashVector GetHashVector(size_t k, size_t nthreads) { - VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be >= " + - ToString(MIN_K) + " and <= " + ToString(MAX_K)); - - return KmerHashVector(single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetHashVector(k, nthreads)); - } - - IKmerHashVector * GetRawHashVector(size_t k, size_t nthreads) { - VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be 
>= " + - ToString(MIN_K) + " and <= " + ToString(MAX_K)); - - return single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetHashVector(k, nthreads); - } -}; - -KmerHashVector GetHashVector(size_t k, size_t nthreads) { - return KmerHashVectorFactory::GetInstance().GetHashVector(k, nthreads); -} - -KmerHashVector::KmerHashVector(size_t k, size_t nthreads): data_(KmerHashVectorFactory::GetInstance().GetRawHashVector(k, nthreads)) { -} - -} //namespace runtime_k - -#endif /* KMER_HASH_VECTOR_HPP_ */ diff -Nru spades-3.10.1+dfsg/src/common/adt/kmer_vector.hpp spades-3.11.1+dfsg/src/common/adt/kmer_vector.hpp --- spades-3.10.1+dfsg/src/common/adt/kmer_vector.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/kmer_vector.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -17,6 +17,8 @@ #endif +namespace adt { + template class KMerVector { private: @@ -37,13 +39,13 @@ je_free(storage_); storage_ = res; #else - // No JEMalloc, no cookies - ElTy *res = new ElTy[capacity_ * el_sz_]; - if (storage_) - std::memcpy(res, storage_, size_ * sizeof(ElTy) * el_sz_); + // No JEMalloc, no cookies + ElTy *res = new ElTy[capacity_ * el_sz_]; + if (storage_) + std::memcpy(res, storage_, size_ * sizeof(ElTy) * el_sz_); - delete[] storage_; - storage_ = res; + delete[] storage_; + storage_ = res; #endif return storage_; @@ -59,12 +61,14 @@ typedef array_equal_to equal_to; explicit KMerVector(unsigned K, size_t capacity = 1) - : K_(K), size_(0), capacity_(std::max(capacity, (size_t) 1)), el_sz_(Seq::GetDataSize(K)), storage_(NULL), + : K_(K), size_(0), capacity_(std::max(capacity, (size_t) 1)), el_sz_(Seq::GetDataSize(K)), + storage_(NULL), vector_(realloc(), size_, el_sz_) { } KMerVector(KMerVector &&that) - : K_(that.K_), size_(that.size_), capacity_(that.capacity_), el_sz_(that.el_sz_), storage_(that.storage_), + : K_(that.K_), size_(that.size_), capacity_(that.capacity_), el_sz_(that.el_sz_), + storage_(that.storage_), vector_(storage_, size_, el_sz_) { that.storage_ = NULL; } @@ -188,5 +192,5 @@ array_vector vector_; }; - +} //adt #endif /* __KMER_VECTOR_HPP */ diff -Nru spades-3.10.1+dfsg/src/common/adt/loser_tree.hpp spades-3.11.1+dfsg/src/common/adt/loser_tree.hpp --- spades-3.10.1+dfsg/src/common/adt/loser_tree.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/loser_tree.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -8,7 +8,10 @@ template IntegerType ilog2(IntegerType x) { IntegerType lg = 0; - while (x >= 256) { x >>= 8; lg += 8; } + while (x >= 256) { + x >>= 8; + lg += 8; + } while (x >>= 1) lg += 1; return lg; @@ -20,9 +23,9 @@ } template -class loser_tree { +class loser_tree { typedef typename std::iterator_traits::value_type value_type; - + size_t log_k_; size_t k_; std::vector entry_; @@ -34,10 +37,10 @@ return true; if (a.end() == a.begin()) return false; - + return inner_cmp_(*a.begin(), *b.begin()); } - + size_t init_winner(size_t root) { if (root >= k_) return root - k_; @@ -53,7 +56,7 @@ } } - public: +public: loser_tree(const std::vector> &runs, Cmp inner_cmp = Cmp()) : inner_cmp_(inner_cmp), runs_(runs) { @@ -61,11 +64,11 @@ k_ = (size_t(1) << log_k_); // fprintf(stderr, "k: %zu, logK: %zu, nruns: %zu\n", k_, log_k_, runs.size()); - + entry_.resize(2 * k_); for (size_t i = 0; i < k_; ++i) entry_[k_ + i] = i; - + // Insert sentinels for (size_t i = runs.size(); i < k_; ++i) runs_.emplace_back(adt::make_range(runs_[0].end(), runs_[0].end())); @@ -96,7 +99,7 @@ const auto &winner = runs_[winner_index]; return (winner.begin() == winner.end()); } - + template 
size_t multi_merge(It2 out, size_t amount = -1ULL) { @@ -112,7 +115,7 @@ winner_index = replay(winner_index); } - + entry_[0] = winner_index; return cnt; @@ -125,10 +128,10 @@ return res; } - - private: + +private: std::vector> runs_; }; -} \ No newline at end of file +} //adt \ No newline at end of file diff -Nru spades-3.10.1+dfsg/src/common/adt/parallel_seq_vector.hpp spades-3.11.1+dfsg/src/common/adt/parallel_seq_vector.hpp --- spades-3.10.1+dfsg/src/common/adt/parallel_seq_vector.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/parallel_seq_vector.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,110 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#pragma once - -#include "parallel_unordered_map.hpp" -#include "utils/openmp_wrapper.h" - -#include "sequence/runtime_k.hpp" -#include "kmer_map.hpp" -#include "kmer_hash_vector.hpp" - -class ParallelSeqVector { - -public: - typedef runtime_k::KmerHashVector par_container_t; - - typedef runtime_k::KmerMap destination_container_t; - - typedef RtSeq Kmer; - -private: - - size_t k_; - - size_t nthreads_; - - std::vector nodes_; - -public: - - ParallelSeqVector(size_t k, size_t nthreads, size_t cell_size) : - k_(k), - nthreads_(nthreads), - nodes_() - - { - for (size_t i = 0; i < nthreads_; ++i) { - nodes_.push_back(runtime_k::GetHashVector(k_, nthreads_)); - } - - for (size_t i = 0; i < nthreads_; ++i) { - nodes_[i].reserve(cell_size); - } - } - - - void AddEdge(const Kmer &kmer, size_t thread_number) { - nodes_[thread_number].insert(kmer); - } - - void CountSequence(const Sequence& s, size_t thread_number) { - if (s.size() < k_) - return; - - Kmer kmer = s.start(k_); - - AddEdge(kmer, thread_number); - for (size_t j = k_; j < s.size(); ++j) { - kmer <<= s[j]; - AddEdge(kmer, thread_number); - } - - } -// -// void MergeMaps(destination_container_t & dest_container, size_t i) { -// for (size_t j = 0; j < nthreads_; ++j) { -// dest_container.transfer(nodes_[j], i); -// } -// } - - void Dump(destination_container_t & bucket, size_t bucket_number) { - for (size_t i = 0; i < nodes_.size(); ++i) { - nodes_[i].dump(bucket, bucket_number); - nodes_[i].clear(bucket_number); - } - } - - - size_t SingleBucketCount() const { - return nodes_[0].capacity(0); - } - - bool IsFull(size_t i) const { - return nodes_[i].is_full(); - } - - void Clear(size_t i) { - nodes_[i].clear(); - } - - void Clear() { - for (size_t i = 0; i < nthreads_; ++i) { - nodes_[i].clear(); - } - } - - void print_sizes() { - for (size_t i = 0; i < nodes_.size(); ++i) { - INFO("Size " << i << "::: "); - nodes_[i].print_sizes(); - } - } - - -}; diff -Nru spades-3.10.1+dfsg/src/common/adt/parallel_unordered_map.hpp spades-3.11.1+dfsg/src/common/adt/parallel_unordered_map.hpp --- spades-3.10.1+dfsg/src/common/adt/parallel_unordered_map.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/parallel_unordered_map.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,137 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
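(For orientation on the adt::loser_tree changes above: ilog2 computes floor(log2 x) by first stepping over whole bytes and then over single bits, and the tree constructor rounds the number of sorted runs up to a power of two, padding the tail with empty sentinel ranges. Below is a minimal self-contained C++ sketch of that sizing logic, not taken from the patch; the ilog2ceil helper is hypothetical, standing in for the header's elided ceiling variant.)

#include <cassert>
#include <cstddef>

// Mirrors adt::ilog2 from loser_tree.hpp above.
template <class IntegerType>
IntegerType ilog2(IntegerType x) {
    IntegerType lg = 0;
    while (x >= 256) {  // consume whole bytes first
        x >>= 8;
        lg += 8;
    }
    while (x >>= 1)     // then single bits
        lg += 1;
    return lg;          // floor(log2(x)) for x > 0
}

// Hypothetical helper for this sketch: ceil(log2(x)) for x > 1.
template <class IntegerType>
IntegerType ilog2ceil(IntegerType x) {
    return ilog2(x - 1) + 1;
}

int main() {
    assert(ilog2(size_t(255)) == 7);
    assert(ilog2(size_t(256)) == 8);
    // loser_tree sizes itself as k_ = 1 << log_k_ and appends empty
    // sentinel runs: with 5 sorted runs, k_ = 8, i.e. 3 sentinels.
    size_t nruns = 5;
    size_t k = size_t(1) << ilog2ceil(nruns);
    assert(k == 8 && k - nruns == 3);
    return 0;
}

The byte-then-bit scan keeps the loop count low for wide integers without relying on compiler intrinsics, which is presumably why the reindented version above preserves it unchanged.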
-//*************************************************************************** - -#pragma once - -#include - -template -struct parallel_unordered_set -{ -private: - - typedef std::unordered_set origin_container_t; - - typedef std::vector container_arr_t; - - typedef typename origin_container_t::value_type value_type; - - public: - parallel_unordered_set(size_t nthreads, size_t cell_size = 100000) - : nthreads_ (nthreads) - , buckets_ (nthreads, origin_container_t(cell_size)) { - - } - - void insert(const value_type& value, size_t bucket_num) - { - buckets_[bucket_num].insert(value); - } - - const origin_container_t & operator[](size_t i) const - { - return buckets_[i]; - } - - size_t get_threads_num() const - { - return nthreads_; - } - - const container_arr_t & get_buckets() const - { - return buckets_; - } - - void clear() { - for (size_t i = 0; i < nthreads_; ++i) { - buckets_[i].clear(); - } - } - - private: - parallel_unordered_set& operator=(const parallel_unordered_set&); - - private: - size_t nthreads_; - container_arr_t buckets_; -}; - - - -template -struct parallel_vector -{ - private: - static const size_t LOAD_OVERHEAD = 1000; - - typedef std::vector origin_container_t; - typedef std::vector container_arr_t; - typedef typename origin_container_t::value_type value_type; - - public: - parallel_vector(size_t nthreads, size_t cell_size = 100000) - : nthreads_ (nthreads) - , cell_size_ (cell_size) - , buckets_ (nthreads) { - - for (size_t i = 0; i < nthreads_; ++i) { - buckets_[i].reserve(cell_size + LOAD_OVERHEAD); - } - } - - void insert(const value_type& value, size_t bucket_num) - { - buckets_[bucket_num].push_back(value); - } - - const origin_container_t & operator[](size_t i) const - { - return buckets_[i]; - } - - origin_container_t & operator[](size_t i) { - return buckets_[i]; - } - - size_t get_threads_num() const - { - return nthreads_; - } - - const container_arr_t & get_buckets() const - { - return buckets_; - } - - bool is_full() const { - return buckets_[0].size() >= cell_size_; - } - - bool is_presisely_full() const { - for (size_t i = 0; i < nthreads_; ++i) { - if (buckets_[i].size() >= cell_size_) - return true; - } - return false; - } - - void clear() { - for (size_t i = 0; i < nthreads_; ++i) { - buckets_[i].clear(); - } - } - - - private: - parallel_vector& operator=(const parallel_vector&); - - private: - size_t nthreads_; - size_t cell_size_; - container_arr_t buckets_; -}; diff -Nru spades-3.10.1+dfsg/src/common/adt/pointer_iterator.hpp spades-3.11.1+dfsg/src/common/adt/pointer_iterator.hpp --- spades-3.10.1+dfsg/src/common/adt/pointer_iterator.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/pointer_iterator.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -14,6 +14,8 @@ #include #include +namespace adt { + template class pointer_iterator : public std::iterator { protected: @@ -171,4 +173,6 @@ return (r1.data_ - r2.data_); } +} //adt + #endif // __HAMMER_POINTER_ITERATOR_HPP__ diff -Nru spades-3.10.1+dfsg/src/common/adt/queue_iterator.hpp spades-3.11.1+dfsg/src/common/adt/queue_iterator.hpp --- spades-3.10.1+dfsg/src/common/adt/queue_iterator.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/queue_iterator.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -11,6 +11,10 @@ #include "utils/verify.hpp" #include + +namespace adt { + + template class erasable_priority_queue { private: @@ -20,13 +24,13 @@ * Be careful! 
This constructor requires Comparator to have default constructor even if you call it with * specified comparator. In this case just create default constructor with VERIFY(false) inside it. */ - erasable_priority_queue(const Comparator& comparator = Comparator()) : + erasable_priority_queue(const Comparator &comparator = Comparator()) : storage_(comparator) { } template erasable_priority_queue(InputIterator begin, InputIterator end, - const Comparator& comparator = Comparator()) : + const Comparator &comparator = Comparator()) : storage_(begin, end, comparator) { } @@ -40,11 +44,11 @@ return *(storage_.begin()); } - void push(const T& key) { + void push(const T &key) { storage_.insert(key); } - bool erase(const T& key) { + bool erase(const T &key) { bool res = storage_.erase(key) > 0; return res; } @@ -78,7 +82,7 @@ public: - DynamicQueueIterator(const Comparator& comparator = Comparator()) : + DynamicQueueIterator(const Comparator &comparator = Comparator()) : current_actual_(false), current_deleted_(false), queue_(comparator) { } @@ -87,11 +91,11 @@ queue_.insert(begin, end); } - void push(const T& to_add) { + void push(const T &to_add) { queue_.push(to_add); } - void erase(const T& to_remove) { + void erase(const T &to_remove) { if (current_actual_ && to_remove == current_) { current_deleted_ = true; } @@ -114,7 +118,7 @@ const T& operator*() { VERIFY(!queue_.empty()); - if(!current_actual_ || current_deleted_) { + if (!current_actual_ || current_deleted_) { current_ = queue_.top(); current_actual_ = true; current_deleted_ = false; @@ -137,7 +141,7 @@ } }; - +} //adt #endif /* QUEUE_ITERATOR_HPP_ */ diff -Nru spades-3.10.1+dfsg/src/common/adt/small_pod_vector.hpp spades-3.11.1+dfsg/src/common/adt/small_pod_vector.hpp --- spades-3.10.1+dfsg/src/common/adt/small_pod_vector.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/adt/small_pod_vector.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -62,7 +62,8 @@ #define IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable::value #endif - static_assert(IS_TRIVIALLY_COPYABLE(value_type), "Value type for SmallPODVector should be trivially copyable"); + static_assert(IS_TRIVIALLY_COPYABLE(value_type), + "Value type for SmallPODVector should be trivially copyable"); #undef IS_TRIVIALLY_COPYABLE @@ -394,6 +395,5 @@ #undef LIKELY #undef UNLIKELY -} - +} //adt #endif // __ADT_SMALL_POD_VECTOR__ diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/components/connected_component.cpp spades-3.11.1+dfsg/src/common/assembly_graph/components/connected_component.cpp --- spades-3.10.1+dfsg/src/common/assembly_graph/components/connected_component.cpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/components/connected_component.cpp 2017-09-28 09:05:08.000000000 +0000 @@ -65,11 +65,11 @@ return; } -size_t ConnectedComponentCounter::GetComponent(EdgeId & e) const { +size_t ConnectedComponentCounter::GetComponent(EdgeId e) const { if (component_ids_.size() == 0) { CalculateComponents(); } - return component_ids_[e]; + return component_ids_.at(e); } diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/components/connected_component.hpp spades-3.11.1+dfsg/src/common/assembly_graph/components/connected_component.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/components/connected_component.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/components/connected_component.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -17,7 +17,7 @@ ConnectedComponentCounter(const Graph 
&g):g_(g) {} void CalculateComponents() const; // size_t GetComponent(path_extend::BidirectionalPath * p) const; - size_t GetComponent(EdgeId & e) const; + size_t GetComponent(EdgeId e) const; bool IsFilled() const { return (component_ids_.size() != 0); } diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/components/splitters.hpp spades-3.11.1+dfsg/src/common/assembly_graph/components/splitters.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/components/splitters.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/components/splitters.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -277,8 +277,14 @@ const double coverage_bound_; const size_t edge_limit_; mutable size_t edge_summary_length_; + const size_t edge_summary_length_limit_; void Find(EdgeId edge, std::set &result) const { + + if (edge_summary_length_ > edge_summary_length_limit_) { + return; + } + if (result.size() > edge_limit_) { return; } @@ -307,12 +313,13 @@ } public: - CoverageBoundedDFS(const Graph &graph, double coverage_bound, - size_t edge_limit = 10000) + CoverageBoundedDFS(const Graph &graph, double coverage_bound, size_t edge_summary_limit, + size_t edge_limit = 500) : graph_(graph), coverage_bound_(coverage_bound), edge_limit_(edge_limit), - edge_summary_length_(0) { + edge_summary_length_(0), + edge_summary_length_limit_(edge_summary_limit) { } std::set Find(VertexId v) const { @@ -337,11 +344,15 @@ CoverageBoundedDFS dfs_helper; public: - HighCoverageComponentFinder(const Graph &graph, double max_coverage) - : AbstractNeighbourhoodFinder(graph), coverage_bound_(max_coverage), dfs_helper(graph, max_coverage) { + HighCoverageComponentFinder(const Graph &graph, + double max_coverage, + size_t edge_sum_limit = std::numeric_limits::max()) + : AbstractNeighbourhoodFinder(graph), + coverage_bound_(max_coverage), + dfs_helper(graph, max_coverage, edge_sum_limit) { } - GraphComponent Find(typename Graph::VertexId v) const { + GraphComponent Find(VertexId v) const { std::set result = dfs_helper.Find(v); return GraphComponent::FromEdges(this->graph(), result, false); } @@ -388,8 +399,8 @@ set FindBorder(const GraphComponent& component) const { set result; - insert_all(result, component.entrances()); - insert_all(result, component.exits()); + utils::insert_all(result, component.entrances()); + utils::insert_all(result, component.exits()); return result; } @@ -784,7 +795,7 @@ size_t edge_length_bound = ReliableNeighbourhoodFinder::DEFAULT_EDGE_LENGTH_BOUND, size_t max_size = ReliableNeighbourhoodFinder::DEFAULT_MAX_SIZE) { typedef typename Graph::VertexId VertexId; - shared_ptr> inner_iterator = make_shared>>(graph.begin(), graph.end()); + shared_ptr> inner_iterator = make_shared>(graph.begin(), graph.end()); shared_ptr> nf = make_shared>(graph, edge_length_bound, max_size); return make_shared>(graph, inner_iterator, nf); @@ -795,7 +806,7 @@ size_t edge_length_bound = 1000000, size_t max_size = 1000000) { typedef typename Graph::VertexId VertexId; - shared_ptr> inner_iterator = make_shared>>(graph.begin(), graph.end()); + shared_ptr> inner_iterator = make_shared>(graph.begin(), graph.end()); shared_ptr> nf = make_shared>(graph, edge_length_bound, max_size); return make_shared>(graph, inner_iterator, nf); @@ -822,7 +833,7 @@ ReliableNeighbourhoodFinder::DEFAULT_EDGE_LENGTH_BOUND) { typedef typename Graph::VertexId VertexId; shared_ptr> inner_iterator = make_shared< - CollectionIterator>>(graph.begin(), graph.end()); + CollectionIterator>(graph.begin(), graph.end()); shared_ptr> nf = 
make_shared< ShortEdgeComponentFinder>(graph, bound); return make_shared>(graph, diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp spades-3.11.1+dfsg/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -0,0 +1,545 @@ +#pragma once +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "assembly_graph/core/graph.hpp" +#include "assembly_graph/core/construction_helper.hpp" +#include "utils/standard_base.hpp" +#include "utils/extension_index/kmer_extension_index.hpp" +#include "utils/parallel/openmp_wrapper.h" +#include "utils/parallel/parallel_wrapper.hpp" +#include + +namespace debruijn_graph { + +/* + * Constructs DeBruijnGraph from DeBruijn Graph using "new DeBruijnGraphConstructor(DeBruijn).ConstructGraph(DeBruijnGraph, Index)" + */ +template +class DeBruijnGraphConstructor { +private: + typedef typename Graph::EdgeId EdgeId; + typedef Index DeBruijn; + typedef typename Graph::VertexId VertexId; + typedef typename Index::KMer Kmer; + typedef typename DeBruijn::KeyWithHash KeyWithHash; + typedef typename DeBruijn::kmer_iterator kmer_iterator; + + Graph &graph_; + DeBruijn &origin_; + size_t kmer_size_; + + bool StepRightIfPossible(KeyWithHash &kwh) { + // VERIFY(origin_.contains(edge)); + if (origin_.RivalEdgeCount(kwh) == 1 + && origin_.NextEdgeCount(kwh) == 1) { + kwh = origin_.NextEdge(kwh); + // VERIFY(origin_.contains(next_edge)); + return true; + } + return false; + } + + KeyWithHash &GoRight(KeyWithHash &kwh) { + KeyWithHash initial = kwh; + while (StepRightIfPossible(kwh) && kwh != initial) { + ; + } + return kwh; + } + + KeyWithHash &GoLeft(KeyWithHash &kwh) { + //These strange things are in order to avoid making copies of kwh + kwh = !kwh; + kwh = !GoRight(kwh); + return kwh; + } + + Sequence ConstructSeqGoingRight(KeyWithHash &kwh) { + SequenceBuilder s; + s.append(kwh.key()); + KeyWithHash initial = kwh; + while (StepRightIfPossible(kwh) && kwh != initial) { + s.append(kwh[kmer_size_]); + } + return s.BuildSequence(); + } + + Sequence ConstructSequenceWithEdge(const KeyWithHash &kwh) { + KeyWithHash tmp = kwh; + return ConstructSeqGoingRight(GoLeft(tmp)); + } + + VertexId FindVertexByOutgoingEdges(Kmer kmer) { + for (char c = 0; c < 4; ++c) { + KeyWithHash edge = origin_.ConstructKWH(kmer.pushBack(c)); + if (origin_.contains(edge)) + return graph_.EdgeStart(origin_.get_value(edge).edge_id); + } + return VertexId(NULL); + } + + VertexId FindVertexByIncomingEdges(Kmer kmer) { + for (char c = 0; c < 4; ++c) { + KeyWithHash edge = origin_.ConstructKWH(kmer.pushFront(c)); + if (origin_.contains(edge)) { + return graph_.EdgeEnd(origin_.get_value(edge).edge_id); + } + } + return VertexId(NULL); + } + + VertexId FindVertex(Kmer kmer) { + VertexId v = FindVertexByOutgoingEdges(kmer); + return v == VertexId(NULL) ? FindVertexByIncomingEdges(kmer) : v; + } + + VertexId FindVertexMaybeMissing(Kmer kmer) { + VertexId v = FindVertex(kmer); + return v != VertexId(NULL) ? 
v : graph_.AddVertex(); + } + + VertexId FindEndMaybeMissing(const ConjugateDeBruijnGraph& graph, + VertexId start, Kmer start_kmer, Kmer end_kmer) { + if (start_kmer == end_kmer) { + return start; + } else if (start_kmer == !end_kmer) { + return graph.conjugate(start); + } else { + return FindVertexMaybeMissing(end_kmer); + } + } + + void ConstructPart(const std::vector& kwh_list, + std::vector& sequences) { + for (size_t i = 0; i < sequences.size(); ++i) { + if (origin_.contains(kwh_list[i])) { + continue; + } + + Kmer start_kmer = sequences[i].start < Kmer > (kmer_size_); + Kmer end_kmer = sequences[i].end < Kmer > (kmer_size_); + + VertexId start = FindVertexMaybeMissing(start_kmer); + VertexId end = FindEndMaybeMissing(graph_, start, start_kmer, + end_kmer); + + graph_.AddEdge(start, end, sequences[i]); + } + } + + void AddKmers(kmer_iterator &it, kmer_iterator &end, size_t queueSize, + std::vector& kwh_list) { + for (; kwh_list.size() != queueSize && it != end; ++it) { + KeyWithHash kwh = origin_.ConstructKWH(Kmer(unsigned(kmer_size_ + 1), (*it).data())); + + if (!origin_.contains(kwh)) + kwh_list.push_back(kwh); + } + } + + void CalculateSequences(std::vector &kwh_list, + std::vector &sequences) { + size_t size = kwh_list.size(); + sequences.resize(size); + +# pragma omp parallel for schedule(guided) + for (size_t i = 0; i < size; ++i) { + sequences[i] = ConstructSequenceWithEdge(kwh_list[i]); + } + } + +public: + DeBruijnGraphConstructor(Graph& graph, DeBruijn &origin) : + graph_(graph), origin_(origin), kmer_size_(graph_.k()) { + } + + void ConstructGraph(size_t queueMinSize, size_t queueMaxSize, + double queueGrowthRate) { + kmer_iterator it = origin_.kmer_begin(); + kmer_iterator end = origin_.kmer_end(); + size_t queueSize = queueMinSize; + std::vector kwh_list; + std::vector sequences; + kwh_list.reserve(queueSize); + sequences.reserve(queueMaxSize); + while (it != end) { + AddKmers(it, end, queueSize, kwh_list); // format a queue of kmers that are not in index + CalculateSequences(kwh_list, sequences); // in parallel + ConstructPart(kwh_list, sequences); + kwh_list.clear(); + queueSize = min(size_t(double(queueSize) * queueGrowthRate), queueMaxSize); + } + } + +private: + DECL_LOGGER("DeBruijnGraphConstructor") +}; + +class UnbranchingPathExtractor { +private: + typedef utils::DeBruijnExtensionIndex<> Index; + typedef RtSeq Kmer; + typedef Index::kmer_iterator kmer_iterator; + typedef Index::DeEdge DeEdge; + typedef Index::KeyWithHash KeyWithHash; + + Index &origin_; + size_t kmer_size_; + + bool IsJunction(KeyWithHash kwh) const { + return IsJunction(origin_.get_value(kwh)); + } + + bool IsJunction(utils::InOutMask mask) const { + return !mask.CheckUniqueOutgoing() || !mask.CheckUniqueIncoming(); + } + + void AddStartDeEdgesForVertex(KeyWithHash kh, utils::InOutMask mask, + std::vector& start_edges) const { + for (char next = 0; next < 4; next++) { + if (!mask.CheckOutgoing(next)) + continue; + + start_edges.emplace_back(kh, origin_.GetOutgoing(kh, next)); + TRACE("Added to queue " << start_edges.back() << " " << mask); + } + } + + void AddStartDeEdges(KeyWithHash kh, std::vector& start_edges) const { + start_edges.clear(); + auto extensions = origin_.get_value(kh); + if (!IsJunction(extensions)) + return; + + AddStartDeEdgesForVertex(kh, extensions, start_edges); + KeyWithHash kh_inv = !kh; + if (!kh_inv.is_minimal()) { + AddStartDeEdgesForVertex(kh_inv, origin_.get_value(kh_inv), + start_edges); + } + } + + bool StepRightIfPossible(DeEdge &edge) const { + utils::InOutMask 
mask = origin_.get_value(edge.end); + if (mask.CheckUniqueOutgoing() && mask.CheckUniqueIncoming()) { + edge = DeEdge(edge.end, + origin_.GetOutgoing(edge.end, mask.GetUniqueOutgoing())); + return true; + } + return false; + } + + Sequence ConstructSequenceWithEdge(DeEdge edge, SequenceBuilder &builder) const { + builder.clear(); // We reuse the buffer to reduce malloc traffic + builder.append(edge.start.key()); + builder.append(edge.end[kmer_size_ - 1]); + DeEdge initial = edge; + while (StepRightIfPossible(edge) && edge != initial) { + builder.append(edge.end[kmer_size_ - 1]); + } + return builder.BuildSequence(); + } + + // Loop consists of 4 parts: 2 selfRC k+1-mers and two sequences of arbitrary length RC to each other; pos is a position of one of selfRC edges + std::vector SplitLoop(const Sequence &s, size_t pos) const { + return { s.Subseq(pos, pos + kmer_size_ + 1), + s.Subseq(pos + 1, s.size() - kmer_size_) + s.Subseq(0, pos + kmer_size_) }; + + } + +// TODO Think about what happens to self rc perfect loops + std::vector ConstructLoopFromVertex(const KeyWithHash &kh, SequenceBuilder &builder) const { + DeEdge break_point(kh, origin_.GetUniqueOutgoing(kh)); + Sequence s = ConstructSequenceWithEdge(break_point, builder); + Kmer kmer = s.start(kmer_size_ + 1) >> 'A'; + for (size_t i = kmer_size_; i < s.size(); i++) { + kmer = kmer << s[i]; + if (kmer == !kmer) + return SplitLoop(s, i - kmer_size_); + } + return {s}; + } + + void CalculateSequences(kmer_iterator &it, + std::vector &sequences) const { + SequenceBuilder builder; + std::vector start_edges; + start_edges.reserve(8); + + for ( ; it.good(); ++it) { + KeyWithHash kh = origin_.ConstructKWH(Kmer(kmer_size_, *it)); + AddStartDeEdges(kh, start_edges); + + for (auto edge : start_edges) { + Sequence s = ConstructSequenceWithEdge(edge, builder); + if (s < !s) + continue; + + sequences.push_back(s); + TRACE("From " << edge << " calculated sequence\n" << s); + } + } + } + + void CleanCondensed(const Sequence &sequence) { + Kmer kmer = sequence.start(kmer_size_); + KeyWithHash kwh = origin_.ConstructKWH(kmer); + origin_.IsolateVertex(kwh); + for (size_t pos = kmer_size_; pos < sequence.size(); pos++) { + kwh = kwh << sequence[pos]; + origin_.IsolateVertex(kwh); + } + } + + void CleanCondensed(const std::vector &sequences) { +# pragma omp parallel for schedule(guided) + for (size_t i = 0; i < sequences.size(); ++i) { + CleanCondensed(sequences[i]); + CleanCondensed(!sequences[i]); + } + } + + // This method collects all loops that were not extracted by finding + // unbranching paths because there are no junctions on loops. + const std::vector CollectLoops(unsigned nchunks) { + INFO("Collecting perfect loops"); + auto its = origin_.kmer_begin(nchunks); + std::vector > starts(its.size()); + +# pragma omp parallel for schedule(guided) + for (size_t i = 0; i < its.size(); ++i) { + auto &it = its[i]; + for (; it.good(); ++it) { + KeyWithHash kh = origin_.ConstructKWH(Kmer(kmer_size_, *it)); + if (!IsJunction(kh)) + starts[i].push_back(kh); + } + } + + std::vector result; + SequenceBuilder builder; + for (const auto& entry : starts) { + for (const auto& kwh : entry) { + if (IsJunction(kwh)) + continue; + + for (Sequence s : ConstructLoopFromVertex(kwh, builder)) { + Sequence s_rc = !s; + if (s < s_rc) + result.push_back(s_rc); + else + result.push_back(s); + + CleanCondensed(s); + CleanCondensed(s_rc); + } + } + } + INFO("Collecting perfect loops finished.
" << result.size() << " loops collected"); + return result; + } + +public: + UnbranchingPathExtractor(Index &origin, size_t k) + : origin_(origin), kmer_size_(k) {} + + //TODO very large vector is returned. But I hate to make all those artificial changes that can fix it. + const std::vector ExtractUnbranchingPaths(unsigned nchunks) const { + auto its = origin_.kmer_begin(nchunks); + + INFO("Extracting unbranching paths"); + std::vector> sequences(its.size()); +# pragma omp parallel for schedule(guided) + for (size_t i = 0; i < its.size(); ++i) + CalculateSequences(its[i], sequences[i]); + + size_t snum = std::accumulate(sequences.begin(), sequences.end(), + 0, + [](size_t val, const std::vector &s) { + return val + s.size(); + }); + sequences[0].reserve(snum); + for (size_t i = 1; i < sequences.size(); ++i) { + sequences[0].insert(sequences[0].end(), + std::make_move_iterator(sequences[i].begin()), std::make_move_iterator(sequences[i].end())); + sequences[i].clear(); + sequences[i].shrink_to_fit(); + } + + INFO("Extracting unbranching paths finished. " << sequences[0].size() << " sequences extracted"); + return sequences[0]; + } + + const std::vector ExtractUnbranchingPathsAndLoops(unsigned nchunks) { + std::vector result = ExtractUnbranchingPaths(nchunks); + CleanCondensed(result); + std::vector loops = CollectLoops(nchunks); + result.insert(result.end(), + std::make_move_iterator(loops.begin()), std::make_move_iterator(loops.end())); + return result; + } + +private: + DECL_LOGGER("UnbranchingPathExtractor") +}; + +template +class FastGraphFromSequencesConstructor { +private: + typedef typename Graph::EdgeId EdgeId; + typedef typename Graph::VertexId VertexId; + typedef RtSeq Kmer; + typedef utils::DeBruijnExtensionIndex<> Index; + size_t kmer_size_; + Index &origin_; + + class LinkRecord { + private: + size_t hash_and_mask_; + EdgeId edge_; + + size_t BitBool(bool flag) const { + if (flag) + return 1; + return 0; + } + + public: + size_t GetHash() const { return hash_and_mask_ >> 2; } + bool IsRC() const { return hash_and_mask_ & 2; } + bool IsStart() const { return hash_and_mask_ & 1; } + EdgeId GetEdge() const { return edge_; } + bool IsInvalid() { return hash_and_mask_ + 1 == 0 && edge_ == EdgeId(0); } + + LinkRecord(size_t hash, EdgeId edge, bool is_start, bool is_rc) + : hash_and_mask_((hash << 2) | (BitBool(is_rc) << 1)| BitBool(is_start)), edge_(edge) { } + + LinkRecord() + : hash_and_mask_(-1ul), edge_(0) {} + + + bool operator<(const LinkRecord &other) const { + if (this->hash_and_mask_ == other.hash_and_mask_) + return this->edge_ < other.edge_; + return this->hash_and_mask_ < other.hash_and_mask_; + } + }; + + LinkRecord StartLink(const EdgeId &edge, const Sequence &sequence) const { + Kmer kmer(kmer_size_, sequence); + Kmer kmer_rc = !kmer; + if (kmer < kmer_rc) + return LinkRecord(origin_.ConstructKWH(kmer).idx(), edge, true, false); + else + return LinkRecord(origin_.ConstructKWH(kmer_rc).idx(), edge, true, true); + } + + LinkRecord EndLink(const EdgeId &edge, const Sequence &sequence) const { + Kmer kmer(kmer_size_, sequence, sequence.size() - kmer_size_); + Kmer kmer_rc = !kmer; + if (kmer < kmer_rc) + return LinkRecord(origin_.ConstructKWH(kmer).idx(), edge, false, false); + else + return LinkRecord(origin_.ConstructKWH(kmer_rc).idx(), edge, false, true); + } + + void CollectLinkRecords(typename Graph::HelperT &helper, const Graph &graph, std::vector &records, const vector &sequences) const { + size_t size = sequences.size(); + records.resize(size * 2, LinkRecord(0, 
EdgeId(0), false, false)); + restricted::IdSegmentStorage id_storage = helper.graph().GetGraphIdDistributor().Reserve(size * 2); +# pragma omp parallel for schedule(guided) + for (size_t i = 0; i < size; ++i) { + size_t j = i << 1; + auto id_distributor = id_storage.GetSegmentIdDistributor(j, j + 2);//indices for two edges are required + EdgeId edge = helper.AddEdge(DeBruijnEdgeData(sequences[i]), id_distributor); + records[j] = StartLink(edge, sequences[i]); + if (graph.conjugate(edge) != edge) + records[j + 1] = EndLink(edge, sequences[i]); + else + records[j + 1] = LinkRecord(); + } + } + + void LinkEdge(typename Graph::HelperT &helper, const Graph &graph, const VertexId v, const EdgeId edge, const bool is_start, const bool is_rc) const { + VertexId v1 = v; + if (is_rc) + v1 = graph.conjugate(v); + + if (is_start) + helper.LinkOutgoingEdge(v1, edge); + else + helper.LinkIncomingEdge(v1, edge); + } + +public: + FastGraphFromSequencesConstructor(size_t k, Index &origin) + : kmer_size_(k), origin_(origin) {} + + void ConstructGraph(Graph &graph, const vector &sequences) const { + typename Graph::HelperT helper = graph.GetConstructionHelper(); + vector records; + CollectLinkRecords(helper, graph, records, sequences);//TODO make parallel + parallel::sort(records.begin(), records.end()); + size_t size = records.size(); + vector> vertices_list(omp_get_max_threads()); + restricted::IdSegmentStorage id_storage = helper.graph().GetGraphIdDistributor().Reserve(size * 2); +# pragma omp parallel for schedule(guided) + for (size_t i = 0; i < size; i++) { + if (i != 0 && records[i].GetHash() == records[i - 1].GetHash()) + continue; + if (records[i].IsInvalid()) + continue; + + auto id_distributor = id_storage.GetSegmentIdDistributor(i << 1, (i << 1) + 2); + VertexId v = helper.CreateVertex(DeBruijnVertexData(), id_distributor); + vertices_list[omp_get_thread_num()].push_back(v); + for (size_t j = i; j < size && records[j].GetHash() == records[i].GetHash(); j++) { + LinkEdge(helper, graph, v, records[j].GetEdge(), records[j].IsStart(), records[j].IsRC()); + } + } + + for (size_t i = 0; i < vertices_list.size(); i++) + helper.AddVerticesToGraph(vertices_list[i].begin(), vertices_list[i].end()); + } +}; + +/* + * Constructs DeBruijnGraph from DeBruijnExtensionIndex using "new DeBruijnGraphExtentionConstructor(DeBruijn).ConstructGraph(DeBruijnGraph, Index)" + */ +template +class DeBruijnGraphExtentionConstructor { +private: + typedef typename Graph::EdgeId EdgeId; + typedef utils::DeBruijnExtensionIndex<> DeBruijn; + typedef typename Graph::VertexId VertexId; + typedef RtSeq Kmer; + + Graph &graph_; + DeBruijn &origin_; + size_t kmer_size_; + +public: + DeBruijnGraphExtentionConstructor(Graph& graph, DeBruijn &origin) : + graph_(graph), origin_(origin), kmer_size_(graph.k()) { + } + + void ConstructGraph(bool keep_perfect_loops) { + std::vector edge_sequences; + unsigned nchunks = 16 * omp_get_max_threads(); + if (keep_perfect_loops) + edge_sequences = UnbranchingPathExtractor(origin_, kmer_size_).ExtractUnbranchingPathsAndLoops(nchunks); + else + edge_sequences = UnbranchingPathExtractor(origin_, kmer_size_).ExtractUnbranchingPaths(nchunks); + FastGraphFromSequencesConstructor(kmer_size_, origin_).ConstructGraph(graph_, edge_sequences); + } + +private: + DECL_LOGGER("DeBruijnGraphConstructor") +}; + +} diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/construction/early_simplification.hpp spades-3.11.1+dfsg/src/common/assembly_graph/construction/early_simplification.hpp --- 
spades-3.10.1+dfsg/src/common/assembly_graph/construction/early_simplification.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/construction/early_simplification.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -0,0 +1,191 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once +#include "utils/standard_base.hpp" +#include "utils/ph_map/perfect_hash_map.hpp" +#include "utils/kmer_mph/kmer_index.hpp" + +namespace debruijn_graph { + +class LinkCleaner { +private: + typedef utils::DeBruijnExtensionIndex<> Index; + typedef Index::KMer Kmer; + typedef Index::KeyWithHash KeyWithHash; + Index &index_; + + void CleanForwardLinks(KeyWithHash &kh, char i) { + if(index_.CheckOutgoing(kh, i)) { + KeyWithHash next_kh = index_.GetOutgoing(kh, i); + if(!index_.CheckIncoming(next_kh, kh[0])) { + index_.DeleteOutgoing(kh, i); + } + } + } + + void CleanBackwardLinks(KeyWithHash &kh, char i) { + if(index_.CheckIncoming(kh, i)) { + KeyWithHash prev_kh = index_.GetIncoming(kh, i); + if(!index_.CheckOutgoing(prev_kh, kh[index_.k() - 1])) { + index_.DeleteIncoming(kh, i); + } + } + } + +public: + LinkCleaner(Index &index) : index_(index) {} + + //TODO make parallel + void CleanLinks() { + vector iters = index_.kmer_begin(10 * omp_get_max_threads()); +# pragma omp parallel for schedule(guided) + for(size_t i = 0; i < iters.size(); i++) { + for (Index::kmer_iterator &it = iters[i]; it.good(); ++it) { + KeyWithHash kh = index_.ConstructKWH(RtSeq(index_.k(), *it)); + if (kh.is_minimal()) { + KeyWithHash kh = index_.ConstructKWH(RtSeq(index_.k(), *it)); + for (char i = 0; i < 4; i++) { + CleanForwardLinks(kh, i); + CleanBackwardLinks(kh, i); + } + } + } + } + } +}; + +class AlternativeEarlyTipClipper { +private: + typedef utils::DeBruijnExtensionIndex<> Index; + typedef Index::KMer Kmer; + typedef Index::KeyWithHash KeyWithHash; + Index &index_; + size_t length_bound_; + + /* + * This method starts from the kmer that is second in the tip, counting from the junction vertex. It records all kmers of a tip into the tip vector. + * The method returns the length of the tip. + * In case it did not end as a tip, or if it was too long, the tip vector is cleared and infinite length is returned. + * Thus the tip vector contains only kmers to be removed, while the returned length value gives reasonable information of what happened.
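+ * Illustrative example (hypothetical graph): with length_bound_ = 3 and a dead-end chain J -> a -> b -> c hanging off a junction J, calling the method on a collects {a, b, c} and returns 3; if c instead had a second outgoing k-mer, the tip vector would be cleared and -1 (SIZE_MAX, i.e. "infinite") returned.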
+ */ + size_t FindForward(KeyWithHash kh, vector &tip) { + while(tip.size() < length_bound_ && index_.CheckUniqueIncoming(kh) && index_.CheckUniqueOutgoing(kh)) { + tip.push_back(kh); + kh = index_.GetUniqueOutgoing(kh); + } + tip.push_back(kh); + if(index_.CheckUniqueIncoming(kh) && index_.IsDeadEnd(kh)) { + return tip.size(); + } + tip.clear(); + return -1; + } + + size_t FindBackward(KeyWithHash kh, vector &tip) { + while(tip.size() < length_bound_ && index_.CheckUniqueOutgoing(kh) && index_.CheckUniqueIncoming(kh)) { + tip.push_back(kh); + kh = index_.GetUniqueIncoming(kh); + } + tip.push_back(kh); + if(index_.CheckUniqueOutgoing(kh) && index_.IsDeadStart(kh)) { + return tip.size(); + } + tip.clear(); + return -1; + } + + size_t RemoveTip(vector &tip) { + for(size_t i = 0; i < tip.size(); i++) + index_.IsolateVertex(tip[i]); + return tip.size(); + } + + size_t RemoveTips(vector > tips, size_t max) { + size_t result = 0; + for(char c = 0; c < 4; c++) { + if(tips[c].size() < max) { + result += RemoveTip(tips[c]); + } + } + return result; + } + + size_t RemoveForward(KeyWithHash kh) { + vector> tips; + tips.resize(4); + size_t max = 0; + for(char c = 0; c < 4; c++) { + if(index_.CheckOutgoing(kh, c)) { + KeyWithHash khc = index_.GetOutgoing(kh, c); + size_t len = FindForward(khc, tips[c]); + if(len > max) + max = len; + } + } + return RemoveTips(tips, max); + } + + size_t RemoveBackward(KeyWithHash kh) { + vector> tips; + tips.resize(4); + size_t max = 0; + for(char c = 0; c < 4; c++) { + if(index_.CheckIncoming(kh, c)) { + KeyWithHash khc = index_.GetIncoming(kh, c); + size_t len = FindBackward(khc, tips[c]); + if(len > max) + max = len; + } + } + return RemoveTips(tips, max); + } + + //TODO make parallel + size_t RoughClipTips() { + vector iters = index_.kmer_begin(10 * omp_get_max_threads()); + vector result(iters.size()); +# pragma omp parallel for schedule(guided) + for(size_t i = 0; i < iters.size(); i++) { + for(Index::kmer_iterator &it = iters[i]; it.good(); ++it) { + KeyWithHash kh = index_.ConstructKWH(RtSeq(index_.k(), *it)); + if(kh.is_minimal()) { + if (index_.OutgoingEdgeCount(kh) >= 2) { + result[i] += RemoveForward(kh); + } + if (index_.IncomingEdgeCount(kh) >= 2) { + result[i] += RemoveBackward(kh); + } + } + } + } + size_t sum = 0; + for(size_t i = 0; i < result.size(); i++) + sum += result[i]; + return sum; + } + + +public: + AlternativeEarlyTipClipper(Index &index, size_t length_bound) : index_(index), length_bound_(length_bound) { + } + + /* + * Method returns the number of removed edges + */ + size_t ClipTips() { + INFO("Early tip clipping"); + size_t result = RoughClipTips(); + LinkCleaner(index_).CleanLinks(); + INFO(result << " " << (index_.k()+1) <<"-mers were removed by early tip clipper"); + return result; + } +protected: + DECL_LOGGER("Early tip clipping"); +}; + +} diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/core/graph_core.hpp spades-3.11.1+dfsg/src/common/assembly_graph/core/graph_core.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/core/graph_core.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/core/graph_core.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -11,12 +11,15 @@ #include "utils/verify.hpp" #include "utils/logger/logger.hpp" #include "order_and_law.hpp" +#include "utils/stl_utils.hpp" + +#include "adt/small_pod_vector.hpp" + #include -#include "utils/simple_tools.hpp" +#include namespace omnigraph { -using std::vector; template class GraphCore; @@ -92,7 +95,7 @@ typedef typename 
DataMaster::VertexData VertexData; typedef restricted::pure_pointer> EdgeId; typedef restricted::pure_pointer> VertexId; - typedef typename std::vector::const_iterator edge_raw_iterator; + typedef typename adt::SmallPODVector::const_iterator edge_raw_iterator; class conjugate_iterator : public boost::iterator_facade { @@ -137,7 +140,7 @@ friend class PairedElementManipulationHelper; friend class conjugate_iterator; - std::vector outgoing_edges_; + adt::SmallPODVector outgoing_edges_; VertexId conjugate_; @@ -196,7 +199,7 @@ } const std::vector OutgoingEdgesTo(VertexId v) const { - vector result; + std::vector result; for (auto it = outgoing_edges_.begin(); it != outgoing_edges_.end(); ++it) { if ((*it)->end() == v) { result.push_back(*it); @@ -232,13 +235,14 @@ typedef typename DataMasterT::EdgeData EdgeData; typedef restricted::pure_pointer> EdgeId; typedef restricted::pure_pointer> VertexId; - typedef typename std::set::const_iterator VertexIt; + typedef btree::safe_btree_set VertexContainer; + typedef typename VertexContainer::const_iterator VertexIt; typedef typename PairedVertex::edge_const_iterator edge_const_iterator; private: restricted::LocalIdDistributor id_distributor_; DataMaster master_; - std::set vertices_; + VertexContainer vertices_; friend class ConstructionHelper; public: @@ -506,20 +510,20 @@ return edge->conjugate(); } - size_t length(const EdgeId edge) const { + size_t length(EdgeId edge) const { return master_.length(data(edge)); } - size_t length(const VertexId v) const { + size_t length(VertexId v) const { return master_.length(data(v)); } //////////////////////shortcut methods std::vector IncidentEdges(VertexId v) const { - vector answer; - push_back_all(answer, IncomingEdges(v)); - push_back_all(answer, OutgoingEdges(v)); + std::vector answer; + utils::push_back_all(answer, IncomingEdges(v)); + utils::push_back_all(answer, OutgoingEdges(v)); return answer; } @@ -574,7 +578,7 @@ std::string str(const VertexId v) const { // return master_.str(data(v)); - return ToString(int_id(v)); + return std::to_string(int_id(v)); } std::string detailed_str(const VertexId v) const { diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/core/graph_iterators.hpp spades-3.11.1+dfsg/src/common/assembly_graph/core/graph_iterators.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/core/graph_iterators.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/core/graph_iterators.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -7,10 +7,10 @@ #pragma once -#include "common/adt/queue_iterator.hpp" +#include "adt/queue_iterator.hpp" #include "func/pred.hpp" #include "action_handlers.hpp" -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" #include namespace omnigraph { @@ -23,7 +23,8 @@ template> class SmartIterator : public GraphActionHandler { typedef GraphActionHandler base; - DynamicQueueIterator inner_it_; + typedef adt::DynamicQueueIterator DynamicQueueIterator; + DynamicQueueIterator inner_it_; bool add_new_; bool canonical_only_; //todo think of checking it in HandleAdd @@ -120,7 +121,7 @@ const Comparator& comparator = Comparator(), bool canonical_only = false, func::TypedPredicate add_condition = func::AlwaysTrue()) - : base(g, "SmartSet " + ToString(this), add_new, comparator, canonical_only, add_condition) { + : base(g, "SmartSet", add_new, comparator, canonical_only, add_condition) { } template @@ -169,7 +170,7 @@ SmartVertexIterator(const Graph &g, const Comparator& comparator = Comparator(), bool canonical_only = false) : 
SmartIterator( - g, "SmartVertexIterator " + ToString(get_id()), true, + g, "SmartVertexIterator " + std::to_string(get_id()), true, comparator, canonical_only) { this->insert(g.begin(), g.end()); } @@ -297,7 +298,7 @@ SmartEdgeIterator(const Graph &g, Comparator comparator = Comparator(), bool canonical_only = false) : SmartIterator( - g, "SmartEdgeIterator " + ToString(get_id()), true, + g, "SmartEdgeIterator " + std::to_string(get_id()), true, comparator, canonical_only) { this->insert(EdgeIt(g, g.begin()), EdgeIt(g, g.end())); diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/core/order_and_law.hpp spades-3.11.1+dfsg/src/common/assembly_graph/core/order_and_law.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/core/order_and_law.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/core/order_and_law.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -15,7 +15,7 @@ #include "utils/stacktrace.hpp" #include #include -#include "utils/openmp_wrapper.h" +#include "utils/parallel/openmp_wrapper.h" #include "folly/PackedSyncPtr.h" diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/dijkstra/dijkstra_algorithm.hpp spades-3.11.1+dfsg/src/common/assembly_graph/dijkstra/dijkstra_algorithm.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/dijkstra/dijkstra_algorithm.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/dijkstra/dijkstra_algorithm.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -6,7 +6,7 @@ //*************************************************************************** #pragma once -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" #include "dijkstra_settings.hpp" #include @@ -203,8 +203,8 @@ return path; VertexId curr_vertex = vertex; - VertexId prev_vertex = get(prev_vert_map_, vertex).first; - EdgeId edge = get(prev_vert_map_, curr_vertex).second; + VertexId prev_vertex = utils::get(prev_vert_map_, vertex).first; + EdgeId edge = utils::get(prev_vert_map_, curr_vertex).second; while (prev_vertex != VertexId(0)) { if (graph_.EdgeStart(edge) == prev_vertex) @@ -212,7 +212,7 @@ else path.push_back(edge); curr_vertex = prev_vertex; - const auto& prev_v_e = get(prev_vert_map_, curr_vertex); + const auto& prev_v_e = utils::get(prev_vert_map_, curr_vertex); prev_vertex = prev_v_e.first; edge = prev_v_e.second; } diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/graph_support/contig_output.hpp spades-3.11.1+dfsg/src/common/assembly_graph/graph_support/contig_output.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/graph_support/contig_output.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/graph_support/contig_output.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -13,590 +13,20 @@ #include "assembly_graph/stats/statistics.hpp" #include "assembly_graph/paths/path_finders.hpp" #include "assembly_graph/paths/path_utils.hpp" +#include "assembly_graph/paths/bidirectional_path_io/io_support.hpp" namespace debruijn_graph { -//This class corrects mismatches or masks repeat differences or other such things with the sequence of an edge -template -class ContigCorrector { -private: - typedef typename Graph::EdgeId EdgeId; - const Graph &graph_; -protected: - const Graph &graph() const { - return graph_; - } - -public: - ContigCorrector(const Graph &graph) : graph_(graph) { - } - - virtual string correct(EdgeId e) = 0; - - virtual ~ContigCorrector() { - } -}; - -template -class DefaultContigCorrector : public ContigCorrector { -private: - typedef typename Graph::EdgeId 
EdgeId; -public: - DefaultContigCorrector(const Graph &graph) : ContigCorrector(graph) { - } - - string correct(EdgeId e) { - return this->graph().EdgeNucls(e).str(); - } -}; - - -class GFASegmentWriter { -private: - std::ostream &ostream_; - - -public: - - GFASegmentWriter(std::ostream &stream) : ostream_(stream) { - } - - void Write(size_t edge_id, const Sequence &seq, double cov) { - ostream_ << "S\t" << edge_id << "\t"; - ostream_ << seq.str() << "\t"; - ostream_ << "KC:i:" << int(cov) << std::endl; - } -}; - -class GFALinkWriter { -private: - std::ostream &ostream_; - size_t overlap_size_; - -public: - - GFALinkWriter(std::ostream &stream, size_t overlap_size) : ostream_(stream), overlap_size_(overlap_size) { - } - - void Write(size_t first_segment, std::string &first_orientation, size_t second_segment, std::string &second_orientation) { - ostream_ << "L\t" << first_segment << "\t" << first_orientation << "\t" ; - ostream_ << second_segment << "\t" << second_orientation << "\t" << overlap_size_ << "M"; - ostream_ << std::endl; - - } -}; - - -struct PathSegmentSequence { - size_t path_id_; - size_t segment_number_; - std::vector segment_sequence_; - PathSegmentSequence(size_t path_id, std::vector &segment_sequence) - : path_id_(path_id), segment_number_(1), segment_sequence_(segment_sequence) { - } - - PathSegmentSequence() - : path_id_(0), segment_number_(1), segment_sequence_(){ - } - void Reset() { - segment_sequence_.clear(); - } -}; - -class GFAPathWriter { -private: - std::ostream &ostream_; - -public: - - GFAPathWriter(std::ostream &stream) - : ostream_(stream) { - } - - void Write(const PathSegmentSequence &path_segment_sequence) { - ostream_ << "P" << "\t" ; - ostream_ << path_segment_sequence.path_id_ << "_" << path_segment_sequence.segment_number_ << "\t"; - std::string delimeter = ""; - for (size_t i = 0; i < path_segment_sequence.segment_sequence_.size() - 1; ++i) { - ostream_ << delimeter << path_segment_sequence.segment_sequence_[i]; - delimeter = ","; - } - ostream_ << "\t"; - std::string delimeter2 = ""; - for (size_t i = 0; i < path_segment_sequence.segment_sequence_.size() - 1; ++i) { - ostream_ << delimeter2 << "*"; - delimeter2 = ","; - } - ostream_ << std::endl; - } - -}; - -template -class GFAWriter { -private: - typedef typename Graph::EdgeId EdgeId; - const Graph &graph_; - const path_extend::PathContainer &paths_; - const string filename_; - std::set set_of_authentic_edges_; - - bool IsCanonical(EdgeId e) const { - if (e <= graph_.conjugate(e)) { - return true; - } else { - return false; - } - } - - std::string GetOrientation(EdgeId e) const { - return IsCanonical(e) ? "+" : "-"; - } - - void WriteSegments(std::ofstream &stream) { - GFASegmentWriter segment_writer(stream); - for (auto it = graph_.ConstEdgeBegin(true); !it.IsEnd(); ++it) { - segment_writer.Write((*it).int_id(), graph_.EdgeNucls(*it), graph_.coverage(*it) * double(graph_.length(*it))); - } - } - - void WriteLinks(std::ofstream &stream) { - GFALinkWriter link_writer(stream, graph_.k()); - for (auto it = graph_.SmartVertexBegin(); !it.IsEnd(); ++it) { - for (auto inc_edge : graph_.IncomingEdges(*it)) { - std::string orientation_first = GetOrientation(inc_edge); - size_t segment_first = IsCanonical(inc_edge) ? inc_edge.int_id() : graph_.conjugate(inc_edge).int_id(); - for (auto out_edge : graph_.OutgoingEdges(*it)) { - size_t segment_second = IsCanonical(out_edge) ? 
out_edge.int_id() : graph_.conjugate(out_edge).int_id(); - std::string orientation_second = GetOrientation(out_edge); - link_writer.Write(segment_first, orientation_first, segment_second, orientation_second); - } - } - } - } - - void UpdateSegmentedPath(PathSegmentSequence &segmented_path, EdgeId e) { - std::string segment_id = IsCanonical(e) ? ToString(e.int_id()) : ToString(graph_.conjugate(e).int_id()); - std::string orientation = GetOrientation(e); - segmented_path.segment_sequence_.push_back(segment_id + orientation); - } - - void WritePaths(std::ofstream &stream) { - GFAPathWriter path_writer(stream); - for (const auto &path_pair : paths_) { - const path_extend::BidirectionalPath &p = (*path_pair.first); - if (p.Size() == 0) { - continue; - } - PathSegmentSequence segmented_path; - segmented_path.path_id_ = p.GetId(); - for (size_t i = 0; i < p.Size() - 1; ++i) { - EdgeId e = p[i]; - UpdateSegmentedPath(segmented_path, e); - if (graph_.EdgeEnd(e) != graph_.EdgeStart(p[i+1])) { - path_writer.Write(segmented_path); - segmented_path.segment_number_++; - segmented_path.Reset(); - } - } - UpdateSegmentedPath(segmented_path, p.Back()); - path_writer.Write(segmented_path); - - } - } - -public: - GFAWriter(const Graph &graph, const path_extend::PathContainer &paths, const string &filename) - : graph_(graph), paths_(paths), filename_(filename) { - } - - void Write() { - std::ofstream stream; - stream.open(filename_); - WriteSegments(stream); - WriteLinks(stream); - WritePaths(stream); - } -}; - -//This class uses corrected sequences to construct contig (just return as is, find unipath, trim contig) -template -class ContigConstructor { -private: - typedef typename Graph::EdgeId EdgeId; - const Graph &graph_; - ContigCorrector &corrector_; -protected: - string correct(EdgeId e) { - return corrector_.correct(e); - } - - const Graph &graph() const { - return graph_; - } - -public: - - ContigConstructor(const Graph &graph, ContigCorrector &corrector) : graph_(graph), corrector_(corrector) { - } - - virtual pair construct(EdgeId e) = 0; - - virtual ~ContigConstructor(){ - } -}; - -template -class DefaultContigConstructor : public ContigConstructor { -private: - typedef typename Graph::EdgeId EdgeId; -public: - - DefaultContigConstructor(const Graph &graph, ContigCorrector &corrector) : ContigConstructor(graph, corrector) { - } - - pair construct(EdgeId e) { - return make_pair(this->correct(e), this->graph().coverage(e)); - } -}; - -template -vector Unipath(const Graph& g, typename Graph::EdgeId e) { - omnigraph::UniquePathFinder unipath_finder(g); - vector answer = unipath_finder.UniquePathBackward(e); - const vector& forward = unipath_finder.UniquePathForward(e); - for (size_t i = 1; i < forward.size(); ++i) { - answer.push_back(forward[i]); - } - return answer; -} - -template -class UnipathConstructor : public ContigConstructor { -private: - typedef typename Graph::EdgeId EdgeId; - - - - string MergeOverlappingSequences(std::vector& ss, size_t overlap) { - if (ss.empty()) { - return ""; - } - stringstream result; - result << ss.front().substr(0, overlap); -// prev_end = ss.front().substr(0, overlap); - for (auto it = ss.begin(); it != ss.end(); ++it) { -// VERIFY(prev_end == it->substr(0, overlap)); - result << it->substr(overlap); -// prev_end = it->substr(it->size() - overlap); - } - return result.str(); - } - - - string MergeSequences(const Graph& g, - const vector& continuous_path) { - vector path_sequences; - for (size_t i = 0; i < continuous_path.size(); ++i) { - if(i > 0) - VERIFY( - 
g.EdgeEnd(continuous_path[i - 1]) - == g.EdgeStart(continuous_path[i])); - path_sequences.push_back(this->correct(continuous_path[i])); - } - return MergeOverlappingSequences(path_sequences, g.k()); - } - -public: - - UnipathConstructor(const Graph &graph, ContigCorrector &corrector) : ContigConstructor(graph, corrector) { - } - - pair construct(EdgeId e) { - vector unipath = Unipath(this->graph(), e); - return make_pair(MergeSequences(this->graph(), unipath), stats::AvgCoverage(this->graph(), unipath)); - } -}; - -template -class CuttingContigConstructor : public ContigConstructor { -private: - typedef typename Graph::EdgeId EdgeId; - - bool ShouldCut(VertexId v) const { - const Graph &g = this->graph(); - vector edges; - push_back_all(edges, g.OutgoingEdges(v)); - if(edges.size() == 0) - return false; - for(size_t i = 1; i < edges.size(); i++) { - if(g.EdgeNucls(edges[i])[g.k()] != g.EdgeNucls(edges[0])[g.k()]) - return false; - } - edges.clear(); - push_back_all(edges, g.IncomingEdges(v)); - for(size_t i = 0; i < edges.size(); i++) - for(size_t j = i + 1; j < edges.size(); j++) { - if(g.EdgeNucls(edges[i])[g.length(edges[i]) - 1] != g.EdgeNucls(edges[j])[g.length(edges[j]) - 1]) - return true; - } - return false; - } - -public: - - CuttingContigConstructor(const Graph &graph, ContigCorrector &corrector) : ContigConstructor(graph, corrector) { - } - - pair construct(EdgeId e) { - string result = this->correct(e); - if(result.size() > this->graph().k() && ShouldCut(this->graph().EdgeEnd(e))) { - result = result.substr(0, result.size() - this->graph().k()); - } - if(result.size() > this->graph().k() && ShouldCut(this->graph().conjugate(this->graph().EdgeStart(e)))) { - result = result.substr(this->graph().k(), result.size()); - } - return make_pair(result, this->graph().coverage(e)); - } -}; - -struct ExtendedContigIdT { - string full_id_; - string short_id_; - - ExtendedContigIdT(): full_id_(""), short_id_("") {} - - ExtendedContigIdT(string full_id, string short_id): full_id_(full_id), short_id_(short_id) {} -}; - -template -void MakeContigIdMap(const Graph& graph, map& ids, const ConnectedComponentCounter &cc_counter_, string prefix) { - int counter = 0; - for (auto it = graph.ConstEdgeBegin(true); !it.IsEnd(); ++it) { - EdgeId e = *it; - if (ids.count(e) == 0) { - string id; - if (cfg::get().pd) { - size_t c_id = cc_counter_.GetComponent(e); - id = io::MakeContigComponentId(++counter, graph.length(e) + graph.k(), graph.coverage(e), c_id, prefix); - } - else - id = io::MakeContigId(++counter, graph.length(e) + graph.k(), graph.coverage(e), prefix); - ids[e] = ExtendedContigIdT(id, ToString(counter) + "+"); - if (e != graph.conjugate(e)) - ids[graph.conjugate(e)] = ExtendedContigIdT(id + "'", ToString(counter) + "-"); - } - } -} - -template -class ContigPrinter { -private: - const Graph &graph_; - ContigConstructor &constructor_; - template - void ReportEdge(sequence_stream& oss - , const pair sequence_data) { - oss << sequence_data.second; - oss << sequence_data.first; - } - - void ReportEdge(io::osequencestream_for_fastg& oss, - const string& sequence, - const string& id, - const set& nex_ids) { - oss.set_header(id); - oss << nex_ids; - oss << sequence; - } - -public: - ContigPrinter(const Graph &graph, ContigConstructor &constructor) : graph_(graph), constructor_(constructor) { - } - - template - void PrintContigs(sequence_stream &os) { - for (auto it = graph_.ConstEdgeBegin(true); !it.IsEnd(); ++it) { - ReportEdge(os, constructor_.construct(*it)); - } - } - - template - void 
PrintContigsFASTG(sequence_stream &os, const ConnectedComponentCounter & cc_counter) { - map ids; - MakeContigIdMap(graph_, ids, cc_counter, "EDGE"); - for (auto it = graph_.ConstEdgeBegin(true); !it.IsEnd(); ++it) { - set next; - VertexId v = graph_.EdgeEnd(*it); - auto edges = graph_.OutgoingEdges(v); - for (auto next_it = edges.begin(); next_it != edges.end(); ++next_it) { - next.insert(ids[*next_it].full_id_); - } - ReportEdge(os, constructor_.construct(*it).first, ids[*it].full_id_, next); - if (*it != graph_.conjugate(*it)) - { - set next_conj; - v = graph_.EdgeEnd(graph_.conjugate(*it)); - edges = graph_.OutgoingEdges(v); - for (auto next_it = edges.begin(); next_it != edges.end(); ++next_it) { - next_conj.insert(ids[*next_it].full_id_); - } - ReportEdge(os, constructor_.construct(graph_.conjugate(*it)).first, ids[graph_.conjugate(*it)].full_id_, next_conj); - } - } - } -}; - -template -bool PossibleECSimpleCheck(const Graph& g - , typename Graph::EdgeId e) { - return g.OutgoingEdgeCount(g.EdgeStart(e)) > 1 && g.IncomingEdgeCount(g.EdgeEnd(e)) > 1; -} - -template -void ReportEdge(io::osequencestream_cov& oss - , const Graph& g - , typename Graph::EdgeId e - , bool output_unipath = false - , size_t solid_edge_length_bound = 0) { - typedef typename Graph::EdgeId EdgeId; - if (!output_unipath || (PossibleECSimpleCheck(g, e) && g.length(e) <= solid_edge_length_bound)) { - TRACE("Outputting edge " << g.str(e) << " as single edge"); - oss << g.coverage(e); - oss << g.EdgeNucls(e); - } else { - TRACE("Outputting edge " << g.str(e) << " as part of unipath"); - vector unipath = Unipath(g, e); - TRACE("Unipath is " << g.str(unipath)); - oss << stats::AvgCoverage(g, unipath); - TRACE("Merged sequence is of length " << MergeSequences(g, unipath).size()); - oss << MergeSequences(g, unipath); - } -} - -inline void OutputContigs(ConjugateDeBruijnGraph &g, const string &contigs_output_filename, bool output_unipath) { +inline void OutputEdgeSequences(const Graph &g, + const string &contigs_output_filename) { INFO("Outputting contigs to " << contigs_output_filename << ".fasta"); - DefaultContigCorrector corrector(g); io::osequencestream_cov oss(contigs_output_filename + ".fasta"); - if(!output_unipath) { - DefaultContigConstructor constructor(g, corrector); - - ContigPrinter(g, constructor).PrintContigs(oss); - } else { - UnipathConstructor constructor(g, corrector); - ContigPrinter(g, constructor).PrintContigs(oss); - } - -// { -// osequencestream_cov oss(contigs_output_filename); -// set edges; -// for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) { -// if (edges.count(*it) == 0) { -// ReportEdge(oss, g, *it, output_unipath, solid_edge_length_bound + ".oppa.fasta"); -// edges.insert(g.conjugate(*it)); -// } -// // oss << g.EdgeNucls(*it); -// } -// DEBUG("Contigs written"); -// } -// if(!output_unipath) { -// OutputContigs(g, contigs_output_filename + ".2.fasta", true, solid_edge_length_bound); -// } -} - -inline void OutputContigsToGFA(ConjugateDeBruijnGraph &g, path_extend::PathContainer &paths, const string &contigs_output_filename) { - INFO("Outputting graph to " << contigs_output_filename << ".gfa"); - GFAWriter writer(g, paths, contigs_output_filename + ".gfa"); - writer.Write(); -} - - -inline void OutputContigsToFASTG(ConjugateDeBruijnGraph& g, - const string& contigs_output_filename, const ConnectedComponentCounter & cc_counter) { - - INFO("Outputting graph to " << contigs_output_filename << ".fastg"); - DefaultContigCorrector corrector(g); - DefaultContigConstructor constructor(g, 
corrector); - io::osequencestream_for_fastg ossfg(contigs_output_filename + ".fastg"); - ContigPrinter(g, constructor).PrintContigsFASTG(ossfg, cc_counter); -} - - - - -inline bool ShouldCut(ConjugateDeBruijnGraph& g, VertexId v) { - vector edges; - push_back_all(edges, g.OutgoingEdges(v)); - - if(edges.size() == 0) - return false; - for(size_t i = 1; i < edges.size(); i++) { - if(g.EdgeNucls(edges[i])[g.k()] != g.EdgeNucls(edges[0])[g.k()]) - return false; - } - edges.clear(); - push_back_all(edges, g.IncomingEdges(v)); - for(size_t i = 0; i < edges.size(); i++) - for(size_t j = i + 1; j < edges.size(); j++) { - if(g.EdgeNucls(edges[i])[g.length(edges[i]) - 1] != g.EdgeNucls(edges[j])[g.length(edges[j]) - 1]) - return true; - } - return false; -} - -inline void OutputCutContigs(ConjugateDeBruijnGraph& g, - const string& contigs_output_filename, - bool /*output_unipath*/ = false, - size_t /*solid_edge_length_bound*/ = 0) { - INFO("Outputting contigs to " << contigs_output_filename); - DefaultContigCorrector corrector(g); - io::osequencestream_cov oss(contigs_output_filename); - CuttingContigConstructor constructor(g, corrector); - -// osequencestream_cov oss(contigs_output_filename); -// set edges; -// for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) { -// EdgeId e = *it; -// cout << g.length(e) << endl; -// if (edges.count(e) == 0) { -// Sequence s = g.EdgeNucls(e); -// cout << s.size() << endl; -// cout << "oppa " << ShouldCut(g, g.EdgeEnd(e)) << endl; -// if(s.size() > g.k() && ShouldCut(g, g.EdgeEnd(e))) { -// s = s.Subseq(0, s.size() - g.k()); -// cout << s.size() << endl; -// } -// cout << "oppa1 " << ShouldCut(g, g.conjugate(g.EdgeStart(e))) << endl; -// if(s.size() > g.k() && ShouldCut(g, g.conjugate(g.EdgeStart(e)))) { -// s = s.Subseq(g.k(), s.size()); -// cout << s.size() << endl; -// } -// oss << g.coverage(e); -// oss << s; -// edges.insert(g.conjugate(*it)); -// } -// // oss << g.EdgeNucls(*it); -// } -} - -inline void OutputSingleFileContigs(ConjugateDeBruijnGraph& g, - const string& contigs_output_dir) { - INFO("Outputting contigs to " << contigs_output_dir); - int n = 0; - make_dir(contigs_output_dir); - char n_str[20]; - set edges; - for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) { - if (edges.count(*it) == 0) { - sprintf(n_str, "%d.fa", n); - edges.insert(g.conjugate(*it)); - io::osequencestream oss(contigs_output_dir + n_str); - oss << g.EdgeNucls(*it); - n++; - } + for (auto it = g.ConstEdgeBegin(true); !it.IsEnd(); ++it) { + EdgeId e = *it; + oss << g.coverage(e); + oss << g.EdgeNucls(e).str(); } - DEBUG("SingleFileContigs(Conjugate) written"); } } diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/graph_support/coverage_filling.hpp spades-3.11.1+dfsg/src/common/assembly_graph/graph_support/coverage_filling.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/graph_support/coverage_filling.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/graph_support/coverage_filling.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -10,7 +10,7 @@ }; template<> -struct SimultaneousCoverageCollector { +struct SimultaneousCoverageCollector { template static void CollectCoverage(SimultaneousCoverageFiller& filler, const Info &edge_info) { filler.inc_coverage(edge_info); @@ -18,7 +18,7 @@ }; template<> -struct SimultaneousCoverageCollector { +struct SimultaneousCoverageCollector { template static void CollectCoverage(SimultaneousCoverageFiller& filler, const Info &edge_info) { filler.inc_coverage(edge_info); diff -Nru 
spades-3.10.1+dfsg/src/common/assembly_graph/graph_support/detail_coverage.hpp spades-3.11.1+dfsg/src/common/assembly_graph/graph_support/detail_coverage.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/graph_support/detail_coverage.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/graph_support/detail_coverage.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -7,7 +7,7 @@ #pragma once -#include "utils/indices/perfect_hash_map.hpp" +#include "utils/ph_map/perfect_hash_map.hpp" #include "assembly_graph/core/coverage.hpp" #include "assembly_graph/core/action_handlers.hpp" #include "utils/verify.hpp" @@ -80,10 +80,11 @@ for (auto I = count_index.value_cbegin(), E = count_index.value_cend(); I != E; ++I) { const auto& edge_info = *I; + if (!edge_info.valid()) + continue; EdgeId e = edge_info.edge_id; unsigned offset = edge_info.offset; unsigned count = edge_info.count; - VERIFY(edge_info.valid()); VERIFY(e.get() != NULL); if (offset < averaging_range_) { IncRawCoverage(e, count); diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/graph_support/genomic_quality.hpp spades-3.11.1+dfsg/src/common/assembly_graph/graph_support/genomic_quality.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/graph_support/genomic_quality.hpp 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/graph_support/genomic_quality.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -109,7 +109,7 @@ virtual std::string label(EdgeId edge) const { double q = quality(edge); - return (q == 0) ? "" : "quality: " + ToString(q); + return (q == 0) ? "" : "quality: " + std::to_string(q); } void clear() { @@ -187,7 +187,7 @@ {} virtual void HandlePositiveQuality(EdgeId e) { - printing_rh_.HandleDelete(e, "_" + ToString(this->quality_handler().quality(e))); + printing_rh_.HandleDelete(e, "_" + std::to_string(this->quality_handler().quality(e))); } private: @@ -265,7 +265,7 @@ // //todo magic constant // map empty_coloring; // visualization::visualization_utils::WriteComponent(g_, EdgeNeighborhood(g_, edge, 50, 250), -// folder + "edge_" + ToString(g_.int_id(edge)) + ".dot", empty_coloring, labeler_); +// folder + "edge_" + std::to_string(g_.int_id(edge)) + ".dot", empty_coloring, labeler_); // } // //private: @@ -356,7 +356,7 @@ // // virtual std::string label(EdgeId edge) const { // double q = quality(edge); -// return (q == 0) ? "" : "quality: " + ToString(q); +// return (q == 0) ? 
"" : "quality: " + std::to_string(q); // } // //}; @@ -451,8 +451,8 @@ // //todo magic constant //// map empty_coloring; // shared_ptr> splitter = EdgeNeighborhoodFinder(g_, edge, 50, 250); -// visualization::visualization_utils::WriteComponents(g_, *splitter/*, "locality_of_edge_" + ToString(g_.int_id(edge))*/ -// , folder + "edge_" + ToString(g_.int_id(edge)) + "_" + ToString(quality_handler_.quality(edge)) + ".dot" +// visualization::visualization_utils::WriteComponents(g_, *splitter/*, "locality_of_edge_" + std::to_string(g_.int_id(edge))*/ +// , folder + "edge_" + std::to_string(g_.int_id(edge)) + "_" + std::to_string(quality_handler_.quality(edge)) + ".dot" // , colorer_, labeler_); // } else { // TRACE("Deleting edge " << g_.str(edge) << " with zero quality"); @@ -503,8 +503,8 @@ // shared_ptr> splitter = EdgeNeighborhoodFinder(g_, edge, 50, // 250); // -// visualization::visualization_utils::WriteComponents(g_, *splitter, TrueFilter>(), "locality_of_edge_" + ToString(g_.int_id(edge)) -// , folder + "edge_" + ToString(g_.int_id(edge)) + "_" + ToString(quality_handler_.quality(edge)) + ".dot" +// visualization::visualization_utils::WriteComponents(g_, *splitter, TrueFilter>(), "locality_of_edge_" + std::to_string(g_.int_id(edge)) +// , folder + "edge_" + std::to_string(g_.int_id(edge)) + "_" + std::to_string(quality_handler_.quality(edge)) + ".dot" // , empty_coloring, labeler_); // } // } @@ -543,8 +543,8 @@ // //todo magic constant // map empty_coloring; // shared_ptr> splitter = EdgeNeighborhoodFinder(g_, edge, 50, 250); -// visualization::visualization_utils::WriteComponents(g_, *splitter, TrueFilter>(), "locality_of_edge_" + ToString(g_.int_id(edge)) -// , folder + "edge_" + ToString(g_.int_id(edge)) + ".dot", empty_coloring, labeler_); +// visualization::visualization_utils::WriteComponents(g_, *splitter, TrueFilter>(), "locality_of_edge_" + std::to_string(g_.int_id(edge)) +// , folder + "edge_" + std::to_string(g_.int_id(edge)) + ".dot", empty_coloring, labeler_); // } // //private: diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/graph_support/parallel_processing.hpp spades-3.11.1+dfsg/src/common/assembly_graph/graph_support/parallel_processing.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/graph_support/parallel_processing.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/graph_support/parallel_processing.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -10,39 +10,10 @@ #include "utils/logger/logger.hpp" #include "assembly_graph/core/graph_iterators.hpp" #include "assembly_graph/graph_support/graph_processing_algorithm.hpp" -#include "utils/openmp_wrapper.h" +#include "utils/parallel/openmp_wrapper.h" namespace omnigraph { -template -void FindInterestingFromChunkIterators(const ItVec& chunk_iterators, - const Condition& predicate, - const Handler& handler) { - VERIFY(chunk_iterators.size() > 1); - typedef typename Condition::checked_type ElementType; - std::vector> of_interest(omp_get_max_threads()); - - #pragma omp parallel for schedule(guided) - for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) { - size_t cnt = 0; - for (auto it = chunk_iterators[i], end = chunk_iterators[i + 1]; it != end; ++it) { - ElementType t = *it; - if (predicate(t)) { - of_interest[omp_get_thread_num()].push_back(t); - } - cnt++; - } - DEBUG("Processed " << cnt << " elements as potential candidates by thread " << omp_get_thread_num()); - } - - for (auto& chunk : of_interest) { - for (const auto& el : chunk) { - handler(el); - } - chunk.clear(); - } 
-} - template class InterestingElementFinder { protected: @@ -59,6 +30,9 @@ virtual bool Run(const Graph& /*g*/, HandlerF /*handler*/) const = 0; }; +template +using InterestingFinderPtr = std::shared_ptr>; + template class TrivialInterestingElementFinder : public InterestingElementFinder { @@ -101,6 +75,39 @@ const size_t chunk_cnt_; public: + template + static void FindInterestingFromChunkIterators(const ItVec& chunk_iterators, + const Condition& predicate, + const Handler& handler) { + VERIFY(chunk_iterators.size() > 1); + DEBUG("Parallel search for elements of interest"); + typedef typename Condition::checked_type ElementType; + std::vector> of_interest(chunk_iterators.size() - 1); + + #pragma omp parallel for schedule(guided) + for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) { + DEBUG("Processing chunk " << i << " by thread " << omp_get_thread_num()); + size_t cnt = 0; + for (auto it = chunk_iterators[i], end = chunk_iterators[i + 1]; it != end; ++it) { + ElementType t = *it; + if (predicate(t)) { + of_interest[i].push_back(t); + } + cnt++; + } + DEBUG("Processed chunk " << i << ". " << cnt << " elements identified as potential candidates"); + } + + DEBUG("Merging chunks"); + for (auto& chunk : of_interest) { + for (const auto& el : chunk) { + handler(el); + } + chunk.clear(); + } + DEBUG("Chunks merged"); + } + ParallelInterestingElementFinder(func::TypedPredicate condition, size_t chunk_cnt) : base(condition), chunk_cnt_(chunk_cnt) {} @@ -143,7 +150,7 @@ return total_triggered; } -//todo only potentially relevant edges should be stored at any point +//FIXME only potentially relevant edges should be stored at any point template> class PersistentProcessingAlgorithm : public PersistentAlgorithmBase { @@ -153,8 +160,8 @@ private: SmartSetIterator it_; - bool tracking_; - size_t total_iteration_estimate_; + const bool tracking_; + const size_t total_iteration_estimate_; size_t curr_iteration_; protected: @@ -222,6 +229,7 @@ curr_iteration_++; return triggered; } + private: DECL_LOGGER("PersistentProcessingAlgorithm"); }; @@ -269,6 +277,7 @@ DECL_LOGGER("ParallelEdgeRemovingAlgorithm"); }; +//TODO use coverage order? template> class DisconnectionAlgorithm : public PersistentProcessingAlgorithm>(condition, chunk_cnt), /*canonical_only*/false, comp, track_changes), + //condition_(second_check ? 
condition : func::AlwaysTrue()), condition_(condition), disconnector_(g, removal_handler) { } diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/graph_support/scaff_supplementary.cpp spades-3.11.1+dfsg/src/common/assembly_graph/graph_support/scaff_supplementary.cpp --- spades-3.10.1+dfsg/src/common/assembly_graph/graph_support/scaff_supplementary.cpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/graph_support/scaff_supplementary.cpp 2017-09-28 09:05:09.000000000 +0000 @@ -33,23 +33,23 @@ } -void ScaffoldingUniqueEdgeAnalyzer::FillUniqueEdgeStorage(ScaffoldingUniqueEdgeStorage &storage_) { - storage_.unique_edges_.clear(); +void ScaffoldingUniqueEdgeAnalyzer::FillUniqueEdgeStorage(ScaffoldingUniqueEdgeStorage &storage) { + storage.unique_edges_.clear(); size_t total_len = 0; size_t unique_len = 0; size_t unique_num = 0; - storage_.SetMinLength(length_cutoff_); + storage.set_min_length(length_cutoff_); for (auto iter = gp_.g.ConstEdgeBegin(); !iter.IsEnd(); ++iter) { size_t tlen = gp_.g.length(*iter); total_len += tlen; if (gp_.g.length(*iter) >= length_cutoff_ && gp_.g.coverage(*iter) > median_coverage_ * (1 - relative_coverage_variation_) && gp_.g.coverage(*iter) < median_coverage_ * (1 + relative_coverage_variation_) ) { - storage_.unique_edges_.insert(*iter); + storage.unique_edges_.insert(*iter); unique_len += tlen; unique_num ++; } } - for (auto iter = storage_.begin(); iter != storage_.end(); ++iter) { + for (auto iter = storage.begin(); iter != storage.end(); ++iter) { DEBUG (gp_.g.int_id(*iter) << " " << gp_.g.coverage(*iter) << " " << gp_.g.length(*iter) ); } INFO ("With length cutoff: " << length_cutoff_ <<", median long edge coverage: " << median_coverage_ << ", and maximal unique coverage: " << @@ -84,8 +84,9 @@ return voting; } -bool ScaffoldingUniqueEdgeAnalyzer::ConservativeByPaths(EdgeId e, shared_ptr long_reads_cov_map, const pe_config::LongReads lr_config, int direction) const { - BidirectionalPathSet all_set = long_reads_cov_map->GetCoveringPaths(e); +bool ScaffoldingUniqueEdgeAnalyzer::ConservativeByPaths(EdgeId e, const GraphCoverageMap &long_reads_cov_map, + const pe_config::LongReads &lr_config, int direction) const { + BidirectionalPathSet all_set = long_reads_cov_map.GetCoveringPaths(e); BidirectionalPathMap active_paths; size_t loop_weight = 0; size_t nonloop_weight = 0; @@ -101,7 +102,9 @@ } } //TODO: small plasmid, paths a-b-a, b-a-b ? - if (loop_weight > 1) +//2 - hybrid paths weight doubles (conjugate paths) +//TODO: remove weight dublication + if (loop_weight > 2 && loop_weight * overwhelming_majority_ > nonloop_weight) return false; else DEBUG (gp_.g.int_id(e) << " loop/nonloop weight " << loop_weight << " " << nonloop_weight); @@ -121,8 +124,8 @@ maxx = pair.second; } for (const auto &pair: voting) - //TODO:: 1 from config? 
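[Note on the hunk above: ConservativeByPaths decides edge uniqueness by voting. FillNextEdgeVoting tallies, per candidate next edge, the weight of long-read paths supporting it, and the edge survives only if the best candidate dominates the summed alternatives by the unique_edge_priority factor; per the comment in the hunk, conjugate paths double every weight, hence the cutoff of 2. A minimal standalone sketch of that dominance test, with illustrative names (EdgeVotes, HasDominantNext) that are not SPAdes API:]

    // Sketch of the majority-vote uniqueness test (illustrative only).
    #include <cstddef>
    #include <iostream>
    #include <map>

    using EdgeVotes = std::map<int, size_t>; // candidate next edge -> supporting path weight

    // True when the best-supported candidate dominates the summed alternatives
    // by the given priority factor (cf. lr_config.unique_edge_priority above).
    // Votes of weight <= noise_cutoff are ignored; conjugate paths double every
    // weight, which is why the code above compares against 2 rather than 1.
    bool HasDominantNext(const EdgeVotes &votes, double priority, size_t noise_cutoff = 2) {
        int best_edge = -1;
        size_t best = 0, alt = 0;
        for (const auto &ev : votes)
            if (ev.second > best) { best = ev.second; best_edge = ev.first; }
        for (const auto &ev : votes)
            if (ev.first != best_edge && ev.second > noise_cutoff)
                alt += ev.second;
        return double(best) >= priority * double(alt);
    }

    int main() {
        EdgeVotes votes{{7, 10}, {8, 3}, {9, 1}};
        std::cout << std::boolalpha << HasDominantNext(votes, 1.5) << "\n"; // true: 10 >= 1.5 * 3
    }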
- if (pair.first != next_unique && pair.second > 1) +//2 - hybrid paths weight doubles (conjugate paths) + if (pair.first != next_unique && pair.second > 2) alt += pair.second; if (maxx < lr_config.unique_edge_priority * double(alt)) { DEBUG("edge " << gp_.g.int_id(e) <<" dir "<< direction << " was not unique" ); @@ -146,7 +149,9 @@ return true; } -bool ScaffoldingUniqueEdgeAnalyzer::ConservativeByPaths(EdgeId e, shared_ptr long_reads_cov_map, const pe_config::LongReads lr_config) const{ +bool ScaffoldingUniqueEdgeAnalyzer::ConservativeByPaths(EdgeId e, + const GraphCoverageMap &long_reads_cov_map, + const pe_config::LongReads &lr_config) const{ return (ConservativeByPaths(e, long_reads_cov_map, lr_config, 1) && ConservativeByPaths(e, long_reads_cov_map, lr_config, -1)); } @@ -167,7 +172,7 @@ } } -set ScaffoldingUniqueEdgeAnalyzer::GetChildren(VertexId v, map > &dijkstra_cash_) const { +set ScaffoldingUniqueEdgeAnalyzer::GetChildren(VertexId v, map> &dijkstra_cash_) const { DijkstraHelper::BoundedDijkstra dijkstra( DijkstraHelper::CreateBoundedDijkstra(gp_.g, max_dijkstra_depth_, max_dijkstra_vertices_)); dijkstra.Run(v); @@ -180,7 +185,7 @@ return dijkstra_cash_[v]; } -bool ScaffoldingUniqueEdgeAnalyzer::FindCommonChildren(EdgeId e1, EdgeId e2, map > &dijkstra_cash_) const { +bool ScaffoldingUniqueEdgeAnalyzer::FindCommonChildren(EdgeId e1, EdgeId e2, map> &dijkstra_cash_) const { auto s1 = GetChildren(gp_.g.EdgeEnd(e1), dijkstra_cash_); auto s2 = GetChildren(gp_.g.EdgeEnd(e2), dijkstra_cash_); if (s1.find(gp_.g.EdgeStart(e2)) != s1.end()) { @@ -198,7 +203,7 @@ return false; } -bool ScaffoldingUniqueEdgeAnalyzer::FindCommonChildren(vector> &next_weights) const { +bool ScaffoldingUniqueEdgeAnalyzer::FindCommonChildren(const vector> &next_weights) const { map > dijkstra_cash_; for (size_t i = 0; i < next_weights.size(); i ++) { for (size_t j = i + 1; j < next_weights.size(); j++) { @@ -238,17 +243,18 @@ } -void ScaffoldingUniqueEdgeAnalyzer::ClearLongEdgesWithPairedLib(size_t lib_index, ScaffoldingUniqueEdgeStorage &storage_) const { +void ScaffoldingUniqueEdgeAnalyzer::ClearLongEdgesWithPairedLib(size_t lib_index, + ScaffoldingUniqueEdgeStorage &storage) const { set to_erase; - for (EdgeId edge: storage_ ) { + for (EdgeId edge: storage) { if (!FindCommonChildren(edge, lib_index)) { to_erase.insert(edge); to_erase.insert(gp_.g.conjugate(edge)); } } - for (auto iter = storage_.begin(); iter != storage_.end(); ){ + for (auto iter = storage.begin(); iter != storage.end(); ){ if (to_erase.find(*iter) != to_erase.end()){ - iter = storage_.erase(iter); + iter = storage.erase(iter); } else { iter++; } @@ -256,7 +262,9 @@ } -void ScaffoldingUniqueEdgeAnalyzer::FillUniqueEdgesWithLongReads(shared_ptr long_reads_cov_map, ScaffoldingUniqueEdgeStorage& unique_storage_pb, const pe_config::LongReads lr_config) { +void ScaffoldingUniqueEdgeAnalyzer::FillUniqueEdgesWithLongReads(GraphCoverageMap &long_reads_cov_map, + ScaffoldingUniqueEdgeStorage &unique_storage_pb, + const pe_config::LongReads &lr_config) { for (auto iter = gp_.g.ConstEdgeBegin(); !iter.IsEnd(); ++iter) { EdgeId e = *iter; if (ConservativeByLength(e) && ConservativeByPaths(e, long_reads_cov_map, lr_config)) { diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/graph_support/scaff_supplementary.hpp spades-3.11.1+dfsg/src/common/assembly_graph/graph_support/scaff_supplementary.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/graph_support/scaff_supplementary.hpp 2017-02-28 14:55:36.000000000 +0000 +++ 
spades-3.11.1+dfsg/src/common/assembly_graph/graph_support/scaff_supplementary.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -16,14 +16,20 @@ */ class ScaffoldingUniqueEdgeStorage { friend class ScaffoldingUniqueEdgeAnalyzer; -private: - set<EdgeId> unique_edges_; + set<EdgeId> unique_edges_; size_t min_unique_length_; + public: - ScaffoldingUniqueEdgeStorage(): unique_edges_(){ + ScaffoldingUniqueEdgeStorage(): unique_edges_(), min_unique_length_(0) { DEBUG("storage created, empty"); } + ScaffoldingUniqueEdgeStorage(const ScaffoldingUniqueEdgeStorage&) = delete; + ScaffoldingUniqueEdgeStorage& operator=(const ScaffoldingUniqueEdgeStorage&) = delete; + + ScaffoldingUniqueEdgeStorage(ScaffoldingUniqueEdgeStorage&&) = default; + ScaffoldingUniqueEdgeStorage& operator=(ScaffoldingUniqueEdgeStorage&&) = default; + bool IsUnique(EdgeId e) const { return (unique_edges_.find(e) != unique_edges_.end()); } @@ -36,34 +42,32 @@ return unique_edges_.end(); } - decltype(unique_edges_.begin()) erase(decltype(unique_edges_.begin()) iter){ + decltype(unique_edges_.begin()) erase(decltype(unique_edges_.begin()) iter) { return unique_edges_.erase(iter); } size_t size() const { return unique_edges_.size(); } - size_t GetMinLength() const { + size_t min_length() const { return min_unique_length_; } - void SetMinLength(size_t min_length) { + void set_min_length(size_t min_length) { min_unique_length_ = min_length; } - const set<EdgeId>& GetSet() const { + const set<EdgeId>& unique_edges() const { return unique_edges_; } protected: - DECL_LOGGER("ScaffoldingUniqueEdgeStorage") - + DECL_LOGGER("ScaffoldingUniqueEdgeStorage"); }; //Auxiliary class required to fill in the unique edge storage. class ScaffoldingUniqueEdgeAnalyzer { - const debruijn_graph::conj_graph_pack &gp_; size_t length_cutoff_; double median_coverage_; @@ -73,13 +77,15 @@ static const size_t max_dijkstra_depth_ = 1000; static const size_t max_dijkstra_vertices_ = 1000; static const size_t overwhelming_majority_ = 10; - set<VertexId> GetChildren(VertexId v, map<VertexId, set<VertexId> > &dijkstra_cash_) const; - bool FindCommonChildren(EdgeId e1, EdgeId e2, map<VertexId, set<VertexId> > &dijkstra_cash_) const; - bool FindCommonChildren(vector<pair<EdgeId, double>> &next_weights) const; + set<VertexId> GetChildren(VertexId v, map<VertexId, set<VertexId>> &dijkstra_cash) const; + bool FindCommonChildren(EdgeId e1, EdgeId e2, map<VertexId, set<VertexId>> &dijkstra_cash) const; + bool FindCommonChildren(const vector<pair<EdgeId, double>> &next_weights) const; bool FindCommonChildren(EdgeId from, size_t lib_index) const; map<EdgeId, size_t> FillNextEdgeVoting(BidirectionalPathMap<size_t>& active_paths, int direction) const; - bool ConservativeByPaths(EdgeId e, shared_ptr<GraphCoverageMap> long_reads_cov_map, const pe_config::LongReads lr_config) const; - bool ConservativeByPaths(EdgeId e, shared_ptr<GraphCoverageMap> long_reads_cov_map, const pe_config::LongReads lr_config, int direction) const; + bool ConservativeByPaths(EdgeId e, const GraphCoverageMap &long_reads_cov_map, + const pe_config::LongReads &lr_config) const; + bool ConservativeByPaths(EdgeId e, const GraphCoverageMap &long_reads_cov_map, + const pe_config::LongReads &lr_config, int direction) const; bool ConservativeByLength(EdgeId e); void CheckCorrectness(ScaffoldingUniqueEdgeStorage& unique_storage_pb); protected: @@ -88,12 +94,78 @@ void SetCoverageBasedCutoff(); public: - ScaffoldingUniqueEdgeAnalyzer(const debruijn_graph::conj_graph_pack &gp, size_t apriori_length_cutoff, double max_relative_coverage):gp_(gp), length_cutoff_(apriori_length_cutoff), relative_coverage_variation_(max_relative_coverage){ + ScaffoldingUniqueEdgeAnalyzer(const debruijn_graph::conj_graph_pack &gp, size_t apriori_length_cutoff, + double
max_relative_coverage): + gp_(gp), + length_cutoff_(apriori_length_cutoff), + relative_coverage_variation_(max_relative_coverage) { SetCoverageBasedCutoff(); } - void FillUniqueEdgeStorage(ScaffoldingUniqueEdgeStorage &storage_); - void ClearLongEdgesWithPairedLib(size_t lib_index, ScaffoldingUniqueEdgeStorage &storage_) const; - void FillUniqueEdgesWithLongReads(shared_ptr long_reads_cov_map, ScaffoldingUniqueEdgeStorage& unique_storage_pb, const pe_config::LongReads lr_config); + void FillUniqueEdgeStorage(ScaffoldingUniqueEdgeStorage &storage); + void ClearLongEdgesWithPairedLib(size_t lib_index, ScaffoldingUniqueEdgeStorage &storage) const; + void FillUniqueEdgesWithLongReads(GraphCoverageMap &long_reads_cov_map, + ScaffoldingUniqueEdgeStorage &unique_storage_pb, + const pe_config::LongReads &lr_config); +}; + +class UsedUniqueStorage { + set used_; + const ScaffoldingUniqueEdgeStorage& unique_; + +public: + UsedUniqueStorage(const UsedUniqueStorage&) = delete; + UsedUniqueStorage& operator=(const UsedUniqueStorage&) = delete; + + UsedUniqueStorage(UsedUniqueStorage&&) = default; + UsedUniqueStorage& operator=(UsedUniqueStorage&&) = default; + + explicit UsedUniqueStorage(const ScaffoldingUniqueEdgeStorage& unique): + unique_(unique) {} + + void insert(EdgeId e) { + if (unique_.IsUnique(e)) { + used_.insert(e); + used_.insert(e->conjugate()); + } + } + +// const ScaffoldingUniqueEdgeStorage& unique_edge_storage() const { +// return unique_; +// } + + bool IsUsedAndUnique(EdgeId e) const { + return (unique_.IsUnique(e) && used_.find(e) != used_.end()); + } + + bool UniqueCheckEnabled() const { + return unique_.size() > 0; + } + + bool TryUseEdge(BidirectionalPath &path, EdgeId e, const Gap &gap) { + if (UniqueCheckEnabled()) { + if (IsUsedAndUnique(e)) { + return false; + } else { + insert(e); + } + } + path.PushBack(e, gap); + return true; + } + +}; + +//FIXME rename +struct UniqueData { + size_t min_unique_length_; + double unique_variation_; + + ScaffoldingUniqueEdgeStorage main_unique_storage_; + vector unique_storages_; + + ScaffoldingUniqueEdgeStorage unique_pb_storage_; + vector long_reads_paths_; + vector long_reads_cov_map_; }; } diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/handlers/edge_labels_handler.hpp spades-3.11.1+dfsg/src/common/assembly_graph/handlers/edge_labels_handler.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/handlers/edge_labels_handler.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/handlers/edge_labels_handler.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -19,7 +19,7 @@ //#include "utils.hpp" #include "visualization/graph_labeler.hpp" -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" #include #include diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/handlers/edges_position_handler.hpp spades-3.11.1+dfsg/src/common/assembly_graph/handlers/edges_position_handler.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/handlers/edges_position_handler.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/handlers/edges_position_handler.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -16,7 +16,7 @@ #define EDGES_POSITION_HANDLER_HPP_ //#include "utils.hpp" -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" #include "assembly_graph/paths/mapping_path.hpp" #include "assembly_graph/core/action_handlers.hpp" @@ -46,8 +46,8 @@ map>> edges_positions_; //TODO extract set as a storage class - MappingRange EraseAndExtract(set &ranges, set::iterator 
&position, const MappingRange &new_pos) { - auto &old_pos = *position; + MappingRange EraseAndExtract(set &ranges, set::iterator &position, const MappingRange &new_pos) const { + auto old_pos = *position; if(old_pos.IntersectLeftOf(new_pos) || old_pos.StrictlyContinuesWith(new_pos, max_mapping_gap_, max_gap_diff_)) { ranges.erase(position); return old_pos.Merge(new_pos); @@ -59,8 +59,14 @@ } } + std::string RangeStr(const Range &range) const { + std::stringstream ss; + ss << "[" << (range.start_pos + 1) << " - " << range.end_pos << "]"; + return ss.str(); + } + public: - MappingRange EraseAndExtract(set &ranges, MappingRange new_pos) { + MappingRange EraseAndExtract(set &ranges, MappingRange new_pos) const { auto it = ranges.lower_bound(new_pos); if(it != ranges.end()) { new_pos = EraseAndExtract(ranges, it, new_pos); @@ -72,7 +78,7 @@ return new_pos; } - set GetEdgePositions(EdgeId edge, string contig_id) const { + set GetEdgePositions(EdgeId edge, const string &contig_id) const { VERIFY(this->IsAttached()); auto edge_it = edges_positions_.find(edge); if(edge_it == edges_positions_.end()) @@ -85,6 +91,12 @@ return it->second; } + MappingRange GetUniqueEdgePosition(EdgeId edge, const string &contig_id) const { + auto poss = GetEdgePositions(edge, contig_id); + VERIFY(poss.size() == 1); + return *poss.begin(); + } + vector GetEdgePositions(EdgeId edge) const { VERIFY(this->IsAttached()); auto edge_it = edges_positions_.find(edge); @@ -106,7 +118,7 @@ void AddEdgePosition(EdgeId edge, string contig_id, MappingRange new_pos) { VERIFY(this->IsAttached()); - if(new_pos.empty()) + if (new_pos.empty()) return; set &new_set = edges_positions_[edge][contig_id]; new_pos = EraseAndExtract(new_set, new_pos); @@ -136,9 +148,11 @@ vector positions = GetEdgePositions(edge); size_t counter = 0; for (auto pos_it = positions.begin(), end = positions.end(); pos_it != end; ++pos_it) { - ss << "(" << pos_it->contigId << ": " << pos_it->mr << ")\\n"; + ss << "(" << pos_it->contigId << ": " + << RangeStr(pos_it->mr.initial_range) << " --> " + << RangeStr(pos_it->mr.mapped_range) << ")\\n"; counter++; - if(counter > 30) { + if (counter > 30) { ss << "and many more. 
Totally " << positions.size() << " positions."; break; } diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/handlers/id_track_handler.hpp spades-3.11.1+dfsg/src/common/assembly_graph/handlers/id_track_handler.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/handlers/id_track_handler.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/handlers/id_track_handler.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -10,7 +10,7 @@ #include //#include "utils.hpp" #include "visualization/graph_labeler.hpp" -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" #include "assembly_graph/core/action_handlers.hpp" using namespace omnigraph; @@ -78,33 +78,4 @@ } }; -template -class BaseIdTrackHandler { -public: - BaseIdTrackHandler() { - } - - size_t ReturnIntId(EdgeId e) const { - return e.int_id(); - } - - size_t ReturnIntId(VertexId v) const { - return v.int_id(); - } -}; - -template -class IdTrackHandler : public BaseIdTrackHandler { -private: - typedef typename Graph::EdgeId EdgeId; - typedef typename Graph::VertexId VertexId; - const Graph &graph_; -public: - IdTrackHandler(const Graph& g) : graph_(g) { - } - - ~IdTrackHandler() { - } -}; - } diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/index/edge_index_builders.hpp spades-3.11.1+dfsg/src/common/assembly_graph/index/edge_index_builders.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/index/edge_index_builders.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/index/edge_index_builders.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -0,0 +1,252 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#pragma once + +#include "edge_info_updater.hpp" +#include "utils/ph_map/perfect_hash_map_builder.hpp" + +namespace debruijn_graph { + +template +class DeBruijnGraphKMerSplitter : public utils::DeBruijnKMerSplitter { + typedef typename omnigraph::GraphEdgeIterator EdgeIt; + typedef typename Graph::EdgeId EdgeId; + typedef typename adt::iterator_range EdgeRange; + + const Graph &g_; + + size_t FillBufferFromEdges(EdgeRange &r, unsigned thread_id); + +public: + DeBruijnGraphKMerSplitter(const std::string &work_dir, + unsigned K, const Graph &g, + size_t read_buffer_size = 0) + : utils::DeBruijnKMerSplitter(work_dir, K, KmerFilter(), read_buffer_size), + g_(g) {} + + fs::files_t Split(size_t num_files, unsigned nthreads) override; +}; + +template +size_t +DeBruijnGraphKMerSplitter::FillBufferFromEdges(EdgeRange &r, + unsigned thread_id) { + size_t seqs = 0; + for (auto &it = r.begin(); it != r.end(); ++it) { + const Sequence &nucls = g_.EdgeNucls(*it); + + seqs += 1; + if (this->FillBufferFromSequence(nucls, thread_id)) + break; + } + + return seqs; +} + +template +fs::files_t DeBruijnGraphKMerSplitter::Split(size_t num_files, unsigned nthreads) { + fs::files_t out = this->PrepareBuffers(num_files, nthreads, this->read_buffer_size_); + + omnigraph::IterationHelper edges(g_); + auto its = edges.Chunks(nthreads); + + // Turn chunks into iterator ranges + std::vector ranges; + for (size_t i = 0; i < its.size() - 1; ++i) + ranges.emplace_back(its[i], its[i+1]); + + VERIFY(ranges.size() <= nthreads); + + size_t counter = 0, n = 10; + while (!std::all_of(ranges.begin(), ranges.end(), + [](const EdgeRange &r) { return r.begin() == r.end(); })) { +# pragma omp parallel for num_threads(nthreads) reduction(+ : counter) + for (size_t i = 0; i < ranges.size(); ++i) + counter += FillBufferFromEdges(ranges[i], omp_get_thread_num()); + + this->DumpBuffers(out); + + if (counter >> n) { + INFO("Processed " << counter << " edges"); + n += 1; + } + } + + INFO("Used " << counter << " sequences."); + + this->ClearBuffers(); + + return out; +} + +template +class GraphPositionFillingIndexBuilder { +public: + typedef Index IndexT; + typedef typename Index::KMer Kmer; + + template + void BuildIndexFromGraph(Index &index, + const Graph/*T*/ &g, size_t read_buffer_size = 0) const { + unsigned nthreads = omp_get_max_threads(); + + DeBruijnGraphKMerSplitter> + splitter(index.workdir(), index.k(), g, read_buffer_size); + utils::KMerDiskCounter counter(index.workdir(), splitter); + BuildIndex(index, counter, 16, nthreads); + + // Now use the index to fill the coverage and EdgeId's + INFO("Collecting k-mer coverage information from graph, this takes a while."); + EdgeInfoUpdater updater(g, index); + updater.UpdateAll(); + } + +}; + +template struct Void { typedef void type; }; + +template +struct has_contains: std::false_type {}; + +template +struct has_contains< + T + , typename Void< + //decltype( std::declval().contains(typename T::KMerIdx(0), typename T::KMer()) ) + decltype( ((T*)(0))->contains(*((typename T::KeyWithHash*)(0))) ) + >::type +>: std::true_type {}; + +template +class CoverageFillingEdgeIndexBuilder : public Builder { + typedef Builder base; + public: + typedef typename Builder::IndexT IndexT; + typedef typename IndexT::KMer Kmer; + typedef typename IndexT::KMerIdx KmerIdx; + typedef typename IndexT::KeyWithHash KeyWithHash; + + private: + + + bool ContainsWrap(bool check_contains, IndexT& index, const KeyWithHash &kwh, 
std::true_type) const { + return !check_contains || index.contains(kwh); + } + + bool ContainsWrap(bool /*check_contains*/, IndexT&/* index*/, const KeyWithHash &/*kwh*/, std::false_type) const { + VERIFY(false); +// VERIFY(!check_contains); + return true; + } + + template + size_t FillCoverageFromStream(ReadStream &stream, + IndexT &index, bool check_contains) const { + unsigned k = index.k(); + size_t rl = 0; + + while (!stream.eof()) { + typename ReadStream::ReadT r; + stream >> r; + rl = std::max(rl, r.size()); + + const Sequence &seq = r.sequence(); + if (seq.size() < k) + continue; + + KeyWithHash kwh = index.ConstructKWH(seq.start(k) >> 'A'); + for (size_t j = k - 1; j < seq.size(); ++j) { + kwh <<= seq[j]; + //contains is not used since index might be still empty here + if (kwh.is_minimal() && index.valid(kwh) && ContainsWrap(check_contains, index, kwh, has_contains())) { +# pragma omp atomic + index.get_raw_value_reference(kwh).count += 1; + } + } + } + + return rl; + } + + public: + + template + size_t ParallelFillCoverage(IndexT &index, + Streams &streams, + bool check_contains = true) const { + INFO("Collecting k-mer coverage information from reads, this takes a while."); + unsigned nthreads = (unsigned) streams.size(); + size_t rl = 0; + streams.reset(); +#pragma omp parallel for num_threads(nthreads) shared(rl) + for (size_t i = 0; i < nthreads; ++i) { + size_t crl = FillCoverageFromStream(streams[i], index, check_contains); + + // There is no max reduction in C/C++ OpenMP... Only in FORTRAN :( +#pragma omp flush(rl) + if (crl > rl) +#pragma omp critical + { + rl = std::max(rl, crl); + } + } + + // Contigs have zero coverage! +#if 0 + if (contigs_stream) { + contigs_stream->reset(); + FillCoverageFromStream(*contigs_stream, index, check_contains); + } +#endif + +//todo if this verify is neede, put it outside +//#ifndef NDEBUG +// for (auto idx = index.kmer_idx_begin(), eidx = index.kmer_idx_end(); +// idx != eidx; ++idx) { +// +// Kmer k = index.kmer(idx); +// +// VERIFY(index[k].count == index[!k].count); +// } +//#endif + + return rl; + } + + template + size_t BuildIndexFromStream(IndexT &index, + Streams &streams, + io::SingleStream* contigs_stream = 0) const { + utils::BuildIndexFromStream(index, streams, contigs_stream); + + return ParallelFillCoverage(index, streams, false); + } + +// template +// size_t BuildIndexWithCoverageFromGraph( +// GraphT &graph, IndexT &index, +// Streams &streams, +// SingleReadStream* contigs_stream = 0) const { +// this->BuildIndexFromGraph(index, graph); +// +// return ParallelFillCoverage(index, streams, contigs_stream, true); +// } +}; + +template +struct EdgeIndexHelper { + typedef typename Index::KMer Kmer; + typedef typename Index::KMerIdx KMerIdx; + typedef typename Index::traits_t traits_t; + typedef CoverageFillingEdgeIndexBuilder CoverageFillingEdgeIndexBuilderT; + typedef GraphPositionFillingIndexBuilder GraphPositionFillingIndexBuilderT; + typedef CoverageFillingEdgeIndexBuilder CoverageAndGraphPositionFillingIndexBuilderT; +}; + +} diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/index/edge_info_updater.hpp spades-3.11.1+dfsg/src/common/assembly_graph/index/edge_info_updater.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/index/edge_info_updater.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/index/edge_info_updater.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -0,0 +1,109 @@ +//*************************************************************************** +//* Copyright (c) 2015 
Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "utils/standard_base.hpp" +#include "utils/parallel/openmp_wrapper.h" +#include "sequence/sequence.hpp" +#include "assembly_graph/core/graph_iterators.hpp" +#include "edge_position_index.hpp" + +namespace debruijn_graph { + +template +class EdgeInfoUpdater { + typedef typename Index::KMer Kmer; + typedef typename Graph::EdgeId EdgeId; + typedef typename Index::KeyWithHash KeyWithHash; + + const Graph &g_; + Index &index_; + +// void PutInIndex(const KeyWithHash &kwh, EdgeId id, size_t offset) { +// if (index_.valid(kwh)) { +// auto &entry = index_.get_raw_value_reference(kwh); +// if (!entry.valid() || index_.contains(kwh)) { +// index_.put_value(kwh, EdgeInfo(id, (unsigned)offset, entry.count)); +// } +// } +// } + + //todo why do we need to check equality???!!! + bool DeleteIfEqual(const KeyWithHash& kwh, EdgeId e) { + if (!index_.contains(kwh)) + return false; + if (index_.get_value(kwh).edge_id == e) { + index_.get_raw_value_reference(kwh).clear(); + return true; + } + return false; + } + + void UpdateKMers(const Sequence &nucls, EdgeId e) { + VERIFY(nucls.size() >= index_.k()); + KeyWithHash kwh = index_.ConstructKWH(Kmer(index_.k(), nucls)); + if (kwh.is_minimal()) + index_.PutInIndex(kwh, e, 0); + for (size_t i = index_.k(), n = nucls.size(); i < n; ++i) { + kwh <<= nucls[i]; + if (kwh.is_minimal()) + index_.PutInIndex(kwh, e, i - index_.k() + 1); + } + } + + void DeleteKMers(const Sequence &nucls, EdgeId e) { + VERIFY(nucls.size() >= index_.k()); + KeyWithHash kwh = index_.ConstructKWH(Kmer(index_.k(), nucls)); + DeleteIfEqual(kwh, e); + for (size_t i = index_.k(), n = nucls.size(); i < n; ++i) { + kwh <<= nucls[i]; + DeleteIfEqual(kwh, e); + } + } + + public: + /** + * Creates DataHashRenewer for specified graph and index + * @param g graph to be indexed + * @param index index to be synchronized with graph + */ + EdgeInfoUpdater(const Graph& g, Index& index) + : g_(g), + index_(index) { + } + + void UpdateKmers(EdgeId e) { + Sequence nucls = g_.EdgeNucls(e); + UpdateKMers(nucls, e); + } + + void DeleteKmers(EdgeId e) { + Sequence nucls = g_.EdgeNucls(e); + DeleteKMers(nucls, e); + } + + void UpdateAll() { + unsigned nthreads = omp_get_max_threads(); + + omnigraph::IterationHelper edges(g_); + auto iters = edges.Chunks(16 * nthreads); + + #pragma omp parallel for schedule(guided) + for (size_t i = 0; i < iters.size() - 1; ++i) { + TRACE("Processing chunk #" << i); + for (auto it = iters[i]; it != iters[i + 1]; ++it) { + UpdateKmers(*it); + } + } + } + + private: + DECL_LOGGER("EdgeInfoUpdater") +}; + +} diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/index/edge_multi_index.hpp spades-3.11.1+dfsg/src/common/assembly_graph/index/edge_multi_index.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/index/edge_multi_index.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/index/edge_multi_index.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -0,0 +1,155 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#pragma once + +#include "utils/ph_map/perfect_hash_map.hpp" +#include "edge_info_updater.hpp" +#include "edge_position_index.hpp" + +#include + +namespace debruijn_graph { + +template +class EdgeInfoStorage { +public: + typedef vector> Content; + typedef typename Content::iterator iterator; + typedef typename Content::const_iterator const_iterator; + Content content_; + folly::MicroSpinLock lock_; + + EdgeInfoStorage(const Content &content) : content_(content) { + lock_.init(); + } + + EdgeInfoStorage() { + lock_.init(); + } + + EdgeInfo &operator[](size_t i) { + return content_[i]; + } + + iterator begin() { + return content_.begin(); + } + + iterator end() { + return content_.end(); + } + + const_iterator begin() const { + return content_.cbegin(); + } + + const_iterator end() const { + return content_.cend(); + } + + iterator find(const EdgeInfo &info) { + return content_.find(info); + } + + const_iterator find(const EdgeInfo &info) const { + return content_.find(info); + } + + void push_back(const EdgeInfo &info) { + folly::MSLGuard g(lock_); + content_.push_back(info); + } + + template + void emplace_back(Args&&... args) { + folly::MSLGuard g(lock_); + content_.emplace_back(std::forward(args)...); + } + + size_t size() const{ + return content_.size(); + } + + bool valid() const { + //what's invalid edge info storage? + return true; + } + + EdgeInfoStorage conjugate(size_t k) const { + EdgeInfoStorage result; + for(auto it = content_.rbegin(); it != content_.rend(); ++it) { + result.push_back(it->conjugate(k)); + } + return result; + } +}; + +//todo it is not handling graph events!!! +template, class StoringType = utils::SimpleStoring > +class DeBruijnEdgeMultiIndex : public utils::KeyStoringMap, traits, StoringType > { + typedef utils::KeyStoringMap, traits, StoringType > base; + public: + typedef StoringType storing_type; + typedef typename base::traits_t traits_t; + typedef typename base::KMer KMer; + typedef typename base::KMerIdx KMerIdx; + typedef typename base::KeyWithHash KeyWithHash; + typedef EdgeInfoStorage Value; + + using base::ConstructKWH; +// typedef typename base::IdType IdType; + //todo move this typedef up in hierarchy (need some c++ tricks) + + DeBruijnEdgeMultiIndex(unsigned k, const std::string &workdir) + : base(k, workdir) { + INFO("Constructing multi-kmer index"); + } + + ~DeBruijnEdgeMultiIndex() {} + + + Value get(const KeyWithHash &kwh) const { + VERIFY(contains(kwh)); + return base::get_value(kwh); + } + + bool contains(const KeyWithHash &kwh) const { + if (!base::valid(kwh)) + return false; + return this->get_raw_value_reference(kwh).valid(); + } + + bool valid(const KMer &kmer) const { + KeyWithHash kwh = base::ConstructKWH(kmer); + return base::valid(kwh); + } + + void PutInIndex(const KeyWithHash &kwh, IdType id, size_t offset) { + if (!contains(kwh)) + return; + + EdgeInfoStorage &entry = this->get_raw_value_reference(kwh); + entry.emplace_back(id, (unsigned int)offset); + } + + const EdgeInfoStorage get(const KMer& kmer) const { + auto kwh = base::ConstructKWH(kmer); + auto entry = this->get_value(kwh); + return entry; + } + + //todo delete if equal seems to work improperly!!! 
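[Note on edge_multi_index.hpp above: EdgeInfoStorage guards its push_back/emplace_back with a folly::MicroSpinLock so that many threads can cheaply append k-mer occurrences to the same bucket. A rough standard-library-only equivalent of that pattern; SpinLock and LockedVector are illustrative names, not SPAdes code:]

    #include <atomic>
    #include <iostream>
    #include <thread>
    #include <vector>

    // Tiny test-and-set spin lock, standing in for folly::MicroSpinLock.
    class SpinLock {
        std::atomic<bool> busy_{false};
    public:
        void lock() { while (busy_.exchange(true, std::memory_order_acquire)) { /* spin */ } }
        void unlock() { busy_.store(false, std::memory_order_release); }
    };

    template<class T>
    class LockedVector {
        std::vector<T> content_;
        SpinLock lock_;
    public:
        void push_back(const T &t) {
            lock_.lock();              // critical section is a single append, as with MSLGuard
            content_.push_back(t);
            lock_.unlock();
        }
        size_t size() const { return content_.size(); }
    };

    int main() {
        LockedVector<int> v;
        std::thread a([&] { for (int i = 0; i < 1000; ++i) v.push_back(i); });
        std::thread b([&] { for (int i = 0; i < 1000; ++i) v.push_back(i); });
        a.join(); b.join();
        std::cout << v.size() << "\n"; // 2000
    }

[A spin lock rather than a mutex fits here because the protected region is only a few instructions long.]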
+ bool DeleteIfEqual(const KeyWithHash &, IdType) { + VERIFY(false); + return false; + } + +}; + +} diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/index/edge_position_index.hpp spades-3.11.1+dfsg/src/common/assembly_graph/index/edge_position_index.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/index/edge_position_index.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/index/edge_position_index.hpp 2017-09-28 09:05:08.000000000 +0000 @@ -0,0 +1,216 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "utils/ph_map/perfect_hash_map.hpp" +#include "io/reads/single_read.hpp" + +namespace debruijn_graph { + +template +struct EdgeInfo { + IdType edge_id; + unsigned offset; + unsigned count; + + EdgeInfo(IdType edge_id_ = IdType(), unsigned offset_ = unsigned(-1), unsigned count_ = 0) : + edge_id(edge_id_), offset(offset_), count(count_) { + VERIFY(edge_id != IdType() || clean()); + } + + template + EdgeInfo conjugate(const KWH &kwh) const { + return conjugate(kwh.key().size()); + } + + EdgeInfo conjugate(size_t k) const { + if(!valid()) { + return EdgeInfo(IdType(0), unsigned(-1), count); + } else { + return EdgeInfo(edge_id->conjugate(), (unsigned)edge_id->length(k) - offset, count); + } + } + + void clear() { + offset = unsigned(-1); + } + + bool clean() const { + return offset == unsigned(-1); + } + + void remove() { + offset = unsigned(-2); + } + + bool removed() const { + return offset == unsigned(-2); + } + + bool valid() const { + return !clean() && !removed(); + } +}; + +template +stream &operator<<(stream &s, const EdgeInfo &info) { + return s << "EdgeInfo[" << info.edge_id.int_id() << ", " << info.offset << ", " << info.count << "]"; +} + +template +class KmerFreeEdgeIndex : public utils::KeyIteratingMap, + utils::kmer_index_traits, StoringType> { + typedef utils::KeyIteratingMap, + utils::kmer_index_traits, StoringType> base; + const Graph &graph_; + +public: + typedef typename base::traits_t traits_t; + typedef StoringType storing_type; + typedef typename base::KMer KMer; + typedef typename base::KMerIdx KMerIdx; + typedef Graph GraphT; + typedef typename Graph::EdgeId IdType; + typedef typename base::KeyWithHash KeyWithHash; + typedef EdgeInfo KmerPos; + using base::valid; + using base::ConstructKWH; + +public: + + KmerFreeEdgeIndex(const Graph &graph, const std::string &workdir) + : base(unsigned(graph.k() + 1), workdir), graph_(graph) {} + + /** + * Shows if kmer has some entry associated with it + */ + bool contains(const KeyWithHash &kwh) const { + // Sanity check + if (!valid(kwh)) + return false; + + KmerPos entry = base::get_value(kwh); + if (!entry.valid()) + return false; + return graph_.EdgeNucls(entry.edge_id).contains(kwh.key(), entry.offset); + } + + void PutInIndex(KeyWithHash &kwh, IdType id, size_t offset) { + if (!valid(kwh)) + return; + + KmerPos &entry = this->get_raw_value_reference(kwh); + if (entry.removed()) { + //VERIFY(false); + return; + } + if (entry.clean()) { + //put verify on this conversion! 
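[Note on the PutInIndex logic in this file: it distinguishes three slot states through the offset field of EdgeInfo, defined earlier in edge_position_index.hpp: unsigned(-1) marks a clean, never-assigned slot, unsigned(-2) is a tombstone for a k-mer that turned out to be ambiguous, and anything else is a valid offset into an edge. A compact restatement of just that state machine; Slot is an illustrative name:]

    #include <cassert>

    struct Slot {
        unsigned offset = unsigned(-1);              // starts clean
        void remove()        { offset = unsigned(-2); }
        bool clean()   const { return offset == unsigned(-1); }
        bool removed() const { return offset == unsigned(-2); }
        bool valid()   const { return !clean() && !removed(); }
    };

    int main() {
        Slot s;
        assert(s.clean() && !s.valid());
        s.offset = 42;                               // k-mer now points at offset 42 of some edge
        assert(s.valid());
        s.remove();                                  // ambiguous k-mer: tombstoned, never reassigned
        assert(s.removed() && !s.valid());
    }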
+ this->put_value(kwh, KmerPos(id, (unsigned)offset, entry.count)); + } else if (contains(kwh)) { + //VERIFY(false); + entry.remove(); + } else { + //VERIFY(false); + //FIXME bad situation; some other kmer is there; think of putting verify + } + } + + //Only coverage is loaded + template + void BinWrite(Writer &writer) const { + this->index_ptr_->serialize(writer); + size_t sz = this->data_.size(); + writer.write((char*)&sz, sizeof(sz)); + for (size_t i = 0; i < sz; ++i) + writer.write((char*)&(this->data_[i].count), sizeof(this->data_[0].count)); + } + + template + void BinRead(Reader &reader, const std::string/* &FileName*/) { + this->clear(); + this->index_ptr_->deserialize(reader); + size_t sz = 0; + reader.read((char*)&sz, sizeof(sz)); + this->data_.resize(sz); + for (size_t i = 0; i < sz; ++i) + reader.read((char*)&(this->data_[i].count), sizeof(this->data_[0].count)); + } +}; + +template +class KmerStoringEdgeIndex : public utils::KeyStoringMap, + utils::kmer_index_traits, StoringType> { + typedef utils::KeyStoringMap, + utils::kmer_index_traits, StoringType> base; + +public: + typedef typename base::traits_t traits_t; + typedef StoringType storing_type; + typedef typename base::KMer KMer; + typedef typename base::KMerIdx KMerIdx; + typedef Graph GraphT; + typedef typename Graph::EdgeId IdType; + typedef typename base::KeyWithHash KeyWithHash; + typedef EdgeInfo KmerPos; + using base::valid; + using base::ConstructKWH; + + + KmerStoringEdgeIndex(const Graph& g, const std::string &workdir) + : base(unsigned(g.k() + 1), workdir) {} + + ~KmerStoringEdgeIndex() {} + + /** + * Shows if kmer has some entry associated with it + */ + bool contains(const KeyWithHash &kwh) const { + if (!base::valid(kwh)) + return false; + return this->get_raw_value_reference(kwh).valid(); + } + + template + void BinWrite(Writer &writer) const { + this->index_ptr_->serialize(writer); + size_t sz = this->data_.size(); + writer.write((char*)&sz, sizeof(sz)); + for (size_t i = 0; i < sz; ++i) + writer.write((char*)&(this->data_[i].count), sizeof(this->data_[0].count)); + this->BinWriteKmers(writer); + } + + template + void BinRead(Reader &reader, const std::string &FileName) { + this->clear(); + this->index_ptr_->deserialize(reader); + size_t sz = 0; + reader.read((char*)&sz, sizeof(sz)); + this->data_.resize(sz); + for (size_t i = 0; i < sz; ++i) + reader.read((char*)&(this->data_[i].count), sizeof(this->data_[0].count)); + this->BinReadKmers(reader, FileName); + } + + void PutInIndex(KeyWithHash &kwh, IdType id, size_t offset) { + //here valid already checks equality of query-kmer and stored-kmer sequences + if (base::valid(kwh)) { + KmerPos &entry = this->get_raw_value_reference(kwh); + if (entry.removed()) + return; + if (!entry.clean()) { + this->put_value(kwh, KmerPos(id, (unsigned)offset, entry.count)); + } else { + entry.remove(); + } + } + } +}; + +} diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_container.hpp spades-3.11.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_container.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_container.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_container.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -0,0 +1,210 @@ +// +// Created by andrey on 14.08.17. 
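[Note on the BinWrite/BinRead pair in this file: they persist the serialized perfect-hash index, then the element count, then the raw per-k-mer count fields one by one. A minimal file-based sketch of that count layout; WriteCounts/ReadCounts are illustrative, SPAdes goes through its own Writer/Reader abstractions:]

    #include <cstdint>
    #include <fstream>
    #include <iostream>
    #include <vector>

    void WriteCounts(std::ostream &os, const std::vector<uint32_t> &counts) {
        size_t sz = counts.size();
        os.write(reinterpret_cast<const char*>(&sz), sizeof(sz));   // element count first
        for (uint32_t c : counts)                                   // then raw count fields
            os.write(reinterpret_cast<const char*>(&c), sizeof(c));
    }

    std::vector<uint32_t> ReadCounts(std::istream &is) {
        size_t sz = 0;
        is.read(reinterpret_cast<char*>(&sz), sizeof(sz));
        std::vector<uint32_t> counts(sz);
        for (uint32_t &c : counts)
            is.read(reinterpret_cast<char*>(&c), sizeof(c));
        return counts;
    }

    int main() {
        { std::ofstream out("counts.bin", std::ios::binary); WriteCounts(out, {3, 0, 7}); }
        std::ifstream in("counts.bin", std::ios::binary);
        for (uint32_t c : ReadCounts(in)) std::cout << c << " ";    // 3 0 7
    }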
+// + +#pragma once + +#include "bidirectional_path.hpp" +#include "modules/path_extend/path_filter.hpp" +#include +#include +#include + +namespace path_extend { + +using namespace std; + +typedef std::pair PathPair; + +class PathComparator { +public: + bool operator()(const BidirectionalPath& p1, const BidirectionalPath& p2) const { + return p1.GetId() < p2.GetId(); + } + + bool operator()(const BidirectionalPath* p1, const BidirectionalPath* p2) const { + return p1->GetId() < p2->GetId(); + } +}; + +typedef set BidirectionalPathSet; + +template +using BidirectionalPathMap = map; + +typedef multiset BidirectionalPathMultiset; + +class PathContainer { +public: + + typedef vector PathContainerT; + + class Iterator : public PathContainerT::iterator { + public: + Iterator(const PathContainerT::iterator& iter) + : PathContainerT::iterator(iter) { + } + BidirectionalPath* get() const { + return this->operator *().first; + } + BidirectionalPath* getConjugate() const { + return this->operator *().second; + } + }; + + class ConstIterator : public PathContainerT::const_iterator { + public: + ConstIterator(const PathContainerT::const_iterator& iter) + : PathContainerT::const_iterator(iter) { + } + + ConstIterator(const PathContainer::Iterator& iter) + : PathContainerT::const_iterator(iter) { + } + + BidirectionalPath* get() const { + return this->operator *().first; + } + BidirectionalPath* getConjugate() const { + return this->operator *().second; + } + }; + + PathContainer() { + } + + + PathContainer(const PathContainer&) = delete; + PathContainer& operator=(const PathContainer&) = delete; + + PathContainer(PathContainer&&) = default; + PathContainer& operator=(PathContainer&&) = default; + + PathContainer(ConstIterator begin, ConstIterator end) { + DeleteAllPaths(); + for (ConstIterator it = begin; it != end; ++it) { + AddPair(new BidirectionalPath(*it.get()), new BidirectionalPath(*it.getConjugate())); + } + } + + BidirectionalPath& operator[](size_t index) const { + return *(data_[index].first); + } + + BidirectionalPath* Get(size_t index) const { + return data_[index].first; + } + + BidirectionalPath* GetConjugate(size_t index) const { + return data_[index].second; + } + + void Swap(size_t index) { + std::swap(data_[index].first, data_[index].second); + } + + void DeleteAllPaths() { + for (size_t i = 0; i < data_.size(); ++i) { + DeletePathPair(data_[i]); + } + clear(); + } + + ~PathContainer() { + DeleteAllPaths(); + } + + size_t size() const { + return data_.size(); + } + + void clear() { + data_.clear(); + } + + void reserve(size_t size) { + data_.reserve(size); + } + + bool AddPair(BidirectionalPath* p, BidirectionalPath* cp) { + p->SetConjPath(cp); + cp->SetConjPath(p); + p->Subscribe(cp); + cp->Subscribe(p); + data_.push_back(std::make_pair(p, cp)); + return true; + } + + void SortByLength(bool desc = true) { + std::stable_sort(data_.begin(), data_.end(), [=](const PathPair& p1, const PathPair& p2) { + if (p1.first->Empty() || p2.first->Empty() || p1.first->Length() != p2.first->Length()) { + return desc ? 
p1.first->Length() > p2.first->Length() + : p1.first->Length() < p2.first->Length(); + } + const Graph& g = p1.first->graph(); + return g.int_id(p1.first->Front()) < g.int_id(p2.first->Front()); + }); + } + + Iterator begin() { + return Iterator(data_.begin()); + } + + Iterator end() { + return Iterator(data_.end()); + } + + ConstIterator begin() const { + return ConstIterator(data_.begin()); + } + + ConstIterator end() const { + return ConstIterator(data_.end()); + } + + Iterator erase(Iterator iter) { + return Iterator(data_.erase(iter)); + } + + void print() const { + for (size_t i = 0; i < size(); ++i) { + Get(i)->PrintDEBUG(); + GetConjugate(i)->PrintDEBUG(); + } + } + + void FilterPaths(const func::TypedPredicate<const BidirectionalPath&>& pred) { + DEBUG("Removing empty paths"); + for (auto &pp : data_) { + if (pred(*pp.first)) { + VERIFY(pred(*pp.second)); //do we need it? + DeletePathPair(pp); + } + } + + const PathPair empty_pp(nullptr, nullptr); + data_.erase(std::remove(data_.begin(), data_.end(), empty_pp), data_.end()); + DEBUG("Empty paths removed"); + } + + void FilterEmptyPaths() { + FilterPaths(EmptyPathCondition()); + } + +private: + + void DeletePathPair(PathPair &pp) { + delete pp.first; + pp.first = nullptr; + delete pp.second; + pp.second = nullptr; + } + + vector<PathPair> data_; + +protected: + DECL_LOGGER("BidirectionalPath"); + +}; + +} \ No newline at end of file diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/paths/bidirectional_path.hpp spades-3.11.1+dfsg/src/common/assembly_graph/paths/bidirectional_path.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/paths/bidirectional_path.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/paths/bidirectional_path.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -14,6 +14,7 @@ #pragma once #include <atomic> +#include <limits> #include "assembly_graph/core/graph.hpp" #include "assembly_graph/components/connected_component.hpp" @@ -26,90 +27,121 @@ class BidirectionalPath; struct Gap { - int gap_; - uint32_t trash_previous_; - uint32_t trash_current_; - Gap(int gap) - : gap_(gap), trash_previous_(0), trash_current_(0) - { } + int gap; + uint32_t trash_previous; + uint32_t trash_current; - Gap(int gap, uint32_t trash_previous, uint32_t trash_current) - : gap_(gap), trash_previous_(trash_previous), trash_current_(trash_current) + static const int INVALID_GAP = std::numeric_limits<int>::min(); + + static const Gap& INVALID() { + static Gap gap = Gap(INVALID_GAP); + return gap; + } + + //gap is in k+1-mers and does not know about "trash" regions + explicit Gap(int gap_ = 0, uint32_t trash_previous_ = 0, uint32_t trash_current_ = 0) + : gap(gap_), trash_previous(trash_previous_), trash_current(trash_current_) { } + + Gap conjugate() const { + return Gap(gap, trash_current, trash_previous); + } + + bool operator==(const Gap &that) const { + return gap == that.gap && trash_previous == that.trash_previous && trash_current == that.trash_current; + } + + bool operator!=(const Gap &that) const { + return !(*this == that); + } + + int overlap(size_t k) const { + return int(k) - gap; + } + + int overlap_after_trim(size_t k) const { + return overlap(k) - trash_current - trash_previous; + } + + bool NoTrash() const { + return trash_current == 0 && trash_previous == 0; + } }; +inline std::ostream& operator<<(std::ostream& os, Gap gap) { + return os << "[" << gap.gap << ", " << gap.trash_previous << ", " << gap.trash_current << "]"; +} class PathListener { public: - virtual void FrontEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) = 0; - virtual
void BackEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) = 0; - virtual void FrontEdgeRemoved(EdgeId e, BidirectionalPath * path) = 0; - virtual void BackEdgeRemoved(EdgeId e, BidirectionalPath * path) = 0; - virtual ~PathListener() { - } + virtual void FrontEdgeAdded(EdgeId e, BidirectionalPath *path, const Gap &gap) = 0; + virtual void BackEdgeAdded(EdgeId e, BidirectionalPath *path, const Gap &gap) = 0; + virtual void FrontEdgeRemoved(EdgeId e, BidirectionalPath *path) = 0; + virtual void BackEdgeRemoved(EdgeId e, BidirectionalPath *path) = 0; + virtual ~PathListener() {} }; - class BidirectionalPath : public PathListener { -private: static std::atomic path_id_; + const Graph& g_; + std::deque data_; + BidirectionalPath* conj_path_; + // Length from beginning of i-th edge to path end: L(e_i + gap_(i+1) + e_(i+1) + ... + gap_N + e_N) + std::deque cumulative_len_; + std::deque gap_len_; // e0 -> gap1 -> e1 -> ... -> gapN -> eN; gap0 = 0 + std::vector listeners_; + const uint64_t id_; //Unique ID + float weight_; public: BidirectionalPath(const Graph& g) : g_(g), - data_(), - conj_path_(NULL), - cumulative_len_(), - gap_len_(), - listeners_(), + conj_path_(nullptr), id_(path_id_++), - weight_(1.0), - has_overlaped_begin_(false), - has_overlaped_end_(false), - overlap_(false) { + weight_(1.0) { } BidirectionalPath(const Graph& g, const std::vector& path) : BidirectionalPath(g) { + //TODO cumulative_len takes O(N^2) to fill for (size_t i = 0; i < path.size(); ++i) { PushBack(path[i]); } - RecountLengths(); } - BidirectionalPath(const Graph& g, EdgeId startingEdge) + BidirectionalPath(const Graph& g, EdgeId e) : BidirectionalPath(g) { - PushBack(startingEdge); + PushBack(e); } BidirectionalPath(const BidirectionalPath& path) : g_(path.g_), data_(path.data_), - conj_path_(NULL), + conj_path_(nullptr), cumulative_len_(path.cumulative_len_), gap_len_(path.gap_len_), listeners_(), id_(path_id_++), - weight_(path.weight_), - has_overlaped_begin_(path.has_overlaped_begin_), - has_overlaped_end_(path.has_overlaped_end_), - overlap_(path.overlap_) { + weight_(path.weight_) { + } + + const Graph &g() const{ + return g_; } -public: void Subscribe(PathListener * listener) { listeners_.push_back(listener); } - void Unsubscribe(PathListener * listener) { - for (auto it = listeners_.begin(); it != listeners_.end(); ++it) { - if (*it == listener) { - listeners_.erase(it); - break; - } - } - } +// void Unsubscribe(PathListener * listener) { +// for (auto it = listeners_.begin(); it != listeners_.end(); ++it) { +// if (*it == listener) { +// listeners_.erase(it); +// break; +// } +// } +// } void SetConjPath(BidirectionalPath* path) { conj_path_ = path; @@ -144,10 +176,11 @@ } size_t Length() const { - if (gap_len_.size() == 0 || cumulative_len_.size() == 0) { + if (Empty()) { return 0; } - return cumulative_len_[0] + gap_len_[0].gap_; + VERIFY(gap_len_[0].gap == 0); + return cumulative_len_[0]; } //TODO iterators forward/reverse @@ -159,30 +192,21 @@ return data_[index]; } - EdgeId ReverseAt(size_t index) const { - return data_[data_.size() - index - 1]; + int ShiftLength(size_t index) const { + return gap_len_[index].gap + (int) g_.length(At(index)); } - // Length from beginning of i-th edge to path end for forward directed path: L(e1 + e2 + ... 
+ eN) size_t LengthAt(size_t index) const { return cumulative_len_[index]; } - int GapAt(size_t index) const { - return gap_len_[index].gap_; - } - - const Gap& GapInfoAt(size_t index) const { + Gap GapAt(size_t index) const { return gap_len_[index]; } - uint32_t TrashCurrentAt(size_t index) const { - return gap_len_[index].trash_current_; - } - - uint32_t TrashPreviousAt(size_t index) const { - return gap_len_[index].trash_previous_; + void SetGapAt(size_t index, const Gap &gap) { + gap_len_[index] = gap; } size_t GetId() const { @@ -197,24 +221,21 @@ return data_.front(); } - void PushBack(EdgeId e, int gap = 0, uint32_t trash_previous = 0, uint32_t trash_current = 0) { - data_.push_back(e); - Gap gap_struct(gap, trash_previous, trash_current); - gap_len_.push_back(gap_struct); - IncreaseLengths(g_.length(e), gap_struct); - NotifyBackEdgeAdded(e, gap_struct); - } - - void PushBack(EdgeId e, Gap gap) { + void PushBack(EdgeId e, const Gap& gap = Gap()) { + VERIFY(!data_.empty() || gap == Gap()); data_.push_back(e); gap_len_.push_back(gap); - IncreaseLengths(g_.length(e), gap); + IncreaseLengths(g_.length(e), gap.gap); NotifyBackEdgeAdded(e, gap); } - void PushBack(const BidirectionalPath& path) { - for (size_t i = 0; i < path.Size(); ++i) { - PushBack(path.At(i), path.GapAt(i), path.TrashPreviousAt(i), path.TrashCurrentAt(i)); + void PushBack(const BidirectionalPath& path, const Gap& gap = Gap()) { + if (path.Size() > 0) { + VERIFY(path.GapAt(0) == Gap()); + PushBack(path.At(0), gap); + for (size_t i = 1; i < path.Size(); ++i) { + PushBack(path.At(i), path.GapAt(i)); + } } } @@ -241,25 +262,18 @@ } } - virtual void FrontEdgeAdded(EdgeId, BidirectionalPath*, int) { + void FrontEdgeAdded(EdgeId, BidirectionalPath*, const Gap&) override { + //FIXME is it ok to be empty? } - virtual void FrontEdgeAdded(EdgeId, BidirectionalPath*, Gap) { + void BackEdgeAdded(EdgeId e, BidirectionalPath*, const Gap& gap) override { + PushFront(g_.conjugate(e), gap.conjugate()); } - - virtual void BackEdgeAdded(EdgeId e, BidirectionalPath*, int gap) { - PushFront(g_.conjugate(e), gap); - } - - virtual void BackEdgeAdded(EdgeId e, BidirectionalPath*, Gap gap) { - PushFront(g_.conjugate(e), gap); + void FrontEdgeRemoved(EdgeId, BidirectionalPath*) override { } - virtual void FrontEdgeRemoved(EdgeId, BidirectionalPath*) { - } - - virtual void BackEdgeRemoved(EdgeId, BidirectionalPath *) { + void BackEdgeRemoved(EdgeId, BidirectionalPath *) override { PopFront(); } @@ -304,6 +318,7 @@ return result; } + //TODO is it ok not to compare gaps here? 
bool CompareFrom(size_t from, const BidirectionalPath& sample) const { if (from + sample.Size() > Size()) { return false; @@ -336,28 +351,6 @@ return 0; } - size_t OverlapEndSize(const BidirectionalPath* path2) const { - if (Size() == 0) { - return 0; - } - int last1 = (int) Size() - 1; - int max_over = 0; - vector begins2 = path2->FindAll(At(last1)); - for (size_t i = 0; i < begins2.size(); ++i) { - int begin2 = (int) begins2[i]; - int cur1 = last1; - while (begin2 > 0 && cur1 > 0 && path2->At(begin2 - 1) == At(cur1 - 1)) { - cur1--; - begin2--; - } - int over = last1 - cur1 + 1; - if (begin2 == 0 && cur1 > 0 && over > max_over) { - max_over = over; - } - } - return (size_t) max_over; - } - int FindFirst(const BidirectionalPath& path, size_t from = 0) const { if (path.Size() > Size()) { return -1; @@ -382,10 +375,6 @@ return -1; } - bool Contains(const BidirectionalPath& path) const { - return FindFirst(path) != -1; - } - bool Equal(const BidirectionalPath& path) const { return operator==(path); } @@ -398,91 +387,11 @@ return !operator==(path); } - void CheckConjugateEnd(size_t max_repeat_length) { - size_t prev_size = 0; - while (prev_size != Size()) { - prev_size = Size(); - FindConjEdges(max_repeat_length); - } - } - - size_t GetComponent(const debruijn_graph::ConnectedComponentCounter &component_counter) const { - std::unordered_map component_sizes; - for (size_t i = 0; i < this->Size(); i++) { - auto e = this->At(i); - size_t comp_id = component_counter.GetComponent(e); - if (component_sizes.find(comp_id) == component_sizes.end()) - component_sizes[comp_id] = 0; - component_sizes[comp_id] += g_.length(e); - } - size_t ans = 0; - size_t maxans = 0; - for (auto pp: component_sizes) { - if (pp.second > maxans) { - ans = pp.first; - maxans = pp.second; - } - } - return ans; - } - - void FindConjEdges(size_t max_repeat_length) { - for (size_t begin_pos = 0; begin_pos < Size(); ++begin_pos) { - size_t begin = begin_pos; - vector conj_pos = FindAll(g_.conjugate(At(begin_pos)), begin + 1); - for (auto end_pos = conj_pos.rbegin(); end_pos != conj_pos.rend(); ++end_pos) { - VERIFY(*end_pos < Size()); - size_t end = *end_pos; - if (end <= begin) { - continue; - } - while (begin < end && At(begin) == g_.conjugate(At(end))) { - begin++; - end--; - } - DEBUG("Found palindromic fragment from " << begin_pos << " to " << *end_pos); - Print(); - VERIFY(*end_pos < Size()); - size_t tail_size = Size() - *end_pos - 1; - size_t head_size = begin_pos; - size_t palindrom_half_size = begin - begin_pos; - size_t head_len = Length() - LengthAt(begin_pos); - size_t tail_len = *end_pos < Size() - 1 ? LengthAt(*end_pos + 1) : 0; -//TODO : this is not true in case of gaps inside the palindrom_len; - size_t palindrom_len = (size_t) max((int) LengthAt(begin_pos) - (int) LengthAt(begin), 0); - size_t between = (size_t) max(0, (int) LengthAt(begin) - (int) (end < Size() - 1 ? 
LengthAt(end + 1) : 0)); - DEBUG("tail len " << tail_len << " head len " << head_len << " palindrom_len "<< palindrom_len << " between " << between); - if (palindrom_len <= max_repeat_length) { - if (palindrom_len < head_len && palindrom_len < tail_len) { - DEBUG("too big head and end"); - continue; - } - if (between > palindrom_len) { - DEBUG("too big part between"); - continue; - } - } - bool delete_tail = tail_size < head_size; - if (tail_size == head_size) { - delete_tail = tail_len < head_len; - } - if (delete_tail) { - PopBack(tail_size + palindrom_half_size); - DEBUG("Deleting tail because of palindrom removal"); - return; - } else { - GetConjPath()->PopBack(head_size + palindrom_half_size); - DEBUG("Deleting head because of palindrom removal"); - return; - } - } - } - } - BidirectionalPath SubPath(size_t from, size_t to) const { + VERIFY(from <= to && to <= Size()); BidirectionalPath result(g_); - for (size_t i = from; i < min(to, Size()); ++i) { - result.PushBack(data_[i], gap_len_[i]); + for (size_t i = from; i < to; ++i) { + result.PushBack(data_[i], i == from ? Gap() : gap_len_[i]); } return result; } @@ -505,165 +414,80 @@ if (Empty()) { return result; } - result.PushBack(g_.conjugate(Back()), 0); + result.PushBack(g_.conjugate(Back())); for (int i = ((int) Size()) - 2; i >= 0; --i) { - result.PushBack(g_.conjugate(data_[i]), gap_len_[i + 1].gap_ + gap_len_[i + 1].trash_current_ - gap_len_[i + 1].trash_previous_, gap_len_[i + 1].trash_current_, gap_len_[i + 1].trash_previous_); + result.PushBack(g_.conjugate(data_[i]), gap_len_[i + 1].conjugate()); } return result; } + //FIXME remove vector ToVector() const { return vector(data_.begin(), data_.end()); } - bool CameToInterstrandBulge() const { - if (Empty()) - return false; - - EdgeId lastEdge = Back(); - VertexId lastVertex = g_.EdgeEnd(lastEdge); - - if (g_.OutgoingEdgeCount(lastVertex) == 2) { - vector bulgeEdges(g_.out_begin(lastVertex), g_.out_end(lastVertex)); - VertexId nextVertex = g_.EdgeEnd(bulgeEdges[0]); - - if (bulgeEdges[0] == g_.conjugate(bulgeEdges[1]) && nextVertex == g_.EdgeEnd(bulgeEdges[1]) && g_.CheckUniqueOutgoingEdge(nextVertex) - && *(g_.out_begin(nextVertex)) == g_.conjugate(lastEdge)) { - - DEBUG("Came to interstrand bulge " << g_.int_id(lastEdge)); - return true; - } - } - return false; - } - - bool IsInterstrandBulge() const { - if (Empty()) - return false; - - EdgeId lastEdge = Back(); - VertexId lastVertex = g_.EdgeEnd(lastEdge); - VertexId prevVertex = g_.EdgeStart(lastEdge); - - if (g_.OutgoingEdgeCount(prevVertex) == 2 && g_.IncomingEdgeCount(lastVertex) == 2 && g_.CheckUniqueOutgoingEdge(lastVertex) - && g_.CheckUniqueIncomingEdge(prevVertex) && *(g_.in_begin(prevVertex)) == g_.conjugate(*(g_.out_begin(lastVertex)))) { - - vector bulgeEdges(g_.out_begin(prevVertex), g_.out_end(prevVertex)); - EdgeId bulgeEdge = bulgeEdges[0] == lastEdge ? bulgeEdges[1] : bulgeEdges[0]; - - if (bulgeEdge == g_.conjugate(lastEdge)) { - DEBUG("In interstrand bulge " << g_.int_id(lastEdge)); - return true; - } - } - return false; - } - - void Print() const { - DEBUG("Path " << id_); - DEBUG("Length " << Length()); - DEBUG("Weight " << weight_); - DEBUG("#, edge, length, gap length, trash length, total length, total length from begin"); - for (size_t i = 0; i < Size(); ++i) { - DEBUG(i << ", " << g_.int_id(At(i)) << ", " - << g_.length(At(i)) << ", " << GapAt(i) << ", " - << TrashPreviousAt(i) << "-" << TrashCurrentAt(i) - << ", " << LengthAt(i) << ", " - << ((Length() < LengthAt(i)) ? 
0 : Length() - LengthAt(i))); + void PrintDEBUG() const { + for (const auto& s: PrintLines()) { + DEBUG(s); } } - void PrintInString() const { - stringstream str; - for (size_t i = 0; i < Size(); ++i) { - str << g_.int_id(At(i)) << " "; - } - DEBUG(str.str()); - } - void PrintInfo() const { - INFO("Path " << id_); - INFO("Length " << Length()); - INFO("Weight " << weight_); - INFO("#, edge, length, gap length, total length"); - for (size_t i = 0; i < Size(); ++i) { - INFO(i << ", " << g_.int_id(At(i)) << ", " << g_.length(At(i)) << ", " << GapAt(i) << ", " << LengthAt(i)); + void PrintINFO() const { + for (const auto& s: PrintLines()) { + INFO(s); } } - void Print(std::ostream& os) { + void Print(std::ostream &os) const { if (Empty()) { return; } - os << "Path " << GetId() << endl; - os << "Length " << Length() << endl; - os << "#, edge, length, gap, total length" << endl; + os << "Path " << GetId() << "\n"; + os << "Length " << Length() << "\n"; + os << "Weight " << weight_ << "\n"; + os << "#, edge (length), gap info, total length, total length from start" << "\n"; for (size_t i = 0; i < Size(); ++i) { - os << i << ", " << g_.int_id(At(i)) << ", " << g_.length(At(i)) << ", " << GapAt(i) << ", " << LengthAt(i) << endl; - } - } - - void SetOverlapedBeginTo(BidirectionalPath* to) { - if (has_overlaped_begin_) { - to->SetOverlapBegin(); - } - SetOverlapBegin(); - to->SetOverlapEnd(); - } - - void SetOverlapedEndTo(BidirectionalPath* to) { - if (has_overlaped_end_) { - to->SetOverlapEnd(); + os << i << ", " << g_.str(At(i)) + << ", " << GapAt(i) + << ", " << LengthAt(i) + << ", " << ((Length() < LengthAt(i)) ? 0 : Length() - LengthAt(i)) << "\n"; } - SetOverlapEnd(); - to->SetOverlapBegin(); } - void SetOverlap(bool overlap = true) { - overlap_ = overlap; - conj_path_->overlap_ = overlap; + std::string str() const { + stringstream ss; + Print(ss); + return ss.str(); } - bool HasOverlapedBegin() const { - return has_overlaped_begin_; + auto begin() const -> decltype(data_.begin()) { + return data_.begin(); } - bool HasOverlapedEnd() const { - return has_overlaped_end_; + auto end() const -> decltype(data_.end()) { + return data_.end(); } - bool IsOverlap() const { - return overlap_; - } - - void ResetOverlaps() { - overlap_ = false; - has_overlaped_begin_ = false; - has_overlaped_end_ = false; - conj_path_->overlap_ = false; - conj_path_->has_overlaped_begin_ = false; - conj_path_->has_overlaped_end_ = false; - } private: - void RecountLengths() { - cumulative_len_.clear(); - size_t currentLength = 0; - for (auto iter = data_.rbegin(); iter != data_.rend(); ++iter) { - currentLength += g_.length((EdgeId) *iter); - cumulative_len_.push_front(currentLength); - } + vector PrintLines() const { + auto as_str = str(); + boost::trim(as_str); + std::vector result; + boost::split(result, as_str, boost::is_any_of("\n"), boost::token_compress_on); + return result; } - void IncreaseLengths(size_t length, Gap gap_struct) { + void IncreaseLengths(size_t length, int gap) { for (auto iter = cumulative_len_.begin(); iter != cumulative_len_.end(); ++iter) { - *iter += length + gap_struct.gap_ - gap_struct.trash_previous_; + *iter += length + gap; } cumulative_len_.push_back(length); } void DecreaseLengths() { - size_t length = g_.length(data_.back()) + gap_len_.back().gap_ - gap_len_.back().trash_previous_; + size_t length = g_.length(data_.back()) + gap_len_.back().gap; for (auto iter = cumulative_len_.begin(); iter != cumulative_len_.end(); ++iter) { *iter -= length; @@ -671,13 +495,13 @@ 
cumulative_len_.pop_back(); } - void NotifyFrontEdgeAdded(EdgeId e, const Gap& gap) { + void NotifyFrontEdgeAdded(EdgeId e, Gap gap) { for (auto i = listeners_.begin(); i != listeners_.end(); ++i) { (*i)->FrontEdgeAdded(e, this, gap); } } - void NotifyBackEdgeAdded(EdgeId e, const Gap& gap) { + void NotifyBackEdgeAdded(EdgeId e, Gap gap) { for (auto i = listeners_.begin(); i != listeners_.end(); ++i) { (*i)->BackEdgeAdded(e, this, gap); } @@ -695,16 +519,11 @@ } } - void PushFront(EdgeId e, const Gap& gap) { - PushFront(e, gap.gap_ + gap.trash_current_ - gap.trash_previous_, gap.trash_current_, gap.trash_previous_); - } - - void PushFront(EdgeId e, int gap = 0, uint32_t trash_previous = 0, uint32_t trash_current = 0) { + void PushFront(EdgeId e, Gap gap) { data_.push_front(e); if (gap_len_.size() > 0) { - gap_len_[0].gap_ += gap; - gap_len_[0].trash_previous_ += trash_previous; - gap_len_[0].trash_current_ += trash_current; + VERIFY(gap_len_[0] == Gap()); + gap_len_[0]= gap; } gap_len_.push_front(Gap(0, 0, 0)); @@ -712,49 +531,23 @@ if (cumulative_len_.empty()) { cumulative_len_.push_front(length); } else { - cumulative_len_.push_front(length + cumulative_len_.front() + gap - trash_previous ); + cumulative_len_.push_front(cumulative_len_.front() + length + gap.gap); } - NotifyFrontEdgeAdded(e, Gap(gap, trash_previous, trash_current)); + NotifyFrontEdgeAdded(e, gap); } void PopFront() { EdgeId e = data_.front(); - if (gap_len_.size() > 1) { - gap_len_[1].gap_ = 0; - gap_len_[1].trash_previous_ = 0; - gap_len_[1].trash_current_ = 0; - } data_.pop_front(); gap_len_.pop_front(); - cumulative_len_.pop_front(); - NotifyFrontEdgeRemoved(e); - } - - void SetOverlapBegin(bool overlap = true) { - if (has_overlaped_begin_ != overlap) { - has_overlaped_begin_ = overlap; - } - if (GetConjPath()->has_overlaped_end_ != overlap) { - GetConjPath()->has_overlaped_end_ = overlap; + if (!gap_len_.empty()) { + gap_len_.front() = Gap(); } - } - void SetOverlapEnd(bool overlap = true) { - GetConjPath()->SetOverlapBegin(overlap); + NotifyFrontEdgeRemoved(e); } - const Graph& g_; - std::deque data_; - BidirectionalPath* conj_path_; - std::deque cumulative_len_; // Length from beginning of i-th edge to path end for forward directed path: L(e1 + e2 + ... + eN) ... L(eN) - std::deque gap_len_; // e1 - gap2 - e2 - ... - gapN - eN - std::vector listeners_; - const uint64_t id_; //Unique ID - float weight_; - bool has_overlaped_begin_; - bool has_overlaped_end_; - bool overlap_; DECL_LOGGER("BidirectionalPath"); }; @@ -794,8 +587,8 @@ inline size_t FirstNotEqualPosition(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) { int cur_pos1 = (int) pos1; int cur_pos2 = (int) pos2; - int gap1 = path1.GapAt(cur_pos1); - int gap2 = path2.GapAt(cur_pos2); + int gap1 = path1.GapAt(cur_pos1).gap; + int gap2 = path2.GapAt(cur_pos2).gap; while (cur_pos1 >= 0 && cur_pos2 >= 0) { if (path1.At(cur_pos1) == path2.At(cur_pos2)) { cur_pos1--; @@ -810,8 +603,8 @@ SkipGaps(path1, p1, gap1, path2, p2, gap2, use_gaps, false); cur_pos1 = (int) p1; cur_pos2 = (int) p2; - gap1 = path1.GapAt(cur_pos1); - gap2 = path2.GapAt(cur_pos2); + gap1 = path1.GapAt(cur_pos1).gap; + gap2 = path2.GapAt(cur_pos2).gap; } } DEBUG("Equal!!"); @@ -832,8 +625,8 @@ } else { return cur_pos1; } - int gap1 = cur_pos1 < path1.Size() ? path1.GapAt(cur_pos1) : 0; - int gap2 = cur_pos2 < path2.Size() ? path2.GapAt(cur_pos2) : 0; + int gap1 = cur_pos1 < path1.Size() ? 
path1.GapAt(cur_pos1).gap : 0; + int gap2 = cur_pos2 < path2.Size() ? path2.GapAt(cur_pos2).gap : 0; SkipGaps(path1, cur_pos1, gap1, path2, cur_pos2, gap2, use_gaps, true); } return -1UL; @@ -843,248 +636,19 @@ return LastNotEqualPosition(path1, pos1, path2, pos2, use_gaps) == -1UL; } -inline bool PathIdCompare(const BidirectionalPath* p1, const BidirectionalPath* p2) { - return p1->GetId() < p2->GetId(); -} - - - -typedef std::pair PathPair; - -inline bool compare_path_pairs(const PathPair& p1, const PathPair& p2) { - if (p1.first->Length() != p2.first->Length() || p1.first->Size() == 0 || p2.first->Size() == 0) { - return p1.first->Length() > p2.first->Length(); - } - const Graph& g = p1.first->graph(); - return g.int_id(p1.first->Front()) < g.int_id(p2.first->Front()); -} - -class PathComparator { -public: - bool operator()(const BidirectionalPath& p1, const BidirectionalPath& p2) const { - return p1.GetId() < p2.GetId(); - } - - bool operator()(const BidirectionalPath* p1, const BidirectionalPath* p2) const { - return p1->GetId() < p2->GetId(); - } -}; - -typedef set BidirectionalPathSet; - -template -using BidirectionalPathMap = map; - -typedef std::multiset BidirectionalPathMultiset; - -class PathContainer { - -public: - - typedef std::vector PathContainerT; - - class Iterator : public PathContainerT::iterator { - public: - Iterator(const PathContainerT::iterator& iter) - : PathContainerT::iterator(iter) { - } - BidirectionalPath* get() const { - return this->operator *().first; - } - BidirectionalPath* getConjugate() const { - return this->operator *().second; - } - }; - - class ConstIterator : public PathContainerT::const_iterator { - public: - ConstIterator(const PathContainerT::const_iterator& iter) - : PathContainerT::const_iterator(iter) { - } - BidirectionalPath* get() const { - return this->operator *().first; - } - BidirectionalPath* getConjugate() const { - return this->operator *().second; - } - }; - - PathContainer() { - } - - BidirectionalPath& operator[](size_t index) const { - return *(data_[index].first); - } - - BidirectionalPath* Get(size_t index) const { - return data_[index].first; - } - - BidirectionalPath* GetConjugate(size_t index) const { - return data_[index].second; - } - - void DeleteAllPaths() { - for (size_t i = 0; i < data_.size(); ++i) { - delete data_[i].first; - delete data_[i].second; - } - clear(); - } - - ~PathContainer() { - DeleteAllPaths(); - } - - size_t size() const { - return data_.size(); - } - - void clear() { - data_.clear(); - } - - void reserve(size_t size) { - data_.reserve(size); - } - - bool AddPair(BidirectionalPath* p, BidirectionalPath* cp) { - p->SetConjPath(cp); - cp->SetConjPath(p); - p->Subscribe(cp); - cp->Subscribe(p); - data_.push_back(std::make_pair(p, cp)); - return true; - } - - void SortByLength() { - std::stable_sort(data_.begin(), data_.end(), compare_path_pairs); - } - - Iterator begin() { - return Iterator(data_.begin()); - } - - Iterator end() { - return Iterator(data_.end()); - } - - - ConstIterator begin() const { - return ConstIterator(data_.begin()); - } - - ConstIterator end() const { - return ConstIterator(data_.end()); - } - - Iterator erase(Iterator iter) { - return Iterator(data_.erase(iter)); - } - - void print() const { - for (size_t i = 0; i < size(); ++i) { - Get(i)->Print(); - GetConjugate(i)->Print(); - } - } - - void FilterEmptyPaths() { - DEBUG ("try to delete empty paths"); - for (Iterator iter = begin(); iter != end();) { - if (iter.get()->Size() == 0) { - // FIXME: This is trash. 
PathContainer should own paths - delete iter.get(); - delete iter.getConjugate(); - iter = erase(iter); - } else { - ++iter; - } - } - DEBUG("empty paths are removed"); - } - - void FilterInterstandBulges() { - DEBUG ("Try to delete paths with interstand bulges"); - for (Iterator iter = begin(); iter != end(); ++iter) { - if (iter.get()->IsInterstrandBulge()) { - iter.get()->PopBack(); - } - if (iter.getConjugate()->IsInterstrandBulge()) { - iter.getConjugate()->PopBack(); - } - } - DEBUG("deleted paths with interstand bulges"); - } - -private: - std::vector data_; - -protected: - DECL_LOGGER("BidirectionalPath"); - -}; - -inline pair ComparePaths(size_t start_pos1, size_t start_pos2, const BidirectionalPath& path1, const BidirectionalPath& path2, - size_t max_diff) { - path1.Print(); - path2.Print(); - if (start_pos1 >= path1.Size() || start_pos2 >= path2.Size()) { - return make_pair(start_pos1, start_pos2); - } - const Graph& g = path1.graph(); - size_t cur_pos = start_pos1; - size_t last2 = start_pos2; - size_t last1 = cur_pos; - cur_pos++; - size_t diff_len = 0; - while (cur_pos < path1.Size()) { - if (diff_len > max_diff) { - return make_pair(last1, last2); - } - EdgeId e = path1[cur_pos]; - vector poses2 = path2.FindAll(e); - bool found = false; - for (size_t pos2 = 0; pos2 < poses2.size(); ++pos2) { - if (poses2[pos2] > last2) { - int diff = int(path2.LengthAt(last2)) - int(path2.LengthAt(poses2[pos2])) - int(g.length(path2.At(last2))) - path2.GapAt(poses2[pos2]); - if (std::abs(diff) > max_diff) { - break; - } - last2 = poses2[pos2]; - last1 = cur_pos; - DEBUG("found " << cur_pos); - found = true; - break; - } - } - if (!found) { - diff_len += g.length(e) + path1.GapAt(cur_pos); - DEBUG("not found " << cur_pos << " now diff len " << diff_len); - } else { - diff_len = 0; - } - cur_pos++; - } - return make_pair(last1, last2); -} - -inline void DeletePaths(BidirectionalPathSet& paths) { - for (auto i = paths.begin(); i != paths.end(); ++i) { - delete (*i); - } -} - -inline void DeletePaths(vector& paths) { - for (auto i = paths.begin(); i != paths.end(); ++i) { - delete (*i); - } -} +inline bool EndsWithInterstrandBulge(const BidirectionalPath &path) { + if (path.Empty()) + return false; -inline void DeleteMapWithPaths(map m) { - for (auto i = m.begin(); i != m.end(); ++i){ - delete i->second; - } + const Graph &g = path.g(); + EdgeId e = path.Back(); + VertexId v1 = g.EdgeStart(e); + VertexId v2 = g.EdgeEnd(e); + + return v2 == g.conjugate(v1) && + e != g.conjugate(e) && + g.OutgoingEdgeCount(v1) == 2 && + g.CheckUniqueIncomingEdge(v1); } } // path extend diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.cpp spades-3.11.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.cpp --- spades-3.10.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.cpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.cpp 2017-09-28 09:05:09.000000000 +0000 @@ -6,63 +6,36 @@ namespace path_extend { +void path_extend::ContigWriter::OutputPaths(const PathContainer &paths, const vector& writers) const { + ScaffoldStorage storage; -string path_extend::ContigWriter::ToFASTGPathFormat(const BidirectionalPath &path) const { - if (path.Empty()) - return ""; - string res = ids_.at(path.Front()).short_id_; - for (size_t i = 1; i < path.Size(); ++i) { - if (g_.EdgeEnd(path[i - 1]) != 
g_.EdgeStart(path[i])) { - res += ";\n" + ids_.at(path[i]).short_id_; - } - else { - res += "," + ids_.at(path[i]).short_id_; + ScaffoldSequenceMaker scaffold_maker(g_); + for (auto iter = paths.begin(); iter != paths.end(); ++iter) { + BidirectionalPath* path = iter.get(); + if (path->Length() <= 0) + continue; + string path_string = scaffold_maker.MakeSequence(*path); + if (path_string.length() >= g_.k()) { + storage.emplace_back(path_string, path); } } - return res; -} -void path_extend::ContigWriter::OutputPaths(const PathContainer &paths, - const string &filename_base, - bool write_fastg) const { - name_generator_->Preprocess(paths); - IOContigStorage storage(g_, constructor_, paths); + //sorting by length and coverage + std::sort(storage.begin(), storage.end(), [] (const ScaffoldInfo &a, const ScaffoldInfo &b) { + if (a.length() == b.length()) + return math::gr(a.coverage(), b.coverage()); + return a.length() > b.length(); + }); - INFO("Writing contigs to " << filename_base); - io::osequencestream_simple oss(filename_base + ".fasta"); - std::ofstream os_fastg; - if (write_fastg) - os_fastg.open((filename_base + ".paths").c_str()); - - size_t i = 0; - for (const auto& precontig : storage.Storage()) { - ++i; - std::string contig_id = name_generator_->MakeContigName(i, precontig); - oss.set_header(contig_id); - oss << precontig.sequence_; - - if (write_fastg) { - os_fastg << contig_id << endl; - os_fastg << ToFASTGPathFormat(*precontig.path_) << endl; - os_fastg << contig_id << "'" << endl; - os_fastg << ToFASTGPathFormat(*precontig.path_->GetConjPath()) << endl; - } + name_generator_->Preprocess(paths); + for (size_t i = 0; i < storage.size(); ++i) { + storage[i].name = name_generator_->MakeContigName(i+1, storage[i]); } - if (write_fastg) - os_fastg.close(); - DEBUG("Contigs written"); -} - - -void path_extend::PathInfoWriter::WritePaths(const PathContainer &paths, const string &filename) const { - std::ofstream oss(filename.c_str()); - - for (auto iter = paths.begin(); iter != paths.end(); ++iter) { - iter.get()->Print(oss); + for (auto& writer : writers) { + writer(storage); } - - oss.close(); + DEBUG("Contigs written"); } } \ No newline at end of file diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.hpp spades-3.11.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -7,54 +7,274 @@ #pragma once - #include "io_support.hpp" - namespace path_extend { -using namespace debruijn_graph; +template +using EdgeNamingF = std::function; +template +EdgeNamingF IdNamingF(const string &prefix = "") { + return [=](const Graph &g, EdgeId e) { + return io::MakeContigId(g.int_id(e), prefix); + }; +} -class ContigWriter { -protected: - DECL_LOGGER("PathExtendIO") +template +EdgeNamingF BasicNamingF(const string &prefix = "EDGE") { + return [=](const Graph &g, EdgeId e) { + return io::MakeContigId(g.int_id(e), g.length(e) + g.k(), g.coverage(e), prefix); + }; +} + +template +class CanonicalEdgeHelper { + const Graph &g_; + const EdgeNamingF naming_f_; + const string pos_orient_; + const string neg_orient_; +public: + + CanonicalEdgeHelper(const Graph &g, + EdgeNamingF naming_f = IdNamingF(), + const string& pos_orient = 
"+", + const string& neg_orient = "-") : + g_(g), naming_f_(naming_f), + pos_orient_(pos_orient), neg_orient_(neg_orient) { + } + + bool IsCanonical(EdgeId e) const { + return e <= g_.conjugate(e); + } + + EdgeId Canonical(EdgeId e) const { + return IsCanonical(e) ? e : g_.conjugate(e); + } + + std::string GetOrientation(EdgeId e) const { + return IsCanonical(e) ? pos_orient_ : neg_orient_; + } + + std::string EdgeOrientationString(EdgeId e, + const std::string &delim = "") const { + return naming_f_(g_, Canonical(e)) + delim + GetOrientation(e); + } + + std::string EdgeString(EdgeId e) const { + VERIFY(IsCanonical(e)); + return naming_f_(g_, e); + } +}; + +template +class FastgWriter { + typedef typename Graph::EdgeId EdgeId; + const Graph &graph_; + CanonicalEdgeHelper short_namer_; + CanonicalEdgeHelper extended_namer_; + + string ToPathString(const BidirectionalPath &path) const { + if (path.Empty()) + return ""; + string res = short_namer_.EdgeOrientationString(path.Front()); + for (size_t i = 1; i < path.Size(); ++i) { + if (graph_.EdgeEnd(path[i - 1]) != graph_.EdgeStart(path[i]) || path.GapAt(i).gap > 0) { + res += ";\n" + short_namer_.EdgeOrientationString(path[i]); + } else { + res += "," + short_namer_.EdgeOrientationString(path[i]); + } + } + return res; + } + + string FormHeader(const string &id, + const set& next_ids) { + std::stringstream ss; + ss << id; + if (next_ids.size() > 0) { + auto delim = ":"; + for (const auto &s : next_ids) { + ss << delim << s; + delim = ","; + } + } + ss << ";"; + return ss.str(); + } + +public: + + FastgWriter(const Graph &graph, + EdgeNamingF edge_naming_f = BasicNamingF()) + : graph_(graph), + short_namer_(graph_), + extended_namer_(graph_, edge_naming_f, "", "'") { + } + + void WriteSegmentsAndLinks(const string &fn) { + io::OutputSequenceStream os(fn); + for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) { + EdgeId e = *it; + set next; + for (EdgeId next_e : graph_.OutgoingEdges(graph_.EdgeEnd(e))) { + next.insert(extended_namer_.EdgeOrientationString(next_e)); + } + os << io::SingleRead(FormHeader(extended_namer_.EdgeOrientationString(e), next), + graph_.EdgeNucls(e).str()); + } + } + + void WritePaths(const ScaffoldStorage &scaffold_storage, const string &fn) const { + std::ofstream os(fn); + for (const auto& scaffold_info : scaffold_storage) { + os << scaffold_info.name << "\n"; + os << ToPathString(*scaffold_info.path) << "\n"; + os << scaffold_info.name << "'" << "\n"; + os << ToPathString(*scaffold_info.path->GetConjPath()) << "\n"; + } + } + +}; + +template +class GFAWriter { + typedef typename Graph::EdgeId EdgeId; + const Graph &graph_; + CanonicalEdgeHelper edge_namer_; + std::ostream &os_; + + void WriteSegment(const std::string& edge_id, const Sequence &seq, double cov) { + os_ << "S\t" << edge_id << "\t" + << seq.str() << "\t" + << "KC:i:" << size_t(math::round(cov)) << "\n"; + } + + void WriteSegments() { + for (auto it = graph_.ConstEdgeBegin(true); !it.IsEnd(); ++it) { + EdgeId e = *it; + WriteSegment(edge_namer_.EdgeString(e), graph_.EdgeNucls(e), + graph_.coverage(e) * double(graph_.length(e))); + } + } + + void WriteLink(EdgeId e1, EdgeId e2, + size_t overlap_size) { + os_ << "L\t" << edge_namer_.EdgeOrientationString(e1, "\t") << "\t" + << edge_namer_.EdgeOrientationString(e2, "\t") << "\t" + << overlap_size << "M\n"; + } + + void WriteLinks() { + //TODO switch to constant vertex iterator + for (auto it = graph_.SmartVertexBegin(/*canonical only*/true); !it.IsEnd(); ++it) { + VertexId v = *it; + for (auto 
inc_edge : graph_.IncomingEdges(v)) {
+                for (auto out_edge : graph_.OutgoingEdges(v)) {
+                    WriteLink(inc_edge, out_edge, graph_.k());
+                }
+            }
+        }
+    }
 
-protected:
+    void WritePath(const std::string& name, size_t segment_id, const vector<std::string> &edge_strs) {
+        os_ << "P" << "\t" ;
+        os_ << name << "_" << segment_id << "\t";
+        std::string delimiter = "";
+        for (const auto& e : edge_strs) {
+            os_ << delimiter << e;
+            delimiter = ",";
+        }
+        os_ << "\t*\n";
+//        delimiter = "";
+//        for (size_t i = 0; i < edge_strs.size() - 1; ++i) {
+//            os_ << delimiter << "*";
+//            delimiter = ",";
+//        }
+//        os_ << "\n";
+    }
+
+public:
+    GFAWriter(const Graph &graph, std::ostream &os,
+              EdgeNamingF<Graph> naming_f = IdNamingF<Graph>())
+            : graph_(graph),
+              edge_namer_(graph_, naming_f),
+              os_(os) {
+    }
+
+    void WriteSegmentsAndLinks() {
+        WriteSegments();
+        WriteLinks();
+    }
+
+    void WritePaths(const ScaffoldStorage &scaffold_storage) {
+        for (const auto& scaffold_info : scaffold_storage) {
+            const path_extend::BidirectionalPath &p = *scaffold_info.path;
+            if (p.Size() == 0) {
+                continue;
+            }
+            std::vector<std::string> segmented_path;
+            //size_t id = p.GetId();
+            size_t segment_id = 1;
+            for (size_t i = 0; i < p.Size() - 1; ++i) {
+                EdgeId e = p[i];
+                segmented_path.push_back(edge_namer_.EdgeOrientationString(e));
+                if (graph_.EdgeEnd(e) != graph_.EdgeStart(p[i+1]) || p.GapAt(i+1).gap > 0) {
+                    WritePath(scaffold_info.name, segment_id, segmented_path);
+                    segment_id++;
+                    segmented_path.clear();
+                }
+            }
+
+            segmented_path.push_back(edge_namer_.EdgeOrientationString(p.Back()));
+            WritePath(scaffold_info.name, segment_id, segmented_path);
+        }
+    }
+
+};
+
+typedef std::function<void (const ScaffoldStorage&)> PathsWriterT;
+
+class ContigWriter {
     const Graph& g_;
-    ContigConstructor &constructor_;
-    map ids_;
     shared_ptr<ContigNameGenerator> name_generator_;
 
-    string ToFASTGPathFormat(const BidirectionalPath &path) const;
+public:
+    static void WriteScaffolds(const ScaffoldStorage &scaffold_storage, const string &fn) {
+        io::OutputSequenceStream oss(fn);
+        std::ofstream os_fastg;
+
+        for (const auto& scaffold_info : scaffold_storage) {
+            TRACE("Scaffold " << scaffold_info.name << " originates from path " << scaffold_info.path->str());
+            oss << io::SingleRead(scaffold_info.name, scaffold_info.sequence);
+        }
+    }
+
+    static PathsWriterT BasicFastaWriter(const string &fn) {
+        return [=](const ScaffoldStorage& scaffold_storage) {
+            WriteScaffolds(scaffold_storage, fn);
+        };
+    }
 
-public:
     ContigWriter(const Graph& g,
-                 ContigConstructor &constructor,
-                 const ConnectedComponentCounter &c_counter,
                  shared_ptr<ContigNameGenerator> name_generator) :
             g_(g),
-            constructor_(constructor),
-            ids_(),
             name_generator_(name_generator) {
-        MakeContigIdMap(g_, ids_, c_counter, "NODE");
     }
 
-    void OutputPaths(const PathContainer &paths,
-                     const string &filename_base,
-                     bool write_fastg = true) const;
-
-};
-
+    void OutputPaths(const PathContainer &paths, const vector<PathsWriterT>& writers) const;
 
-class PathInfoWriter {
-protected:
-    DECL_LOGGER("PathExtendIO")
+    void OutputPaths(const PathContainer &paths, PathsWriterT writer) const {
+        OutputPaths(paths, vector<PathsWriterT>{writer});
+    }
 
-public:
+    void OutputPaths(const PathContainer &paths, const string &fn) const {
+        OutputPaths(paths, BasicFastaWriter(fn));
+    }
 
-    void WritePaths(const PathContainer &paths, const string &filename) const;
+private:
+    DECL_LOGGER("ContigWriter")
 };
 
 }
diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/io_support.cpp spades-3.11.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/io_support.cpp
--- 
spades-3.10.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/io_support.cpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/io_support.cpp 2017-09-28 09:05:09.000000000 +0000 @@ -84,73 +84,43 @@ } } -string path_extend::IOContigStorage::ToString(const BidirectionalPath &path) const { - stringstream ss; - if (path.IsInterstrandBulge() && path.Size() == 1) { - ss << constructor_.construct(path.Back()).first.substr(k_, g_.length(path.Back()) - k_); - return ss.str(); +std::string path_extend::ScaffoldSequenceMaker::MakeSequence(const BidirectionalPath &path) const { + TRACE("Forming sequence for path " << path.str()); + //TODO what is it and why is it here? + if (path.Size() == 1 && EndsWithInterstrandBulge(path)) { + TRACE("Interstrand bulge edge"); + return g_.EdgeNucls(path.Back()).Subseq(k_, g_.length(path.Back())).str(); } - if (!path.Empty()) { - ss << constructor_.construct(path[0]).first.substr(0, k_); - } + if (path.Empty()) + return ""; + std::string answer = g_.EdgeNucls(path[0]).Subseq(0, k_).str(); + VERIFY(path.GapAt(0) == Gap()); - size_t i = 0; - while (i < path.Size()) { - int gap = i == 0 ? 0 : path.GapAt(i); - if (gap > (int) k_) { - for (size_t j = 0; j < gap - k_; ++j) { - ss << "N"; - } - auto temp_str = constructor_.construct(path[i]).first; - if (i != path.Size() - 1) { - for (size_t j = 0; j < path.TrashPreviousAt(i + 1); ++j) { - temp_str.pop_back(); - if (temp_str.size() == 0) { - break; - } - } - } - ss << temp_str; - } - else { - int overlapLen = (int) k_ - gap; - if (overlapLen >= (int) g_.length(path[i]) + (int) k_) { - overlapLen -= (int) g_.length(path[i]) + (int) k_; - ++i; - //skipping overlapping edges - while (i < path.Size() && overlapLen >= (int) g_.length(path[i]) + path.GapAt(i)) { - overlapLen -= (int) g_.length(path[i]) + path.GapAt(i); - ++i; - } - if (i == path.Size()) { - break; - } - - overlapLen = overlapLen + (int) k_ - path.GapAt(i); - - if(overlapLen < 0) { - for (int j = 0; j < abs(overlapLen); ++j) { - ss << "N"; - } - overlapLen = 0; - } - } - auto temp_str = g_.EdgeNucls(path[i]).Subseq(overlapLen).str(); - if (i != path.Size() - 1) { - for (size_t j = 0; j < path.TrashPreviousAt(i + 1); ++j) { - temp_str.pop_back(); - if (temp_str.size() == 0) { - break; - } - } - } - ss << temp_str; + for (size_t i = 0; i < path.Size(); ++i) { + Gap gap = path.GapAt(i); + TRACE("Adding edge " << g_.str(path[i])); + TRACE("Gap " << gap); + + answer.erase((gap.trash_previous <= answer.length()) ? 
+ answer.length() - gap.trash_previous : 0); + + int overlap_after_trim = gap.overlap_after_trim(k_); + TRACE("Overlap after trim " << overlap_after_trim); + if (overlap_after_trim < 0) { + answer += std::string(abs(overlap_after_trim), 'N'); + overlap_after_trim = 0; } - ++i; + TRACE("Corrected overlap after trim " << overlap_after_trim); + + VERIFY(overlap_after_trim >= 0); + + answer += g_.EdgeNucls(path[i]).Subseq(gap.trash_current + overlap_after_trim).str(); } - return ss.str(); + TRACE("Sequence formed"); + + return answer; } void path_extend::ScaffoldBreaker::SplitPath(const BidirectionalPath &path, PathContainer &result) const { @@ -160,14 +130,14 @@ BidirectionalPath *p = new BidirectionalPath(path.graph(), path[i]); ++i; - while (i < path.Size() and path.GapAt(i) <= min_gap_) { - p->PushBack(path[i], path.GapAt(i), path.TrashPreviousAt(i), path.TrashCurrentAt(i)); + while (i < path.Size() && path.GapAt(i).gap <= min_gap_) { + p->PushBack(path[i], path.GapAt(i)); ++i; } if (i < path.Size()) { - DEBUG("split path " << i << " gap " << path.GapAt(i)); - p->Print(); + DEBUG("split path " << i << " gap " << path.GapAt(i).gap); + p->PrintDEBUG(); } BidirectionalPath *cp = new BidirectionalPath(p->Conjugate()); diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/io_support.hpp spades-3.11.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/io_support.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/io_support.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/paths/bidirectional_path_io/io_support.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -11,57 +11,35 @@ namespace path_extend { using namespace debruijn_graph; +struct ScaffoldInfo { + std::string sequence; + BidirectionalPath* path; + std::string name; -struct IOContig { - std::string sequence_; - BidirectionalPath* path_; + ScaffoldInfo(const std::string& sequence, BidirectionalPath* path) : + sequence(sequence), path(path) { } - IOContig(const std::string& sequence, BidirectionalPath* path) : - sequence_(sequence), path_(path) { } -}; + size_t length() const { + return sequence.length(); + } -struct IOContigGreater -{ - bool operator()(const IOContig &a, const IOContig &b) const { - if (a.sequence_.length() == b.sequence_.length()) - return math::gr(a.path_->Coverage(), b.path_->Coverage()); - return a.sequence_.length() > b.sequence_.length(); + double coverage() const { + return path->Coverage(); } }; -class IOContigStorage { -private: +typedef vector ScaffoldStorage; + +class ScaffoldSequenceMaker { const Graph &g_; - ContigConstructor &constructor_; - size_t k_; - vector storage_; - - string ToString(const BidirectionalPath& path) const; -public: - IOContigStorage(const Graph &g, ContigConstructor &constructor, const PathContainer &paths): - g_(g), - constructor_(constructor), - k_(g.k()), - storage_() { - - for (auto iter = paths.begin(); iter != paths.end(); ++iter) { - BidirectionalPath* path = iter.get(); - if (path->Length() <= 0) - continue; - string path_string = ToString(*path); - if (path_string.length() >= g.k()) { - storage_.emplace_back(path_string, path); - } - } - std::sort(storage_.begin(), storage_.end(), IOContigGreater()); + const size_t k_; +public: + ScaffoldSequenceMaker(const Graph& g) : g_(g), k_(g_.k()) { } - const vector& Storage() const { - return storage_; - } + string MakeSequence(const BidirectionalPath &scaffold) const; }; - //Finds common long edges in paths and joins them into //Based on 
disjoint set union class TranscriptToGeneJoiner { @@ -89,13 +67,11 @@ void Construct(const PathContainer &paths); }; - - class ContigNameGenerator { public: virtual void Preprocess(const PathContainer& paths) = 0; - virtual std::string MakeContigName(size_t index, const IOContig &precontig) = 0; + virtual std::string MakeContigName(size_t index, const ScaffoldInfo &scaffold_info) = 0; virtual ~ContigNameGenerator() { } @@ -105,8 +81,8 @@ public: void Preprocess(const PathContainer&) override {} - std::string MakeContigName(size_t index, const IOContig &precontig) override { - return io::MakeContigId(index, precontig.sequence_.length(), precontig.path_->Coverage()); + std::string MakeContigName(size_t index, const ScaffoldInfo &scaffold_info) override { + return io::MakeContigId(index, scaffold_info.length(), scaffold_info.coverage()); } }; @@ -118,10 +94,9 @@ void Preprocess(const PathContainer&) override {} - std::string MakeContigName(size_t index, const IOContig &precontig) override { - EdgeId e = precontig.path_->At(0); - size_t component = c_counter_.GetComponent(e); - return io::MakeContigComponentId(index, precontig.sequence_.length(), precontig.path_->Coverage(), component); + std::string MakeContigName(size_t index, const ScaffoldInfo &scaffold_info) override { + return io::AddComponentId(io::MakeContigId(index, scaffold_info.length(), scaffold_info.coverage()), + c_counter_.GetComponent(scaffold_info.path->Front())); } }; @@ -145,8 +120,8 @@ transcript_joiner_.Construct(paths); } - std::string MakeContigName(size_t index, const IOContig &precontig) override { - size_t id = transcript_joiner_.GetPathId(precontig.path_); + std::string MakeContigName(size_t index, const ScaffoldInfo &scaffold_info) override { + size_t id = transcript_joiner_.GetPathId(scaffold_info.path); size_t parent_id = transcript_joiner_.FindTree(id); DEBUG("Path " << id << " Parent " << parent_id); if (gene_ids_.find(parent_id) == gene_ids_.end()) { @@ -154,7 +129,11 @@ isoform_num_[parent_id] = 0; gene_num_++; } - string contig_id = io::MakeRNAContigId(index, precontig.sequence_.length(), precontig.path_->Coverage(), gene_ids_[parent_id], isoform_num_[parent_id]); + string contig_id = io::MakeRNAContigId(index, + scaffold_info.length(), + scaffold_info.coverage(), + gene_ids_[parent_id], + isoform_num_[parent_id]); isoform_num_[parent_id]++; return contig_id; } diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/paths/mapping_path.hpp spades-3.11.1+dfsg/src/common/assembly_graph/paths/mapping_path.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/paths/mapping_path.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/paths/mapping_path.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -8,7 +8,8 @@ #pragma once #include "sequence/sequence.hpp" -#include "utils/range.hpp" +#include "sequence/range.hpp" +#include namespace omnigraph { @@ -167,6 +168,62 @@ : edges_(edges), range_mappings_(range_mappings) {} + MappingPath(const std::vector>& edge_mappings) { + edges_.reserve(edge_mappings.size()); + range_mappings_.reserve(edge_mappings.size()); + for (const auto &em : edge_mappings) { + edges_.push_back(em.first); + range_mappings_.push_back(em.second); + } + } + + class mapping_path_iter : public boost::iterator_facade, + boost::random_access_traversal_tag, + std::pair> { + friend class boost::iterator_core_access; + + const MappingPath &mapping_path_; + size_t pos_; + + std::pair dereference() const { + return mapping_path_[pos_]; + }; + + bool equal(const mapping_path_iter 
&that) const {
+            return &mapping_path_ == &that.mapping_path_ && pos_ == that.pos_;
+        }
+
+        ptrdiff_t distance_to(const mapping_path_iter &that) const {
+            return that.pos_ - pos_;
+        }
+
+        void advance(ptrdiff_t n) {
+            pos_ += n;
+        }
+
+        void increment() {
+            advance(1);
+        }
+
+        void decrement() {
+            advance(-1);
+        }
+
+    public:
+        mapping_path_iter(const MappingPath &mapping_path, size_t pos) :
+                mapping_path_(mapping_path),
+                pos_(pos) {}
+    };
+
+    mapping_path_iter begin() const {
+        return mapping_path_iter(*this, 0);
+    }
+
+    mapping_path_iter end() const {
+        return mapping_path_iter(*this, size());
+    };
+
     size_t size() const { return edges_.size(); }
 
     size_t empty() const { return edges_.empty(); }
@@ -180,6 +237,7 @@
     };
 
     std::pair<ElementId, MappingRange> operator[](size_t idx) const {
+        VERIFY(idx < size());
         return std::make_pair(edges_[idx], range_mappings_[idx]);
     }
 
@@ -240,62 +298,157 @@
 }
 
 template<class Graph>
-struct GapDescription {
+class GapDescription {
     typedef typename Graph::EdgeId EdgeId;
-    EdgeId start, end;
-    Sequence gap_seq;
-    //FIXME discuss using size_t
-    size_t edge_gap_start_position, edge_gap_end_position;
-
-    GapDescription() :
-            start(0),
-            end(0),
-            edge_gap_start_position(0),
-            edge_gap_end_position(0) {
-    }
-
-    GapDescription(EdgeId start_e, EdgeId end_e,
-                   const Sequence &gap,
-                   size_t gap_start, size_t gap_end) :
-            start(start_e),
-            end(end_e),
-            gap_seq(gap.str()),
-            edge_gap_start_position(gap_start),
-            edge_gap_end_position(gap_end) {
+
+    //Edges on the left and on the right of the gap
+    EdgeId left_;
+    EdgeId right_;
+
+    //Estimated nucleotide gap/overlap between the edges !after trimming! (see further).
+    // Negative values indicate the overlap between edges.
+    // Should be non-negative for proper final joinings.
+    int estimated_dist_;
+
+    //Number of nucleotides to trim from the (end of the left)/(beginning of the right) edge
+    size_t left_trim_;
+    size_t right_trim_;
+
+    //Optional "filling" sequence, giving "additional" nucleotides which
+    // should be added while closing the gap.
+    // Length guaranteed to be equal to estimated_gap (if present). 
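// ---------------------------------------------------------------------------
// Illustration (not part of the upstream patch): how the GapDescription
// fields declared above interact, assuming g is a debruijn_graph::Graph and
// e1/e2 are its edges; all concrete values below are hypothetical.
//
//     // A 15 bp overlap (negative distance), with 2 bp / 3 bp left to trim:
//     GapDescription<Graph> overlap_gap(e1, e2, /*estimated_dist*/ -15,
//                                       /*left_trim*/ 2, /*right_trim*/ 3);
//
//     // When a filling sequence is supplied, estimated_dist() is defined
//     // to be exactly its length:
//     GapDescription<Graph> filled_gap(e1, e2, Sequence("ACGTACGT"));
//     VERIFY(filled_gap.has_filling() && filled_gap.estimated_dist() == 8);
//
//     // conjugate() re-describes the same junction on the reverse strand:
//     // the edges swap (and are conjugated), and so do the two trims.
//     GapDescription<Graph> rc = overlap_gap.conjugate(g);
//     VERIFY(rc.left() == g.conjugate(e2) && rc.right() == g.conjugate(e1));
//     VERIFY(rc.left_trim() == 3 && rc.right_trim() == 2);
// ---------------------------------------------------------------------------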
+ boost::optional filling_seq_; + + GapDescription(EdgeId left, EdgeId right, + int estimated_dist, + size_t left_trim, size_t right_trim, + boost::optional filling_seq) : + left_(left), + right_(right), + estimated_dist_(estimated_dist), + left_trim_(left_trim), + right_trim_(right_trim), + filling_seq_(filling_seq) { + } + + auto AsTuple() const -> + decltype(std::make_tuple(left_, right_, left_trim_, right_trim_, estimated_dist_, filling_seq_)) { + return std::make_tuple(left_, right_, left_trim_, right_trim_, estimated_dist_, filling_seq_); + } + +public: + static const int INVALID_GAP = std::numeric_limits::min(); + + GapDescription(EdgeId left, EdgeId right, + int estimated_dist, + size_t left_trim = 0, size_t right_trim = 0) : + GapDescription(left, right, + estimated_dist, + left_trim, right_trim, + boost::none) { + } + + GapDescription() : GapDescription(EdgeId(0), EdgeId(0), INVALID_GAP) { + } + + GapDescription(EdgeId left, EdgeId right, + const Sequence &filling_seq, + size_t left_trim = 0, size_t right_trim = 0) : + left_(left), + right_(right), + estimated_dist_(int(filling_seq.size())), + left_trim_(left_trim), + right_trim_(right_trim), + filling_seq_(filling_seq) { + } + + EdgeId left() const { + return left_; + } + + EdgeId right() const { + return right_; + } + + size_t left_trim() const { + return left_trim_; + } + + size_t right_trim() const { + return right_trim_; + } + + bool no_trim() const { + return left_trim_ == 0 && right_trim() == 0; + } + + int estimated_dist() const { + return estimated_dist_; + } + + bool has_filling() const { + return filling_seq_; + } + + Sequence filling_seq() const { + return *filling_seq_; + } + + void set_left(EdgeId e) { + left_ = e; + } + + void set_right(EdgeId e) { + right_ = e; + } + + void set_left_trim(size_t trim) { + left_trim_ = trim; + } + + void set_estimated_dist(int dist) { + VERIFY_MSG(!filling_seq_, "Filling sequence specified"); + estimated_dist_ = dist; + } + + void set_filling_seq(Sequence fill_seq) { + estimated_dist_ = fill_seq.size(); + filling_seq_ = boost::make_optional(fill_seq); } GapDescription conjugate(const Graph &g) const { - GapDescription res( - g.conjugate(end), g.conjugate(start), !gap_seq, - g.length(end) - edge_gap_end_position, - g.length(start) - edge_gap_start_position); + GapDescription res(g.conjugate(right_), + g.conjugate(left_), + estimated_dist_, + right_trim_, + left_trim_, + filling_seq_ ? boost::make_optional(!*filling_seq_) : boost::none); return res; } string str(const Graph &g) const { stringstream s; - s << g.int_id(start) << " " << edge_gap_start_position << endl - << g.int_id(end) << " " << edge_gap_end_position << endl - << gap_seq.str() << endl; + s << "left: " << g.int_id(left_) + << "; right: " << g.int_id(right_) + << "; estimated distance : " << estimated_dist_ + << "; left trim: " << left_trim_ + << "; right trim: " << right_trim_ + << "; sequence " << (filling_seq_ ? 
filling_seq_->str() : "no_sequence") << endl; return s.str(); } - bool operator<(const GapDescription &b) const { - return start < b.start || - (start == b.start && end < b.end) || - (start == b.start && end == b.end && - edge_gap_start_position < b.edge_gap_start_position); + bool operator<(const GapDescription &rhs) const { + return AsTuple() < rhs.AsTuple(); } bool operator!=(const GapDescription rhs) const { - return start != rhs.start - || end != rhs.end - || gap_seq != rhs.gap_seq - || edge_gap_start_position != rhs.edge_gap_start_position - || edge_gap_end_position != rhs.edge_gap_end_position; + return AsTuple() != rhs.AsTuple(); } -}; + bool operator==(const GapDescription rhs) const { + return !(*this != rhs); + } +}; } diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/paths/path_processor.hpp spades-3.11.1+dfsg/src/common/assembly_graph/paths/path_processor.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/paths/path_processor.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/paths/path_processor.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -8,7 +8,7 @@ #pragma once #include "utils/standard_base.hpp" -#include "common/adt/bag.hpp" +#include "adt/bag.hpp" #include "assembly_graph/dijkstra/dijkstra_helper.hpp" namespace omnigraph { @@ -65,7 +65,7 @@ size_t curr_depth_; size_t call_cnt_; Path reversed_edge_path_; - bag vertex_cnts_; + adt::bag vertex_cnts_; const Graph& g_; const DijkstraT& dijkstra_; @@ -173,10 +173,12 @@ public: - PathProcessor(const Graph& g, VertexId start, size_t length_bound) : + PathProcessor(const Graph& g, VertexId start, size_t length_bound, + size_t dijkstra_vertex_limit = MAX_DIJKSTRA_VERTICES) : g_(g), start_(start), - dijkstra_(DijkstraHelper::CreateBoundedDijkstra(g, length_bound, MAX_DIJKSTRA_VERTICES)) { + dijkstra_(DijkstraHelper::CreateBoundedDijkstra(g, length_bound, + dijkstra_vertex_limit)) { TRACE("Dijkstra launched"); dijkstra_.Run(start); TRACE("Dijkstra finished"); @@ -184,7 +186,9 @@ // dfs from the end vertices // 3 two mistakes, 2 bad dijkstra, 1 some bad dfs, 0 = okay - int Process(VertexId end, size_t min_len, size_t max_len, Callback& callback, size_t edge_depth_bound = -1ul) const { + int Process(VertexId end, size_t min_len, size_t max_len, + Callback& callback, + size_t edge_depth_bound = std::numeric_limits::max()) const { TRACE("Process launched"); int error_code = 0; @@ -219,7 +223,8 @@ template int ProcessPaths(const Graph& g, size_t min_len, size_t max_len, typename Graph::VertexId start, typename Graph::VertexId end, - typename PathProcessor::Callback& callback, size_t max_edge_cnt = -1ul) { + typename PathProcessor::Callback& callback, + size_t max_edge_cnt = std::numeric_limits::max()) { PathProcessor processor(g, start, max_len); return processor.Process(end, min_len, max_len, callback, max_edge_cnt); } diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/stats/picture_dump.hpp spades-3.11.1+dfsg/src/common/assembly_graph/stats/picture_dump.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/stats/picture_dump.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/stats/picture_dump.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -26,7 +26,7 @@ #include "io/reads/wrapper_collection.hpp" #include "io/reads/osequencestream.hpp" #include "io/dataset_support/dataset_readers.hpp" -#include "utils/copy_file.hpp" +#include "utils/filesystem/copy_file.hpp" #include @@ -63,11 +63,11 @@ template void CollectContigPositions(graph_pack &gp) { if 
(!cfg::get().pos.contigs_for_threading.empty() && - path::FileExists(cfg::get().pos.contigs_for_threading)) + fs::FileExists(cfg::get().pos.contigs_for_threading)) visualization::position_filler::FillPos(gp, cfg::get().pos.contigs_for_threading, "thr_", true); if (!cfg::get().pos.contigs_to_analyze.empty() && - path::FileExists(cfg::get().pos.contigs_to_analyze)) + fs::FileExists(cfg::get().pos.contigs_to_analyze)) visualization::position_filler::FillPos(gp, cfg::get().pos.contigs_to_analyze, "anlz_", true); } @@ -263,6 +263,17 @@ private: + template + std::string ToString(const T& t, size_t length) { + std::ostringstream ss; + ss << t; + std::string result = ss.str(); + while (result.size() < length) + result = "0" + result; + return result; + } + + void ProduceDetailedInfo(const string &pos_name, config::info_printer_pos pos) { using namespace visualization; @@ -281,9 +292,9 @@ } if (config.save_graph_pack) { - string saves_folder = path::append_path(path::append_path(folder_, "saves/"), + string saves_folder = fs::append_path(fs::append_path(folder_, "saves/"), ToString(call_cnt++, 2) + "_" + pos_name + "/"); - path::make_dirs(saves_folder); + fs::make_dirs(saves_folder); graphio::ConjugateDataPrinter printer(gp_.g); graphio::PrintGraphPack(saves_folder + "graph_pack", printer, gp_); //TODO: separate @@ -291,9 +302,9 @@ } if (config.save_all) { - string saves_folder = path::append_path(path::append_path(folder_, "saves/"), + string saves_folder = fs::append_path(fs::append_path(folder_, "saves/"), ToString(call_cnt++, 2) + "_" + pos_name); - path::make_dirs(saves_folder); + fs::make_dirs(saves_folder); string p = saves_folder + "/saves"; INFO("Saving current state to " << p); @@ -302,17 +313,17 @@ } if (config.save_full_graph) { - string saves_folder = path::append_path(path::append_path(folder_, "saves/"), + string saves_folder = fs::append_path(fs::append_path(folder_, "saves/"), ToString(call_cnt++, 2) + "_" + pos_name + "/"); - path::make_dirs(saves_folder); + fs::make_dirs(saves_folder); graphio::ConjugateDataPrinter printer(gp_.g); graphio::PrintBasicGraph(saves_folder + "graph", printer); } if (config.lib_info) { - string saves_folder = path::append_path(path::append_path(folder_, "saves/"), + string saves_folder = fs::append_path(fs::append_path(folder_, "saves/"), ToString(call_cnt++, 2) + "_" + pos_name + "/"); - path::make_dirs(saves_folder); + fs::make_dirs(saves_folder); config::write_lib_data(saves_folder + "lib_info"); } @@ -333,9 +344,9 @@ } VERIFY(cfg::get().developer_mode); - string pics_folder = path::append_path(path::append_path(folder_, "pictures/"), + string pics_folder = fs::append_path(fs::append_path(folder_, "pictures/"), ToString(call_cnt++, 2) + "_" + pos_name + "/"); - path::make_dirs(pics_folder); + fs::make_dirs(pics_folder); PrepareForDrawing(gp_); auto path1 = FindGenomeMappingPath(gp_.genome.GetSequence(), gp_.g, gp_.index, @@ -365,10 +376,10 @@ } if (!config.components_for_kmer.empty()) { - string kmer_folder = path::append_path(pics_folder, "kmer_loc/"); + string kmer_folder = fs::append_path(pics_folder, "kmer_loc/"); make_dir(kmer_folder); auto kmer = RtSeq(gp_.k_value + 1, config.components_for_kmer.substr(0, gp_.k_value + 1).c_str()); - string file_name = path::append_path(kmer_folder, pos_name + ".dot"); + string file_name = fs::append_path(kmer_folder, pos_name + ".dot"); WriteKmerComponent(gp_, kmer, file_name, colorer, labeler_); } @@ -385,7 +396,7 @@ } if (!config.components_for_genome_pos.empty()) { - string pos_loc_folder = 
path::append_path(pics_folder, "pos_loc/"); + string pos_loc_folder = fs::append_path(pics_folder, "pos_loc/"); make_dir(pos_loc_folder); vector positions; boost::split(positions, config.components_for_genome_pos, @@ -394,9 +405,9 @@ boost::optional close_kp1mer = FindCloseKP1mer(gp_, std::stoi(*it), gp_.k_value); if (close_kp1mer) { - string locality_folder = path::append_path(pos_loc_folder, *it + "/"); + string locality_folder = fs::append_path(pos_loc_folder, *it + "/"); make_dir(locality_folder); - WriteKmerComponent(gp_, *close_kp1mer, path::append_path(locality_folder, pos_name + ".dot"), colorer, labeler_); + WriteKmerComponent(gp_, *close_kp1mer, fs::append_path(locality_folder, pos_name + ".dot"), colorer, labeler_); } else { WARN( "Failed to find genome kp1mer close to the one at position " diff -Nru spades-3.10.1+dfsg/src/common/assembly_graph/stats/statistics.hpp spades-3.11.1+dfsg/src/common/assembly_graph/stats/statistics.hpp --- spades-3.10.1+dfsg/src/common/assembly_graph/stats/statistics.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/assembly_graph/stats/statistics.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -7,7 +7,7 @@ #pragma once -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" #include "math/xmath.h" #include "pipeline/config_struct.hpp" #include "assembly_graph/paths/mapping_path.hpp" @@ -148,10 +148,13 @@ } } if (edge_count > 0) { + size_t total_genome_size = 0; + for (const auto &chr: cfg::get().ds.reference_genome) + total_genome_size += 2*chr.size(); INFO("Error edges count: " << black_count << " which is " << 100.0 * (double) black_count / (double) edge_count << "% of all edges"); INFO("Total length of all black edges: " << sum_length << ". While double genome length is " << - (2 * cfg::get().ds.reference_genome.size())); + total_genome_size); } else { INFO("Error edges count: " << black_count << " which is 0% of all edges"); } diff -Nru spades-3.10.1+dfsg/src/common/CMakeLists.txt spades-3.11.1+dfsg/src/common/CMakeLists.txt --- spades-3.10.1+dfsg/src/common/CMakeLists.txt 2017-02-28 14:55:35.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/CMakeLists.txt 2017-09-28 09:05:08.000000000 +0000 @@ -11,12 +11,13 @@ add_subdirectory(assembly_graph) add_subdirectory(modules/path_extend) add_subdirectory(modules) +add_subdirectory(paired_info) add_subdirectory(stages) add_subdirectory(utils) add_subdirectory(io) -add_subdirectory(utils/mph_index) -add_subdirectory(utils/coverage_model) +add_subdirectory(utils/kmer_mph) +add_subdirectory(modules/coverage_model) add_library(common_modules STATIC empty.cpp) -target_link_libraries(common_modules assembly_graph input pipeline coverage_model path_extend stages utils mph_index modules) +target_link_libraries(common_modules assembly_graph input pipeline coverage_model paired_info path_extend stages utils mph_index modules) diff -Nru spades-3.10.1+dfsg/src/common/io/dataset_support/dataset_readers.hpp spades-3.11.1+dfsg/src/common/io/dataset_support/dataset_readers.hpp --- spades-3.10.1+dfsg/src/common/io/dataset_support/dataset_readers.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/dataset_support/dataset_readers.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -8,7 +8,7 @@ #pragma once #include "utils/logger/logger.hpp" -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" #include "io/reads/io_helper.hpp" #include "pipeline/library.hpp" #include "pipeline/config_struct.hpp" diff -Nru 
spades-3.10.1+dfsg/src/common/io/dataset_support/read_converter.hpp spades-3.11.1+dfsg/src/common/io/dataset_support/read_converter.hpp --- spades-3.10.1+dfsg/src/common/io/dataset_support/read_converter.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/dataset_support/read_converter.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -17,7 +17,7 @@ #include "io/reads/binary_converter.hpp" #include "io/reads/io_helper.hpp" #include "dataset_readers.hpp" -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" #include @@ -33,7 +33,7 @@ const static size_t current_binary_format_version = 11; static bool CheckBinaryReadsExist(SequencingLibraryT& lib) { - return path::FileExists(lib.data().binary_reads_info.bin_reads_info_file); + return fs::FileExists(lib.data().binary_reads_info.bin_reads_info_file); } static bool LoadLibIfExists(SequencingLibraryT& lib) { diff -Nru spades-3.10.1+dfsg/src/common/io/kmers/kmer_iterator.hpp spades-3.11.1+dfsg/src/common/io/kmers/kmer_iterator.hpp --- spades-3.10.1+dfsg/src/common/io/kmers/kmer_iterator.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/kmers/kmer_iterator.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -35,7 +35,7 @@ size_t chunk = round_up(file_size / amount, getpagesize() * Seq::GetDataSize(K) * sizeof(typename Seq::DataType)); size_t offset = 0; - if (chunk > file_size) + if (chunk == 0 || chunk > file_size) chunk = file_size; while (offset < file_size) { diff -Nru spades-3.10.1+dfsg/src/common/io/kmers/mmapped_reader.hpp spades-3.11.1+dfsg/src/common/io/kmers/mmapped_reader.hpp --- spades-3.10.1+dfsg/src/common/io/kmers/mmapped_reader.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/kmers/mmapped_reader.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -14,6 +14,7 @@ #include "utils/verify.hpp" #include +#include "common/adt/pointer_iterator.hpp" #include #include @@ -200,8 +201,8 @@ template class MMappedRecordReader : public MMappedReader { public: - typedef pointer_iterator iterator; - typedef const pointer_iterator const_iterator; + typedef adt::pointer_iterator iterator; + typedef const adt::pointer_iterator const_iterator; MMappedRecordReader(const std::string &FileName, bool unlink = true, size_t blocksize = 64 * 1024 * 1024 / (sizeof(T) * (unsigned) getpagesize()) * @@ -289,8 +290,8 @@ size_t elcnt_; public: - typedef typename array_vector::iterator iterator; - typedef typename array_vector::const_iterator const_iterator; + typedef typename adt::array_vector::iterator iterator; + typedef typename adt::array_vector::const_iterator const_iterator; MMappedRecordArrayReader(const std::string &FileName, size_t elcnt = 1, diff -Nru spades-3.10.1+dfsg/src/common/io/kmers/mmapped_writer.hpp spades-3.11.1+dfsg/src/common/io/kmers/mmapped_writer.hpp --- spades-3.10.1+dfsg/src/common/io/kmers/mmapped_writer.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/kmers/mmapped_writer.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -8,8 +8,9 @@ #ifndef HAMMER_MMAPPED_WRITER_HPP #define HAMMER_MMAPPED_WRITER_HPP -#include "common/adt/pointer_iterator.hpp" -#include "common/adt/array_vector.hpp" +#include "adt/pointer_iterator.hpp" +#include "adt/array_vector.hpp" +#include "common/utils/verify.hpp" #include @@ -19,6 +20,7 @@ #include #include #include +#include class MMappedWriter { int StreamFile; @@ -98,8 +100,8 @@ template class MMappedRecordWriter : public MMappedWriter { public: - typedef pointer_iterator iterator; - typedef const pointer_iterator 
const_iterator; + typedef adt::pointer_iterator iterator; + typedef const adt::pointer_iterator const_iterator; MMappedRecordWriter() = default; @@ -142,8 +144,8 @@ class MMappedRecordArrayWriter : public MMappedWriter { size_t elcnt_; public: - typedef typename array_vector::iterator iterator; - typedef typename array_vector::const_iterator const_iterator; + typedef typename adt::array_vector::iterator iterator; + typedef typename adt::array_vector::const_iterator const_iterator; MMappedRecordArrayWriter() = default; diff -Nru spades-3.10.1+dfsg/src/common/io/reads/binary_converter.hpp spades-3.11.1+dfsg/src/common/io/reads/binary_converter.hpp --- spades-3.10.1+dfsg/src/common/io/reads/binary_converter.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/reads/binary_converter.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -206,7 +206,7 @@ std::string fname; for (size_t i = 0; i < file_num_; ++i) { - fname = file_name_prefix_ + "_" + ToString(i) + ".seq"; + fname = file_name_prefix_ + "_" + std::to_string(i) + ".seq"; file_ds_.push_back(new std::ofstream(fname, std::ios_base::binary)); } } diff -Nru spades-3.10.1+dfsg/src/common/io/reads/binary_streams.hpp spades-3.11.1+dfsg/src/common/io/reads/binary_streams.hpp --- spades-3.10.1+dfsg/src/common/io/reads/binary_streams.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/reads/binary_streams.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -29,7 +29,7 @@ BinaryFileSingleStream(const std::string& file_name_prefix, size_t file_num) { std::string fname; - fname = file_name_prefix + "_" + ToString(file_num) + ".seq"; + fname = file_name_prefix + "_" + std::to_string(file_num) + ".seq"; stream_.open(fname.c_str(), std::ios_base::binary | std::ios_base::in); reset(); @@ -90,7 +90,7 @@ BinaryFilePairedStream(const std::string& file_name_prefix, size_t file_num, size_t insert_szie): stream_(), insert_size_ (insert_szie) { std::string fname; - fname = file_name_prefix + "_" + ToString(file_num) + ".seq"; + fname = file_name_prefix + "_" + std::to_string(file_num) + ".seq"; stream_.open(fname.c_str(), std::ios_base::binary | std::ios_base::in); reset(); diff -Nru spades-3.10.1+dfsg/src/common/io/reads/fasta_fastq_gz_parser.hpp spades-3.11.1+dfsg/src/common/io/reads/fasta_fastq_gz_parser.hpp --- spades-3.10.1+dfsg/src/common/io/reads/fasta_fastq_gz_parser.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/reads/fasta_fastq_gz_parser.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -39,7 +39,12 @@ namespace fastafastqgz { // STEP 1: declare the type of file handler and the read() function +// Silence bogus gcc warnings +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +// STEP 1: declare the type of file handler and the read() function KSEQ_INIT(gzFile, gzread) +#pragma GCC diagnostic pop } class FastaFastqGzParser: public Parser { diff -Nru spades-3.10.1+dfsg/src/common/io/reads/file_reader.hpp spades-3.11.1+dfsg/src/common/io/reads/file_reader.hpp --- spades-3.10.1+dfsg/src/common/io/reads/file_reader.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/reads/file_reader.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -18,7 +18,7 @@ #include "ireader.hpp" #include "single_read.hpp" #include "parser.hpp" -#include "utils/path_helper.hpp" +#include "utils/filesystem/path_helper.hpp" namespace io { @@ -35,7 +35,7 @@ explicit FileReadStream(const std::string &filename, OffsetType offset_type = PhredOffset) : filename_(filename), 
offset_type_(offset_type), parser_(NULL) { - path::CheckFileExistenceFATAL(filename_); + fs::CheckFileExistenceFATAL(filename_); parser_ = SelectParser(filename_, offset_type_); } diff -Nru spades-3.10.1+dfsg/src/common/io/reads/ireadstream.hpp spades-3.11.1+dfsg/src/common/io/reads/ireadstream.hpp --- spades-3.10.1+dfsg/src/common/io/reads/ireadstream.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/reads/ireadstream.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -21,8 +21,12 @@ #include "read.hpp" #include "sequence/nucl.hpp" +// Silence bogus gcc warnings +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" // STEP 1: declare the type of file handler and the read() function KSEQ_INIT(gzFile, gzread) +#pragma GCC diagnostic pop /* * Read name, seq and qual strings from FASTQ data (one by one) diff -Nru spades-3.10.1+dfsg/src/common/io/reads/osequencestream.hpp spades-3.11.1+dfsg/src/common/io/reads/osequencestream.hpp --- spades-3.10.1+dfsg/src/common/io/reads/osequencestream.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/reads/osequencestream.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -22,38 +22,45 @@ namespace io { +inline std::string MakeContigId(size_t number, const std::string& prefix = "NODE") { + return prefix.empty() ? std::to_string(number) : (prefix + "_" + std::to_string(number)); +} + inline std::string MakeContigId(size_t number, size_t length, const std::string& prefix = "NODE") { - return prefix + "_" + ToString(number) + "_length_" + ToString(length); + return MakeContigId(number, prefix) + "_length_" + std::to_string(length); } inline std::string MakeContigId(size_t number, size_t length, double coverage, const std::string& prefix = "NODE") { - return MakeContigId(number, length, prefix) + "_cov_" + ToString(coverage); + return MakeContigId(number, length, prefix) + "_cov_" + std::to_string(coverage); } inline std::string MakeContigId(size_t number, size_t length, double coverage, size_t id, const std::string& prefix = "NODE") { - return MakeContigId(number, length, coverage, prefix) + "_ID_" + ToString(id); + return MakeContigId(number, length, coverage, prefix) + "_ID_" + std::to_string(id); } inline std::string MakeRNAContigId(size_t number, size_t length, double coverage, size_t gene_id, size_t isoform_id, const std::string& prefix = "NODE") { - return MakeContigId(number, length, coverage, prefix) + "_g" + ToString(gene_id) + "_i" + ToString(isoform_id); + return MakeContigId(number, length, coverage, prefix) + "_g" + std::to_string(gene_id) + "_i" + std::to_string(isoform_id); } -inline std::string MakeContigComponentId(size_t number, size_t length, double coverage, size_t component_id, const std::string& prefix = "NODE") { - return MakeContigId(number, length, coverage, prefix) + "_component_" + ToString(component_id); +inline std::string AddComponentId(const string& s, size_t component_id) { + return s + "_component_" + std::to_string(component_id); +} + +inline void WriteWrapped(const std::string &s, ostream &os, size_t max_width = 60) { + size_t cur = 0; + while (cur < s.size()) { + os << s.substr(cur, max_width) << "\n"; + cur += max_width; + } } class osequencestream { protected: std::ofstream ofstream_; - size_t id_; void write_str(const std::string& s) { - size_t cur = 0; - while (cur < s.size()) { - ofstream_ << s.substr(cur, 60) << std::endl; - cur += 60; - } + WriteWrapped(s, ofstream_); } virtual void write_header(const std::string& s) { @@ -62,81 +69,25 @@ } public: - 
osequencestream(const std::string& filename): id_(1) { - ofstream_.open(filename.c_str()); + osequencestream(const std::string& filename): + ofstream_(filename), id_(1) { } + virtual ~osequencestream() {} - virtual ~osequencestream() { - ofstream_.close(); - } - - virtual osequencestream& operator<<(const std::string& s) { + osequencestream& operator<<(const std::string& s) { write_header(s); write_str(s); return *this; } - virtual osequencestream& operator<<(const Sequence& seq) { + osequencestream& operator<<(const Sequence& seq) { std::string s = seq.str(); return operator <<(s); } - /** - * Has different way of making headers - * Doesn't increase counters, don't mix with other methods! - */ - virtual osequencestream& operator<<(const SingleRead& read) { - ofstream_ << ">" << read.name() << std::endl; - size_t cur = 0; - std::string s = read.GetSequenceString(); - while (cur < s.size()) { - ofstream_ << s.substr(cur, 60) << std::endl; - cur += 60; - } - return *this; - } }; - - - - - -class PairedOutputSequenceStream { -protected: - std::ofstream ofstreaml_; - std::ofstream ofstreamr_; - - static void write(const SingleRead& read, std::ofstream& stream) { - stream << ">" << read.name() << std::endl; - size_t cur = 0; - std::string s = read.GetSequenceString(); - while (cur < s.size()) { - stream << s.substr(cur, 60) << std::endl; - cur += 60; - } - } - -public: - PairedOutputSequenceStream(const std::string& filename1, const std::string &filename2) { - ofstreaml_.open(filename1); - ofstreamr_.open(filename2); - } - - virtual ~PairedOutputSequenceStream() { - ofstreaml_.close(); - ofstreamr_.close(); - } - - PairedOutputSequenceStream& operator<<(const PairedRead& read) { - write(read.first(), ofstreaml_); - write(read.second(), ofstreamr_); - return *this; - } -}; - - class osequencestream_cov: public osequencestream { protected: double coverage_; @@ -151,231 +102,46 @@ osequencestream_cov(const std::string& filename) : osequencestream(filename), coverage_(0.) 
{ } - virtual ~osequencestream_cov() { - ofstream_.close(); - } - osequencestream_cov& operator<<(double coverage) { coverage_ = coverage; return *this; } - osequencestream_cov& operator<<(const std::string& s) { - write_header(s); - write_str(s); - return *this; - } - - osequencestream_cov& operator<<(const Sequence& seq) { - std::string s = seq.str(); - return operator <<(s); - } - -}; - - -class osequencestream_simple: public osequencestream { -protected: - std::string header_; - - double cov_; - - virtual void write_header(const std::string& /*s*/) { - ofstream_ << ">" << header_ << std::endl; - } - -public: - osequencestream_simple(const std::string& filename) - : osequencestream(filename), header_("") { } - - virtual ~osequencestream_simple() { - ofstream_.close(); - } - - void set_header(const std::string &header) { - header_ = header; - } - - osequencestream_simple& operator<<(const std::string& s) { - write_header(s); - write_str(s); - return *this; - } - - osequencestream_simple& operator<<(const Sequence& seq) { - std::string s = seq.str(); - return operator <<(s); - } + using osequencestream::operator<<; }; -class osequencestream_with_id: public osequencestream { -protected: - size_t uid_; - - double cov_; - - virtual void write_header(const std::string& s) { - ofstream_ << ">" << GetId(s) << std::endl; - id_++; - } - -public: - osequencestream_with_id(const std::string& filename) - : osequencestream(filename), uid_(0), cov_(0.0) { } - - virtual ~osequencestream_with_id() { - ofstream_.close(); - } - - std::string GetId(const std::string& s) const { - return MakeContigId(id_, s.size(), cov_, uid_); - } - - void setCoverage(double c) { - cov_ = c; - } - - void setID(size_t uid) { - uid_ = uid; - } - - osequencestream_with_id& operator<<(const std::string& s) { - write_header(s); - write_str(s); - return *this; - } - - osequencestream_with_id& operator<<(double coverage) { - cov_ = coverage; - return *this; - } - - osequencestream_with_id& operator<<(const Sequence& seq) { - std::string s = seq.str(); - return operator <<(s); - } - -}; - -class osequencestream_with_manual_node_id: public osequencestream_with_id { - bool is_id_set_; - virtual void write_header(const std::string& s) { - //for manual NODE ID setting osequencestream need to chech that node ID is really manually set - if (!is_id_set_) { - WARN ("NODE ID is not set manually, setting to 0"); - id_ = 0; - } - ofstream_ << ">" << MakeContigId(id_, s.size(), cov_, uid_) << std::endl; - is_id_set_ = false; - } - -public: -//unfortunately constructor inheritance is supported only since g++4.8 - osequencestream_with_manual_node_id(const std::string& filename): osequencestream_with_id(filename) { - is_id_set_ = false; - } - - void setNodeID(int id) { - id_ = id; - is_id_set_ = true; - } - - osequencestream_with_manual_node_id& operator<<(const std::string& s) { - write_header(s); - write_str(s); - return *this; - } - - osequencestream_with_manual_node_id& operator<<(const Sequence& seq) { - std::string s = seq.str(); - return operator <<(s); - } - - -}; - - -class osequencestream_with_data_for_scaffold: public osequencestream_with_id { -protected: - std::ofstream scstream_; - - virtual void write_header(const std::string& s) { - scstream_ << id_ << "\tNODE_" << id_ << "\t" << s.size() << "\t" << (int) round(cov_) << std::endl; - ofstream_ << ">" << MakeContigId(id_++, s.size(), cov_, uid_) << std::endl; - } - +class OutputSequenceStream { + std::ofstream ofstream_; public: - osequencestream_with_data_for_scaffold(const 
std::string& filename): osequencestream_with_id(filename) { - id_ = 1; - std::string sc_filename = filename + ".info"; - scstream_.open(sc_filename.c_str()); - } - virtual ~osequencestream_with_data_for_scaffold() { - ofstream_.close(); - scstream_.close(); + OutputSequenceStream(const std::string& filename): + ofstream_(filename) { } - osequencestream_with_data_for_scaffold& operator<<(const std::string& s) { - write_header(s); - write_str(s); + OutputSequenceStream& operator<<(const SingleRead& read) { + ofstream_ << ">" << read.name() << "\n"; + WriteWrapped(read.GetSequenceString(), ofstream_); return *this; } - - osequencestream_with_data_for_scaffold& operator<<(const Sequence& seq) { - std::string s = seq.str(); - return operator <<(s); - } }; -class osequencestream_for_fastg: public osequencestream_with_id { -protected: - std::string header_; - - virtual void write_header(const std::string& s) { - ofstream_ << ">" << s; - } +class PairedOutputSequenceStream { + OutputSequenceStream os_l_; + OutputSequenceStream os_r_; public: - osequencestream_for_fastg(const std::string& filename): - osequencestream_with_id(filename) { - id_ = 1; - } - - virtual ~osequencestream_for_fastg() { - ofstream_.close(); - } - - void set_header(const std::string& h) { - header_= h; - } - - osequencestream_for_fastg& operator<<(const std::set& s) { - write_header(header_); - if (s.size() > 0) { - auto iter = s.begin(); - ofstream_ << ":" << *iter; - ++iter; - while (iter != s.end()) { - ofstream_ << "," << *iter; - ++iter; - } - } - ofstream_ << ";" << std::endl; - return *this; + PairedOutputSequenceStream(const std::string& filename1, + const std::string &filename2) : + os_l_(filename1), + os_r_(filename2) { } - osequencestream_for_fastg& operator<<(const std::string& s) { - write_str(s); + PairedOutputSequenceStream& operator<<(const PairedRead& read) { + os_l_ << read.first(); + os_r_ << read.second(); return *this; } - - osequencestream_for_fastg& operator<<(const Sequence& seq) { - std::string s = seq.str(); - return operator <<(s); - } - }; } diff -Nru spades-3.10.1+dfsg/src/common/io/reads/paired_read.hpp spades-3.11.1+dfsg/src/common/io/reads/paired_read.hpp --- spades-3.10.1+dfsg/src/common/io/reads/paired_read.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/reads/paired_read.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -86,11 +86,6 @@ return !file.fail(); } - void print_size() const { - first_.print_size(); - second_.print_size(); - } - private: SingleRead first_; SingleRead second_; diff -Nru spades-3.10.1+dfsg/src/common/io/reads/read.hpp spades-3.11.1+dfsg/src/common/io/reads/read.hpp --- spades-3.10.1+dfsg/src/common/io/reads/read.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/reads/read.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -23,7 +23,7 @@ #include "sequence/sequence.hpp" #include "sequence/nucl.hpp" #include "sequence/sequence_tools.hpp" -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" //fixme deprecated!!! used in hammer! 
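The osequencestream rewrite above routes all FASTA output through the new WriteWrapped helper and builds record headers from chained MakeContigId overloads. A self-contained sketch of the two helpers, copied from this hunk, with a usage example:

#include <iostream>
#include <string>

inline std::string MakeContigId(size_t number, const std::string& prefix = "NODE") {
    return prefix.empty() ? std::to_string(number) : (prefix + "_" + std::to_string(number));
}

inline std::string MakeContigId(size_t number, size_t length, const std::string& prefix = "NODE") {
    return MakeContigId(number, prefix) + "_length_" + std::to_string(length);
}

// Wrap a sequence at max_width characters per line, as WriteWrapped does.
inline void WriteWrapped(const std::string& s, std::ostream& os, size_t max_width = 60) {
    for (size_t cur = 0; cur < s.size(); cur += max_width)
        os << s.substr(cur, max_width) << "\n";
}

int main() {
    std::cout << ">" << MakeContigId(1, 130) << "\n";  // >NODE_1_length_130
    WriteWrapped(std::string(130, 'A'), std::cout);    // two 60-char lines plus one 10-char line
}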
class Read { diff -Nru spades-3.10.1+dfsg/src/common/io/reads/read_processor.hpp spades-3.11.1+dfsg/src/common/io/reads/read_processor.hpp --- spades-3.10.1+dfsg/src/common/io/reads/read_processor.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/reads/read_processor.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -10,7 +10,7 @@ #include "io/reads/mpmc_bounded.hpp" -#include "utils/openmp_wrapper.h" +#include "utils/parallel/openmp_wrapper.h" #pragma GCC diagnostic push #ifdef __clang__ diff -Nru spades-3.10.1+dfsg/src/common/io/reads/sequence_reader.hpp spades-3.11.1+dfsg/src/common/io/reads/sequence_reader.hpp --- spades-3.10.1+dfsg/src/common/io/reads/sequence_reader.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/reads/sequence_reader.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -8,7 +8,7 @@ #pragma once #include "io/reads/ireader.hpp" -#include "common/basic/reads/single_read.hpp" +#include "basic/reads/single_read.hpp" namespace io { diff -Nru spades-3.10.1+dfsg/src/common/io/reads/single_read.hpp spades-3.11.1+dfsg/src/common/io/reads/single_read.hpp --- spades-3.10.1+dfsg/src/common/io/reads/single_read.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/reads/single_read.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -12,7 +12,7 @@ #include "sequence/sequence.hpp" #include "sequence/nucl.hpp" #include "sequence/sequence_tools.hpp" -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" #include @@ -39,14 +39,11 @@ public: static std::string EmptyQuality(const std::string &seq) { - return std::string(seq.size(), (char) 33); + return std::string(seq.size(), (char) PhredOffset); } - static const int BAD_QUALITY_THRESHOLD = 2; - SingleRead() : name_(""), seq_(""), qual_(""), left_offset_(0), right_offset_(0), valid_(false) { - DEBUG(name_ << " created"); } SingleRead(const std::string &name, const std::string &seq, @@ -54,7 +51,6 @@ SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) : name_(name), seq_(seq), qual_(qual), left_offset_(left_offset), right_offset_(right_offset) { Init(); - DEBUG(name_ << " created"); for (size_t i = 0; i < qual_.size(); ++i) { qual_[i] = (char) (qual_[i] - offset); } @@ -64,7 +60,6 @@ const std::string &qual, SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) : name_(name), seq_(seq), qual_(qual), left_offset_(left_offset), right_offset_(right_offset) { - DEBUG(name_ << " created"); Init(); } @@ -72,7 +67,6 @@ SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) : name_(name), seq_(seq), qual_(EmptyQuality(seq_)), left_offset_(left_offset), right_offset_(right_offset) { - DEBUG(name_ << " created"); Init(); } @@ -196,12 +190,6 @@ return !file.fail(); } - - void print_size() const { - std::cerr << size() << std::endl; - } - - private: /* * @variable The name of SingleRead in input file. @@ -237,10 +225,10 @@ // TODO remove naming? 
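single_read.hpp now spells the dummy-quality fill character as PhredOffset instead of the magic number 33. A small sketch, assuming PhredOffset carries the standard Sanger/Illumina 1.8+ value of 33 (so quality 0 encodes as '!'):

#include <cassert>
#include <string>

static const int PhredOffset = 33;  // assumed value for this sketch; the usual Sanger offset

// As in the patched SingleRead::EmptyQuality().
static std::string EmptyQuality(const std::string& seq) {
    return std::string(seq.size(), (char) PhredOffset);
}

int main() {
    assert(EmptyQuality("ACGT") == "!!!!");  // four quality-0 characters
}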
std::string new_name; if (name_.length() >= 3 && name_.substr(name_.length() - 3) == "_RC") { - new_name = name_.substr(0, name_.length() - 3) + "_SUBSTR(" + ToString(size() - to) + "," + - ToString(size() - from) + ")" + "_RC"; + new_name = name_.substr(0, name_.length() - 3) + "_SUBSTR(" + std::to_string(size() - to) + "," + + std::to_string(size() - from) + ")" + "_RC"; } else { - new_name = name_ + "_SUBSTR(" + ToString(from) + "," + ToString(to) + ")"; + new_name = name_ + "_SUBSTR(" + std::to_string(from) + "," + std::to_string(to) + ")"; } return SingleRead(new_name, seq_.substr(from, len), qual_.substr(from, len), SequenceOffsetT(from + (size_t) left_offset_), diff -Nru spades-3.10.1+dfsg/src/common/io/reads/wrapper_collection.hpp spades-3.11.1+dfsg/src/common/io/reads/wrapper_collection.hpp --- spades-3.10.1+dfsg/src/common/io/reads/wrapper_collection.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/io/reads/wrapper_collection.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -24,7 +24,7 @@ /* virtual */ IdSettingReaderWrapper& operator>>(SingleRead& read) { this->reader() >> read; - read.ChangeName(ToString(next_id_++)); + read.ChangeName(std::to_string(next_id_++)); return *this; } }; diff -Nru spades-3.10.1+dfsg/src/common/math/log.hpp spades-3.11.1+dfsg/src/common/math/log.hpp --- spades-3.10.1+dfsg/src/common/math/log.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/math/log.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -0,0 +1,33 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +/* + * Compile time log(n,base) function for use in templates + * + * Created on: 02.03.2011 + * Author: vyahhi + */ + +#ifndef LOG_HPP_ +#define LOG_HPP_ + +template +struct log_ { + const static size_t value = 1 + log_::value; +}; + +template +struct log_<1, base> { + const static size_t value = 0; +}; + +template +struct log_<0, base> { + const static size_t value = 0; +}; + +#endif /* LOG_HPP_ */ diff -Nru spades-3.10.1+dfsg/src/common/modules/alignment/bwa_index.cpp spades-3.11.1+dfsg/src/common/modules/alignment/bwa_index.cpp --- spades-3.10.1+dfsg/src/common/modules/alignment/bwa_index.cpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/alignment/bwa_index.cpp 2017-09-28 09:05:09.000000000 +0000 @@ -262,7 +262,7 @@ std::string seq = sequence.str(); mem_alnreg_v ar = mem_align1(memopt_.get(), idx_->bwt, idx_->bns, idx_->pac, - seq.length(), seq.data()); + int(seq.length()), seq.data()); for (size_t i = 0; i < ar.n; ++i) { const mem_alnreg_t &a = ar.a[i]; if (a.secondary >= 0) continue; // skip secondary alignments @@ -281,10 +281,10 @@ size_t read_length = seq.length() ; //we had to reduce the range to kmer-based if (pos + (a.re - a.rb) >= g_.length(ids_[a.rid]) ){ - if (a.qe > g_.k() + a.qb) + if (a.qe > int(g_.k()) + a.qb) initial_range_end -= g_.k(); else continue; - if (a.re > g_.k() + a.rb) + if (a.re > int(g_.k()) + a.rb) mapping_range_end -= g_.k(); else continue; if (read_length >= g_.k()) @@ -302,8 +302,8 @@ // fprintf (stderr,"%d %d %d\n", pos, pos + a.re - a.rb , g_.length(ids_[a.rid]) ); res.push_back(g_.conjugate(ids_[a.rid]), - { omnigraph::Range(a.qb, initial_range_end).Invert(read_length), - omnigraph::Range(pos, 
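The new math/log.hpp above provides a compile-time logarithm; its angle-bracketed template parameters did not survive this copy. A restored sketch, assuming the shape log_<n, base> with the recursive case delegating to log_<n / base, base> (inferred from the file comment "log(n,base)" and the two base cases), plus compile-time checks:

#include <cstddef>

// Compile-time floor(log_base(n)); the specializations for 1 and 0 terminate the recursion.
template<size_t n, size_t base = 2>
struct log_ {
    static const size_t value = 1 + log_<n / base, base>::value;
};

template<size_t base>
struct log_<1, base> { static const size_t value = 0; };

template<size_t base>
struct log_<0, base> { static const size_t value = 0; };

static_assert(log_<256>::value == 8, "2^8 == 256");
static_assert(log_<1000, 10>::value == 3, "floor(log10(1000)) == 3");
static_assert(log_<7>::value == 2, "floor(log2(7)) == 2");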
mapping_range_end ).Invert(g_.length(ids_[a.rid])) }); + { Range(a.qb, initial_range_end).Invert(read_length), + Range(pos, mapping_range_end ).Invert(g_.length(ids_[a.rid])) }); } diff -Nru spades-3.10.1+dfsg/src/common/modules/alignment/edge_index.hpp spades-3.11.1+dfsg/src/common/modules/alignment/edge_index.hpp --- spades-3.10.1+dfsg/src/common/modules/alignment/edge_index.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/alignment/edge_index.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -7,9 +7,9 @@ #pragma once -#include "common/assembly_graph/core/graph.hpp" -#include "common/assembly_graph/core/action_handlers.hpp" -#include "utils/indices/edge_info_updater.hpp" +#include "assembly_graph/core/graph.hpp" +#include "assembly_graph/core/action_handlers.hpp" +#include "assembly_graph/index/edge_info_updater.hpp" #include "edge_index_refiller.hpp" namespace debruijn_graph { @@ -24,7 +24,7 @@ public: typedef typename Graph::EdgeId EdgeId; - using InnerIndex = KmerFreeEdgeIndex; + using InnerIndex = KmerFreeEdgeIndex; typedef Graph GraphT; typedef typename InnerIndex::KMer KMer; typedef typename InnerIndex::KMerIdx KMerIdx; diff -Nru spades-3.10.1+dfsg/src/common/modules/alignment/edge_index_refiller.cpp spades-3.11.1+dfsg/src/common/modules/alignment/edge_index_refiller.cpp --- spades-3.10.1+dfsg/src/common/modules/alignment/edge_index_refiller.cpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/alignment/edge_index_refiller.cpp 2017-09-28 09:05:09.000000000 +0000 @@ -4,8 +4,8 @@ //* See file LICENSE for details. //*************************************************************************** -#include "utils/indices/edge_index_builders.hpp" -#include "utils/indices/edge_multi_index.hpp" +#include "index/edge_index_builders.hpp" +#include "index/edge_multi_index.hpp" #include "core/graph.hpp" #include "edge_index_refiller.hpp" diff -Nru spades-3.10.1+dfsg/src/common/modules/alignment/kmer_mapper.hpp spades-3.11.1+dfsg/src/common/modules/alignment/kmer_mapper.hpp --- spades-3.10.1+dfsg/src/common/modules/alignment/kmer_mapper.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/alignment/kmer_mapper.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -8,7 +8,7 @@ #pragma once #include "sequence/sequence_tools.hpp" -#include "common/adt/kmer_vector.hpp" +#include "adt/kmer_vector.hpp" #include "edge_index.hpp" #include "kmer_map.hpp" @@ -67,7 +67,7 @@ if (normalized_) return; - KMerVector all(k_, size()); + adt::KMerVector all(k_, size()); for (auto it = begin(); it != end(); ++it) all.push_back(it->first); diff -Nru spades-3.10.1+dfsg/src/common/modules/alignment/kmer_mapper_logger.hpp spades-3.11.1+dfsg/src/common/modules/alignment/kmer_mapper_logger.hpp --- spades-3.10.1+dfsg/src/common/modules/alignment/kmer_mapper_logger.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/alignment/kmer_mapper_logger.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -16,7 +16,7 @@ #define KMER_MAPPER_LOGGER_H_ #include "sequence/sequence.hpp" -#include "common/assembly_graph/core/action_handlers.hpp" +#include "assembly_graph/core/action_handlers.hpp" #include "utils/standard_base.hpp" namespace debruijn { diff -Nru spades-3.10.1+dfsg/src/common/modules/alignment/pacbio/pacbio_read_structures.hpp spades-3.11.1+dfsg/src/common/modules/alignment/pacbio/pacbio_read_structures.hpp --- spades-3.10.1+dfsg/src/common/modules/alignment/pacbio/pacbio_read_structures.hpp 2017-02-28 14:55:36.000000000 +0000 +++ 
spades-3.11.1+dfsg/src/common/modules/alignment/pacbio/pacbio_read_structures.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -7,14 +7,15 @@ #pragma once -#include "utils/indices/perfect_hash_map.hpp" -#include "common/modules/alignment/sequence_mapper.hpp" -#include "common/assembly_graph/core/graph.hpp" +#include "utils/ph_map/perfect_hash_map.hpp" +#include "modules/alignment/sequence_mapper.hpp" +#include "assembly_graph/core/graph.hpp" #include #include #include namespace pacbio { +static const int STRING_DIST_INF = 1e8; typedef omnigraph::GapDescription GapDescription; template @@ -149,55 +150,6 @@ ; }; -//template -//struct GapDescription { -// typedef typename Graph::EdgeId EdgeId; -// EdgeId start, end; -// Sequence gap_seq; -// int edge_gap_start_position, edge_gap_end_position; -// -// -// GapDescription(EdgeId start_e, EdgeId end_e, const Sequence &gap, int gap_start, int gap_end) : -// start(start_e), end(end_e), gap_seq(gap.str()), edge_gap_start_position(gap_start), edge_gap_end_position(gap_end) { -// } -// -// GapDescription(const KmerCluster &a, const KmerCluster & b, Sequence read, int pacbio_k) { -// edge_gap_start_position = a.sorted_positions[a.last_trustable_index].edge_position; -// edge_gap_end_position = b.sorted_positions[b.first_trustable_index].edge_position + pacbio_k - 1; -// start = a.edgeId; -// end = b.edgeId; -// DEBUG(read.str()); -// gap_seq = read.Subseq(a.sorted_positions[a.last_trustable_index].read_position, -// b.sorted_positions[b.first_trustable_index].read_position + pacbio_k - 1); -// DEBUG(gap_seq.str()); -// DEBUG("gap added"); -// } -// -// GapDescription conjugate(Graph &g, int shift) const { -// GapDescription res( -// g.conjugate(end), g.conjugate(start), (!gap_seq), -// (int) g.length(end) + shift - edge_gap_end_position, -// (int) g.length(start) + shift - edge_gap_start_position); -// DEBUG("conjugate created" << res.str(g)); -// return res; -// } -// -// string str(Graph &g) const { -// stringstream s; -// s << g.int_id(start) << " " << edge_gap_start_position < path_len_in_edges; vector subreads_length; @@ -268,9 +220,9 @@ int high = min(min(b_len, i + d + 1), i + a_len - b_len + d + 1); TRACE(low << " " <edgeId); + auto sorted_by_edge = i_iter->sorted_positions; + + DEBUG("filtering with cluster edge, stage 2 "<< edge_id << " len " << sorted_by_edge.size() << " clusters still alive: "<< clusters.size()); + for (auto j_iter = clusters.begin(); j_iter != clusters.end();) { + if (i_iter != j_iter) { + if (dominates(*i_iter, *j_iter)) { + TRACE("cluster is dominated"); + auto tmp_iter = j_iter; + tmp_iter++; + TRACE("cluster on edge " << g_.int_id(j_iter->edgeId)); + TRACE("erased - dominated"); + clusters.erase(j_iter); + j_iter = tmp_iter; + } else { + j_iter++; + } + } else { + j_iter++; + } + } + DEBUG("cluster size "<< i_iter->sorted_positions.size() << "survived filtering"); + i_iter++; + } + } + //filter clusters that are too small or fully located on a vertex or dominated by some other cluster. void FilterClusters(ClustersSet &clusters) const { for (auto i_iter = clusters.begin(); i_iter != clusters.end();) { @@ -250,10 +279,10 @@ double good = 0; DEBUG("filtering cluster of size " << sorted_by_edge.size()); DEBUG(edge_id <<" : edgeId"); - for (auto iter = sorted_by_edge.begin(); - iter < sorted_by_edge.end(); iter++) { + for (auto iter = sorted_by_edge.begin(); iter < sorted_by_edge.end(); iter++) { if (iter->IsUnique()) good++; +//TODO:: back to quality for laaarge genomes (kmer size)? 
//good += 1.0 / (iter->quality * iter->quality); } DEBUG("good " << good); @@ -280,31 +309,7 @@ } } } - for (auto i_iter = clusters.begin(); i_iter != clusters.end();) { - size_t edge_id = g_.int_id(i_iter->edgeId); - auto sorted_by_edge = i_iter->sorted_positions; - - DEBUG("filtering with cluster edge, stage 2 "<< edge_id << " len " << sorted_by_edge.size() << " clusters still alive: "<< clusters.size()); - for (auto j_iter = clusters.begin(); j_iter != clusters.end();) { - if (i_iter != j_iter) { - if (dominates(*i_iter, *j_iter)) { - TRACE("cluster is dominated"); - auto tmp_iter = j_iter; - tmp_iter++; - TRACE("cluster on edge " << g_.int_id(j_iter->edgeId)); - TRACE("erased - dominated"); - clusters.erase(j_iter); - j_iter = tmp_iter; - } else { - j_iter++; - } - } else { - j_iter++; - } - } - DEBUG("cluster size "<< i_iter->sorted_positions.size() << "survived filtering"); - i_iter++; - } + FilterDominatedClusters(clusters); } // is "non strictly dominates" required? @@ -329,8 +334,7 @@ vector> res; EdgeId prev_edge = EdgeId(0); - for (auto iter = cur_cluster.begin(); iter != cur_cluster.end(); - ++iter) { + for (auto iter = cur_cluster.begin(); iter != cur_cluster.end();) { EdgeId cur_edge = iter->second->edgeId; if (prev_edge != EdgeId(0)) { //Need to find sequence of edges between clusters @@ -374,26 +378,6 @@ vector intermediate_path = BestScoredPath(s, start_v, end_v, limits.first, limits.second, seq_start, seq_end, s_add, e_add); if (intermediate_path.size() == 0) { DEBUG("Tangled region between edgees "<< g_.int_id(prev_edge) << " " << g_.int_id(cur_edge) << " is not closed, additions from edges: " << int(g_.length(prev_edge)) - int(prev_last_index.edge_position) <<" " << int(cur_first_index.edge_position) - int(debruijn_k - pacbio_k ) << " and seq "<< - seq_start + seq_end); - if (pb_config_.additional_debug_info) { - DEBUG(" escpected gap length: " << -int(g_.length(prev_edge)) + int(prev_last_index.edge_position) - int(cur_first_index.edge_position) + int(debruijn_k - pacbio_k ) - seq_start + seq_end); - omnigraph::PathStorageCallback callback(g_); - ProcessPaths(g_, 0, 4000, - start_v, end_v, - callback); - vector > paths = callback.paths(); - stringstream s_buf; - for (auto p_iter = paths.begin(); - p_iter != paths.end(); p_iter++) { - size_t tlen = 0; - for (auto path_iter = p_iter->begin(); - path_iter != p_iter->end(); - path_iter++) { - tlen += g_.length(*path_iter); - } - s_buf << tlen << " "; - } - DEBUG(s_buf.str()); - } res.push_back(cur_sorted); cur_sorted.clear(); prev_edge = EdgeId(0); @@ -406,6 +390,7 @@ } cur_sorted.push_back(cur_edge); prev_edge = cur_edge; + ++iter; } if (cur_sorted.size() > 0) res.push_back(cur_sorted); @@ -420,60 +405,60 @@ return res; } - vector GetWeightedColors(const ClustersSet &mapping_descr) const { - int len = (int) mapping_descr.size(); + vector> FillConnectionsTable (const ClustersSet &mapping_descr) const{ + size_t len = mapping_descr.size(); DEBUG("getting colors, table size "<< len); vector > cons_table(len); + for (size_t i = 0; i < len; i++) { + cons_table[i].resize(len); + cons_table[i][i] = 0; + } + size_t i = 0; + for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end(); + ++i_iter, ++i) { + size_t j = i; + for (auto j_iter = i_iter; + j_iter != mapping_descr.end(); ++j_iter, ++j) { + if (i_iter == j_iter) + continue; + cons_table[i][j] = IsConsistent(*i_iter, *j_iter); + } + } + return cons_table; + } + vector GetWeightedColors(const ClustersSet &mapping_descr) const { + size_t len = mapping_descr.size(); 
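GetWeightedColors() repeatedly extracts the heaviest chain of pairwise-consistent clusters (a longest-weighted-chain DP over the table built by FillConnectionsTable) and stamps it with the next color, masking already-colored clusters between rounds. A standalone sketch of one round, mirroring the max_size/prev bookkeeping:

#include <cstddef>
#include <vector>

// One DP round: best[i] is the weight of the heaviest chain of pairwise-consistent
// clusters ending at i; prev[] lets us walk the winning chain back to its head.
std::vector<size_t> HeaviestChain(const std::vector<int>& size,
                                  const std::vector<std::vector<bool>>& consistent) {
    size_t n = size.size();
    if (n == 0) return {};
    std::vector<int> best(n);
    std::vector<size_t> prev(n, size_t(-1));
    for (size_t i = 0; i < n; ++i) {
        best[i] = size[i];
        for (size_t j = 0; j < i; ++j)
            if (consistent[j][i] && best[j] + size[i] > best[i]) {
                best[i] = best[j] + size[i];
                prev[i] = j;
            }
    }
    size_t argmax = 0;
    for (size_t i = 1; i < n; ++i)
        if (best[i] > best[argmax]) argmax = i;
    std::vector<size_t> chain;
    for (size_t i = argmax; i != size_t(-1); i = prev[i])
        chain.insert(chain.begin(), i);   // prepend: the walk runs from tail to head
    return chain;
}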
vector colors(len); vector cluster_size(len); vector max_size(len); - vector prev(len); - - for (int i = 0; i < len; i++) { - cons_table[i].resize(len); - cons_table[i][i] = 0; + vector prev(len); + size_t i = 0; + for (i = 0; i < len; i++) { prev[i] = -1; } - int i = 0; - - for (int i = 0; i < len; i++) { -//-1 not initialized, -2 - removed as trash + for (i = 0; i < len; i++) { colors[i] = UNDEF_COLOR; } + i = 0; for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end(); ++i_iter, ++i) { cluster_size[i] = i_iter->size; } - i = 0; - if (len > 1) { - TRACE(len << "clusters"); - } - for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end(); - ++i_iter, ++i) { - int j = i; - for (auto j_iter = i_iter; - j_iter != mapping_descr.end(); ++j_iter, ++j) { - if (i_iter == j_iter) - continue; - cons_table[i][j] = IsConsistent(*i_iter, *j_iter); - } - } - i = 0; + auto cons_table = FillConnectionsTable(mapping_descr); int cur_color = 0; - while (true) { for (i = 0; i < len; i++) { max_size[i] = 0; - prev[i] = -1; + prev[i] = -1ul; } i = 0; for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end(); ++i_iter, ++i) { if (colors[i] != UNDEF_COLOR) continue; max_size[i] = cluster_size[i]; - for (int j = 0; j < i; j ++) { + for (size_t j = 0; j < i; j ++) { if (colors[j] != -1) continue; if (cons_table[j][i] && max_size[i] < cluster_size[i] + max_size[j]) { max_size[i] = max_size[j] + cluster_size[i]; @@ -483,10 +468,10 @@ } int maxx = 0; int maxi = -1; - for (int j = 0; j < len; j++) { + for (size_t j = 0; j < len; j++) { if (max_size[j] > maxx) { maxx = max_size[j]; - maxi = j; + maxi = int(j); } } if (maxi == -1) { @@ -496,9 +481,9 @@ colors[maxi] = cur_color; int real_maxi = maxi, min_i = maxi; - while (prev[maxi] != -1) { + while (prev[maxi] != -1ul) { min_i = maxi; - maxi = prev[maxi]; + maxi = int(prev[maxi]); colors[maxi] = cur_color; } while (real_maxi >= min_i) { @@ -511,7 +496,6 @@ return colors; } - GapDescription CreateGapDescription(const KmerCluster& a, const KmerCluster& b, const Sequence& read) const { @@ -524,30 +508,58 @@ return GapDescription(a.edgeId, b.edgeId, read.Subseq(seq_start, seq_end), - a.sorted_positions[a.last_trustable_index].edge_position + pacbio_k - debruijn_k, + g_.length(a.edgeId) - a.sorted_positions[a.last_trustable_index].edge_position - pacbio_k + debruijn_k, b.sorted_positions[b.first_trustable_index].edge_position); } + OneReadMapping AddGapDescriptions(const vector &start_clusters, + const vector &end_clusters, + const vector> &sortedEdges, const Sequence &s, + const vector &block_gap_closer, size_t used_seeds_count) const { + DEBUG("adding gaps between subreads"); + vector illumina_gaps; + for (size_t i = 0; i + 1 < sortedEdges.size() ; i++) { + if (block_gap_closer[i]) + continue; + size_t j = i + 1; + EdgeId before_gap = sortedEdges[i][sortedEdges[i].size() - 1]; + EdgeId after_gap = sortedEdges[j][0]; +//do not add "gap" for rc-jumping + if (before_gap != after_gap && before_gap != g_.conjugate(after_gap)) { + if (TopologyGap(before_gap, after_gap, true)) { + if (start_clusters[j]->CanFollow(*end_clusters[i])) { + auto gap = CreateGapDescription(*end_clusters[i], + *start_clusters[j], + s); + if (gap != GapDescription()) { + illumina_gaps.push_back(gap); + DEBUG("adding gap between alignments number " << i<< " and " << j); + } + } + + } + } + + } + return OneReadMapping(sortedEdges, illumina_gaps, vector(0), used_seeds_count); + } OneReadMapping GetReadAlignment(Sequence &s) const { ClustersSet mapping_descr = 
GetOrderClusters(s); - DEBUG("clusters got"); - int len = (int) mapping_descr.size(); - vector real_length; - vector colors = GetWeightedColors(mapping_descr); - vector > sortedEdges; + size_t len = mapping_descr.size(); + vector real_length; + vector> sortedEdges; vector block_gap_closer; vector start_clusters, end_clusters; - vector illumina_gaps; vector used(len); size_t used_seed_count = 0; auto iter = mapping_descr.begin(); - for (int i = 0; i < len; i++, iter ++) { - used[i] = 0; + for (size_t i = 0; i < len; i++, iter ++) { + used[i] = 0; DEBUG(colors[i] <<" " << iter->str(g_)); } - for (int i = 0; i < len; i++) { + for (size_t i = 0; i < len; i++) { if (!used[i]) { DEBUG("starting new subread"); size_t cur_seed_count = 0; @@ -613,33 +625,7 @@ } } } - DEBUG("adding gaps between subreads"); - - for (size_t i = 0; i + 1 < sortedEdges.size() ; i++) { - if (block_gap_closer[i]) - continue; - size_t j = i + 1; - EdgeId before_gap = sortedEdges[i][sortedEdges[i].size() - 1]; - EdgeId after_gap = sortedEdges[j][0]; -//do not add "gap" for rc-jumping - if (before_gap != after_gap - && before_gap != g_.conjugate(after_gap)) { - if (i != j && TopologyGap(before_gap, after_gap, true)) { - if (start_clusters[j]->CanFollow(*end_clusters[i])) { - auto gap = CreateGapDescription(*end_clusters[i], - *start_clusters[j], - s); - if (gap != GapDescription()) { - illumina_gaps.push_back(gap); - DEBUG("adding gap between alignments number " << i<< " and " << j); - } - } - - } - } - - } - return OneReadMapping(sortedEdges, illumina_gaps, real_length, used_seed_count); + return AddGapDescriptions(start_clusters,end_clusters, sortedEdges, s, block_gap_closer, used_seed_count); } std::pair GetPathLimits(const KmerCluster &a, @@ -688,7 +674,6 @@ not_found = (distance_it == distance_cashed.end()); } if (not_found) { -//TODO: constants omnigraph::DijkstraHelper::BoundedDijkstra dijkstra( omnigraph::DijkstraHelper::CreateBoundedDijkstra(g_, pb_config_.max_path_in_dijkstra, pb_config_.max_vertex_in_dijkstra)); dijkstra.Run(start_v); @@ -696,36 +681,34 @@ result = dijkstra.GetDistance(end_v); } #pragma omp critical(pac_index) - { - distance_it = distance_cashed.insert({vertex_pair, result}).first; - } + { + distance_it = distance_cashed.insert({vertex_pair, result}).first; + } } else { DEBUG("taking from cashed"); } - result = distance_it->second; DEBUG (result); if (result == size_t(-1)) { return 0; } //TODO: Serious optimization possible - + int near_to_cluster_end = 500; for (auto a_iter = a.sorted_positions.begin(); a_iter != a.sorted_positions.end(); ++a_iter) { - if (a_iter - a.sorted_positions.begin() > 500 && a.sorted_positions.end() - a_iter >500) continue; + if (a_iter - a.sorted_positions.begin() > near_to_cluster_end && a.sorted_positions.end() - a_iter > near_to_cluster_end) continue; int cnt = 0; for (auto b_iter = b.sorted_positions.begin(); - b_iter != b.sorted_positions.end() && cnt <500; ++b_iter, cnt ++) { - if (similar_in_graph(*a_iter, *b_iter, - (int) (result + addition))) { + b_iter != b.sorted_positions.end() && cnt < near_to_cluster_end; ++b_iter, cnt ++) { + if (similar_in_graph(*a_iter, *b_iter, (int) (result + addition))) { return 1; } } cnt = 0; - if (b.sorted_positions.size() > 500) { + if ( (int) b.sorted_positions.size() > near_to_cluster_end) { for (auto b_iter = b.sorted_positions.end() - 1; - b_iter != b.sorted_positions.begin() && cnt < 500; --b_iter, cnt ++) { + b_iter != b.sorted_positions.begin() && cnt < near_to_cluster_end; --b_iter, cnt ++) { if (similar_in_graph(*a_iter, 
*b_iter, (int) (result + addition))) { return 1; @@ -733,9 +716,7 @@ } } } - return 0; - } string PathToString(const vector& path) const { @@ -747,7 +728,7 @@ } return res; } - +//TODO this should be replaced by Dijkstra based graph-read alignment vector BestScoredPath(const Sequence &s, VertexId start_v, VertexId end_v, int path_min_length, int path_max_length, int start_pos, int end_pos, string &s_add, @@ -763,7 +744,7 @@ int s_len = int(s.size()); string seq_string = s.Subseq(start_pos, min(end_pos + 1, s_len)).str(); size_t best_path_ind = paths.size(); - size_t best_score = 1000000000; + int best_score = STRING_DIST_INF; DEBUG("need to find best scored path between "< 1 && paths.size() < 10) { DEBUG("score: "<< cur_score); } @@ -795,7 +776,7 @@ } } DEBUG(best_score); - if (best_score == 1000000000) + if (best_score == STRING_DIST_INF) return vector(0); if (paths.size() > 1 && paths.size() < 10) { DEBUG("best score found! Path " <second.sorted_positions[iter->second.first_trustable_index]; auto last_cluster = iter->second.sorted_positions[iter->second.last_trustable_index]; - omnigraph::MappingRange range(omnigraph::Range(first_cluster.read_position, last_cluster.read_position), - omnigraph::Range(first_cluster.edge_position, last_cluster.edge_position)); + omnigraph::MappingRange range(Range(first_cluster.read_position, last_cluster.read_position), + Range(first_cluster.edge_position, last_cluster.edge_position)); result.join({iter->second.edgeId, range}); } @@ -861,7 +842,7 @@ }; template -typename PacBioMappingIndex::MappingDescription PacBioMappingIndex::Locate(const Sequence &s) const { +typename PacBioMappingIndex::MappingDescription PacBioMappingIndex::GetSeedsFromRead(const Sequence &s) const { MappingDescription res; //WARNING: removed read_count from here to make const methods int local_read_count = 0; @@ -881,12 +862,14 @@ auto keys = tmp_index.get(kwh); TRACE("Valid key, size: "<< keys.size()); + int quality = (int) keys.size(); + if (quality > 1000) { + DEBUG ("Ignoring repretive kmer") + continue; + } for (auto iter = keys.begin(); iter != keys.end(); ++iter) { - int quality = (int) keys.size(); TRACE("and quality:" << quality); - if (banned_kmers.find(Sequence(kwh.key())) != banned_kmers.end()) - continue; int offset = (int)iter->offset; int s_stretched = int ((double)s.size() * 1.2 + 50); int edge_len = int(g_.length(iter->edge_id)); diff -Nru spades-3.10.1+dfsg/src/common/modules/alignment/rna/ss_coverage_filler.hpp spades-3.11.1+dfsg/src/common/modules/alignment/rna/ss_coverage_filler.hpp --- spades-3.10.1+dfsg/src/common/modules/alignment/rna/ss_coverage_filler.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/alignment/rna/ss_coverage_filler.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -0,0 +1,62 @@ +// +// Created by andrey on 23.05.17. 
+//
+
+#pragma once
+
+#include
+#include "modules/alignment/rna/ss_coverage.hpp"
+
+namespace debruijn_graph {
+
+class SSCoverageFiller: public SequenceMapperListener {
+private:
+    const Graph& g_;
+
+    SSCoverageStorage& storage_;
+
+    std::vector<SSCoverageStorage> tmp_storages_;
+
+    bool symmetric_;
+
+    void ProcessRange(size_t thread_index, const MappingPath<EdgeId>& read) {
+        for (size_t i = 0; i < read.size(); ++i) {
+            const auto& range = read[i].second;
+            size_t kmer_count = range.mapped_range.end_pos - range.mapped_range.start_pos;
+            tmp_storages_[thread_index].IncreaseKmerCount(read[i].first, kmer_count, symmetric_);
+        }
+    }
+public:
+    SSCoverageFiller(const Graph& g, SSCoverageStorage& storage, bool symmertic = false):
+        g_(g), storage_(storage), tmp_storages_(), symmetric_(symmertic) {}
+
+    void StartProcessLibrary(size_t threads_count) override {
+        tmp_storages_.clear();
+
+        for (size_t i = 0; i < threads_count; ++i) {
+            tmp_storages_.emplace_back(g_);
+        }
+    }
+
+    void StopProcessLibrary() override {
+        for (auto& storage : tmp_storages_)
+            storage.Clear();
+        storage_.RecalculateCoverage();
+    }
+
+    void ProcessSingleRead(size_t thread_index, const io::SingleRead& /* r */, const MappingPath<EdgeId>& read) override {
+        ProcessRange(thread_index, read);
+    }
+
+    void ProcessSingleRead(size_t thread_index, const io::SingleReadSeq& /* r */, const MappingPath<EdgeId>& read) override {
+        ProcessRange(thread_index, read);
+    }
+
+    void MergeBuffer(size_t thread_index) override {
+        for (const auto& it : tmp_storages_[thread_index])
+            storage_.IncreaseKmerCount(it.first, size_t(it.second));
+        tmp_storages_[thread_index].Clear();
+    }
+};
+
+}
\ No newline at end of file
diff -Nru spades-3.10.1+dfsg/src/common/modules/alignment/rna/ss_coverage.hpp spades-3.11.1+dfsg/src/common/modules/alignment/rna/ss_coverage.hpp
--- spades-3.10.1+dfsg/src/common/modules/alignment/rna/ss_coverage.hpp 1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/modules/alignment/rna/ss_coverage.hpp 2017-09-28 09:05:09.000000000 +0000
@@ -0,0 +1,61 @@
+//
+// Created by andrey on 22.05.17.
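Together these two new headers implement strand-specific RNA-Seq coverage: SSCoverageFiller accumulates per-edge kmer counts into a per-thread SSCoverageStorage, merges the buffers, and RecalculateCoverage() finally divides each count by its edge length. A toy model of that accumulate-then-normalize arithmetic (plain ints stand in for graph edges here; this is not the real Graph API):

#include <cassert>
#include <unordered_map>

int main() {
    std::unordered_map<int, double> storage;                  // edge id -> accumulated kmer count
    std::unordered_map<int, double> edge_len = {{1, 100.0}};  // edge id -> edge length
    storage[1] += 150;                                        // IncreaseKmerCount(e, 150)
    storage[1] += 50;                                         // a second read on the same edge
    for (auto& it : storage)                                  // RecalculateCoverage()
        it.second /= edge_len[it.first];
    assert(storage[1] == 2.0);                                // 200 kmers over a 100 bp edge
}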
+//
+
+#pragma once
+
+#include
+
+namespace debruijn_graph {
+
+
+class SSCoverageStorage {
+public:
+    typedef std::unordered_map<EdgeId, double> InnerMap;
+
+private:
+    const Graph& g_;
+
+    InnerMap storage_;
+
+public:
+    SSCoverageStorage(const Graph& g): g_(g), storage_() {}
+
+    double GetCoverage(EdgeId e, bool reverse = false) const {
+        if (reverse) {
+            e = g_.conjugate(e);
+        }
+
+        auto it = storage_.find(e);
+        if (it == storage_.end())
+            return 0.0;
+        return it->second;
+    }
+
+    void IncreaseKmerCount(EdgeId e, size_t count, bool add_reverse = false) {
+        storage_[e] += (double) count;
+        if (add_reverse)
+            storage_[g_.conjugate(e)] += (double) count;
+    }
+
+    void Clear() {
+        storage_.clear();
+    }
+
+    void RecalculateCoverage() {
+        for(auto& it : storage_) {
+            it.second = it.second / double(g_.length(it.first));
+        }
+    }
+
+    InnerMap::const_iterator begin() const {
+        return storage_.begin();
+    }
+
+    InnerMap::const_iterator end() const {
+        return storage_.end();
+    }
+};
+
+
+}
\ No newline at end of file
diff -Nru spades-3.10.1+dfsg/src/common/modules/alignment/sequence_mapper.hpp spades-3.11.1+dfsg/src/common/modules/alignment/sequence_mapper.hpp
--- spades-3.10.1+dfsg/src/common/modules/alignment/sequence_mapper.hpp 2017-02-28 14:55:36.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/modules/alignment/sequence_mapper.hpp 2017-09-28 09:05:10.000000000 +0000
@@ -10,19 +10,18 @@
 #include "assembly_graph/paths/mapping_path.hpp"
 #include "assembly_graph/paths/path_processor.hpp"
 #include "sequence/sequence_tools.hpp"
-#include "common/assembly_graph/core/basic_graph_stats.hpp"
+#include "assembly_graph/core/basic_graph_stats.hpp"
 #include "edge_index.hpp"
 #include "kmer_mapper.hpp"
 
 #include
-#include "common/assembly_graph/core/basic_graph_stats.hpp"
+#include "assembly_graph/core/basic_graph_stats.hpp"
 
 namespace debruijn_graph {
 
 using omnigraph::MappingPath;
 using omnigraph::Path;
 using omnigraph::MappingRange;
-using omnigraph::Range;
 
 template
 MappingPath ConjugateMapping(const Graph& g,
diff -Nru spades-3.10.1+dfsg/src/common/modules/alignment/sequence_mapper_notifier.hpp spades-3.11.1+dfsg/src/common/modules/alignment/sequence_mapper_notifier.hpp
--- spades-3.10.1+dfsg/src/common/modules/alignment/sequence_mapper_notifier.hpp 2017-02-28 14:55:36.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/modules/alignment/sequence_mapper_notifier.hpp 2017-09-28 09:05:10.000000000 +0000
@@ -8,12 +8,12 @@
 #ifndef SEQUENCE_MAPPER_NOTIFIER_HPP_
 #define SEQUENCE_MAPPER_NOTIFIER_HPP_
 
-#include "utils/memory_limit.hpp"
 #include "sequence_mapper.hpp"
 #include "short_read_mapper.hpp"
 #include "io/reads/paired_read.hpp"
 #include "io/reads/read_stream_vector.hpp"
 #include "pipeline/graph_pack.hpp"
+#include "common/utils/perf/memory_limit.hpp"
 
 #include
 #include
 
@@ -43,14 +43,13 @@
 public:
     typedef SequenceMapper SequenceMapperT;
 
-    SequenceMapperNotifier(const conj_graph_pack& gp)
-            : gp_(gp) { }
+    typedef std::vector<SequenceMapperListener*> ListenersContainer;
+
+    SequenceMapperNotifier(const conj_graph_pack& gp, size_t lib_count)
+            : gp_(gp), listeners_(lib_count) { }
 
     void Subscribe(size_t lib_index, SequenceMapperListener* listener) {
-        while ((int)lib_index >= (int)listeners_.size() - 1) {
-            std::vector<SequenceMapperListener*> vect;
-            listeners_.push_back(vect);
-        }
+        VERIFY(lib_index < listeners_.size());
         listeners_[lib_index].push_back(listener);
     }
 
@@ -63,7 +62,7 @@
         streams.reset();
         NotifyStartProcessLibrary(lib_index, threads_count);
         size_t counter = 0, n = 15;
-        size_t fmem = get_free_memory();
+        size_t fmem = utils::get_free_memory();
 
 #pragma omp parallel for
num_threads(threads_count) shared(counter) for (size_t i = 0; i < streams.size(); ++i) { @@ -74,7 +73,7 @@ if (size == BUFFER_SIZE || // Stop filling buffer if the amount of available is smaller // than half of free memory. - (10 * get_free_memory() / 4 < fmem && size > 10000)) { + (10 * utils::get_free_memory() / 4 < fmem && size > 10000)) { #pragma omp critical { counter += size; diff -Nru spades-3.10.1+dfsg/src/common/modules/alignment/short_read_mapper.hpp spades-3.11.1+dfsg/src/common/modules/alignment/short_read_mapper.hpp --- spades-3.10.1+dfsg/src/common/modules/alignment/short_read_mapper.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/alignment/short_read_mapper.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -9,7 +9,7 @@ #include "sequence_mapper.hpp" -#include "common/modules/alignment/pacbio/pac_index.hpp" +#include "modules/alignment/pacbio/pac_index.hpp" #include "modules/alignment/bwa_sequence_mapper.hpp" namespace debruijn_graph { diff -Nru spades-3.10.1+dfsg/src/common/modules/coverage_model/CMakeLists.txt spades-3.11.1+dfsg/src/common/modules/coverage_model/CMakeLists.txt --- spades-3.10.1+dfsg/src/common/modules/coverage_model/CMakeLists.txt 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/coverage_model/CMakeLists.txt 2017-09-28 09:05:09.000000000 +0000 @@ -0,0 +1,14 @@ +############################################################################ +# Copyright (c) 2015 Saint Petersburg State University +# Copyright (c) 2011-2014 Saint Petersburg Academic University +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +project(coverage_model CXX) + +add_library(coverage_model STATIC + kmer_coverage_model.cpp) + +target_link_libraries(coverage_model nlopt) + diff -Nru spades-3.10.1+dfsg/src/common/modules/coverage_model/kmer_coverage_model.cpp spades-3.11.1+dfsg/src/common/modules/coverage_model/kmer_coverage_model.cpp --- spades-3.10.1+dfsg/src/common/modules/coverage_model/kmer_coverage_model.cpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/coverage_model/kmer_coverage_model.cpp 2017-09-28 09:05:09.000000000 +0000 @@ -0,0 +1,378 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "kmer_coverage_model.hpp" + +#include "utils/logger/logger.hpp" +#include "utils/verify.hpp" +#include "math/xmath.h" +#include "math/smooth.hpp" + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace coverage_model { + +using std::isfinite; + +static const size_t MaxCopy = 10; + +static double dzeta(double x, double p) { + return pow(x, -p - 1) / boost::math::zeta(p + 1); +} + +static double perr(size_t i, double scale, double shape) { + return pow((1 + shape * ((double) (i - 1)) / scale), -1.0 / shape) - + pow((1 + shape * ((double) i) / scale), -1.0 / shape); +} + +static double pgood(size_t i, double zp, double u, double sd, double shape, + double* mixprobs = NULL) { + double res = 0; + + for (unsigned copy = 0; copy < MaxCopy; ++copy) { + boost::math::skew_normal snormal((copy + 1) * u, sd * sqrt(copy + 1), shape); + // res += (mixprobs ? 
mixprobs[copy] : dzeta(copy + 1, zp)) * (boost::math::cdf(snormal, i + 1) - boost::math::cdf(snormal, i)); + res += (mixprobs ? mixprobs[copy] : dzeta(copy + 1, zp)) * boost::math::pdf(snormal, i); + } + + return res; +} + +class CovModelLogLike { + const std::vector& cov; + +public: + CovModelLogLike(const std::vector& cov) + : cov(cov) {} + + int getN() const { return 7; }; + +private: + + double eval_(const double* x) const { + double zp = x[0], p = x[1], shape = x[2], u = x[3], sd = x[4], scale = x[5], shape2 = x[6]; + + if (zp <= 1 || shape <= 0 || sd <= 0 || p < 1e-9 || p > 1 - 1e-9 || u <= 0 || scale <= 0 || + !isfinite(zp) || !isfinite(shape) || !isfinite(sd) || !isfinite(p) || !isfinite(u) || + !isfinite(scale) || !isfinite(shape2)) + return +std::numeric_limits::infinity(); + + std::vector kmer_probs(cov.size()); + + // Error + for (size_t i = 0; i < kmer_probs.size(); ++i) + kmer_probs[i] += p * perr(i + 1, scale, shape); + + // Good + for (size_t i = 0; i < kmer_probs.size(); ++i) + kmer_probs[i] += (1 - p) * pgood(i + 1, zp, u, sd, shape2); + + double res = 0; + for (size_t i = 0; i < kmer_probs.size(); ++i) + res += (double) (cov[i]) * log(kmer_probs[i]); + + return -res; + } +}; + +struct CovModelLogLikeEMData { + const std::vector& cov; + const std::vector& z; +}; + +static double CovModelLogLikeEM(unsigned, const double* x, double*, void* data) { + double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5]; + + // INFO("Entry: " << x[0] << " " << x[1] << " " << x[2] << " " << x[3] << " " << x[4]); + + if (zp <= 1 || shape <= 0 || sd <= 0 || u <= 0 || scale <= 0 || + !isfinite(zp) || !isfinite(shape) || !isfinite(sd) || !isfinite(u) || + !isfinite(scale) || !isfinite(shape2)) + return -std::numeric_limits::infinity(); + + const std::vector& cov = static_cast(data)->cov; + const std::vector& z = static_cast(data)->z; + + std::vector kmer_probs(cov.size(), 0); + + // Error + for (size_t i = 0; i < kmer_probs.size(); ++i) { + if (cov[i] == 0) + continue; + + kmer_probs[i] += z[i] * log(perr(i + 1, scale, shape)); + } + + // Good + // Pre-compute mixing probabilities + std::vector mixprobs(MaxCopy, 0); + for (unsigned copy = 0; copy < MaxCopy; ++copy) + mixprobs[copy] = dzeta(copy + 1, zp); + + // Compute the density + for (size_t i = 0; i < kmer_probs.size(); ++i) { + if (cov[i] == 0) + continue; + + double val = log(pgood(i + 1, zp, u, sd, shape2, &mixprobs[0])); + if (!isfinite(val)) + val = -1000.0; + kmer_probs[i] += (1 - z[i]) * val; + } + + double res = 0; + for (size_t i = 0; i < kmer_probs.size(); ++i) + res += (double) (cov[i]) * kmer_probs[i]; + + // INFO("f: " << res); + return res; +} + + +static std::vector EStep(const std::vector& x, + double p, size_t N) { + double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5]; + + std::vector res(N); + for (size_t i = 0; i < N; ++i) { + double pe = p * perr(i + 1, scale, shape); + res[i] = pe / (pe + (1 - p) * pgood(i + 1, zp, u, sd, shape2)); + if (!isfinite(res[i])) + res[i] = 1.0; + } + + return res; +} + +// Estimate the coverage mean by finding the max past the +// first valley. 
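EStep() above is the E-step of the EM fit: the posterior probability that a kmer of multiplicity i is erroneous is p*perr(i) divided by the full mixture density p*perr(i) + (1-p)*pgood(i). A standalone illustration with simple stand-in densities (not the zeta-weighted skew-normal mixture used in this file):

#include <cmath>
#include <cstdio>

int main() {
    double p = 0.2;                            // prior probability of an erroneous kmer
    for (int i = 1; i <= 8; ++i) {
        double perr  = std::pow(2.0, -i);      // stand-in: errors favor low multiplicity
        double pgood = std::exp(-std::pow(i - 5.0, 2) / 2.0) / std::sqrt(2 * M_PI);
        double z = p * perr / (p * perr + (1 - p) * pgood);
        std::printf("cov %d: P(error) = %.3f\n", i, z);  // falls toward the coverage peak
    }
}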
+// Estimate the coverage mean by finding the max past the
+// first valley.
+size_t KMerCoverageModel::EstimateValley() const {
+    // Smooth the histogram
+    std::vector<size_t> scov;
+    math::Smooth3RS3R(scov, cov_);
+
+    size_t Valley = scov[0];
+
+    // Start finding the valley (bounds check first to avoid reading past the end)
+    size_t Idx = 1;
+    while (Idx < scov.size() && scov[Idx] < Valley) {
+        Valley = scov[Idx];
+        Idx += 1;
+    }
+    Idx -= 1;
+
+    INFO("Kmer coverage valley at: " << Idx);
+
+    return Idx;
+}
+
+void KMerCoverageModel::Fit() {
+    VERIFY_MSG(cov_.size() > 10, "Invalid kmer coverage histogram, make sure that the coverage is indeed uniform");
+
+    // Find the minimal coverage point using smoothed histogram.
+    Valley_ = EstimateValley();
+
+    // First estimate of coverage is the first maximum after the valley.
+    MaxCov_ = Valley_ + 1;
+    size_t MaxHist = cov_[MaxCov_];
+    for (size_t i = Valley_ + 1; i < cov_.size(); ++i) {
+        if (cov_[i] > MaxHist) {
+            MaxHist = cov_[i];
+            MaxCov_ = i;
+        }
+    }
+    INFO("K-mer histogram maximum: " << MaxCov_);
+
+    // Refine the estimate via median
+    size_t AfterValley = 0, SecondValley = std::min(2 * MaxCov_ - Valley_, cov_.size());
+    for (size_t i = Valley_ + 1; i < SecondValley; ++i)
+        AfterValley += cov_[i];
+
+    size_t ccov = 0;
+    for (size_t i = Valley_ + 1; i < SecondValley; ++i) {
+        if (ccov > AfterValley / 2) {
+            MaxCov_ = std::max(i, MaxCov_);
+            break;
+        }
+        ccov += cov_[i];
+    }
+
+    if (MaxCov_ - Valley_ < 3)
+        WARN("Too many erroneous kmers, the estimates might be unreliable");
+
+    std::vector<size_t> mvals(1 + MaxCov_ - Valley_);
+    mvals[0] = cov_[MaxCov_];
+    size_t tmadcov = mvals[0];
+    for (size_t i = 1; i < std::min(MaxCov_ - Valley_, cov_.size() - MaxCov_); ++i) {
+        mvals[i] = cov_[MaxCov_ + i] + cov_[MaxCov_ - i];
+        tmadcov += mvals[i];
+    }
+    size_t madcov = 0;
+    double CovSd = sqrt((double) (5 * MaxCov_));
+    for (size_t i = 0; i < MaxCov_ - Valley_; ++i) {
+        if (madcov > tmadcov / 2) {
+            CovSd = (double) i;
+            break;
+        }
+        madcov += mvals[i];
+    }
+    CovSd *= 1.4826;
+    INFO("Estimated median coverage: " << MaxCov_ << ". Coverage mad: " << CovSd);
+
+    // Estimate error probability as ratio of kmers before the valley.
+    size_t BeforeValley = 0, Total = 0;
+    double ErrorProb = 0;
+    for (size_t i = 0; i < cov_.size(); ++i) {
+        if (i <= Valley_)
+            BeforeValley += cov_[i];
+        Total += cov_[i];
+    }
+    ErrorProb = (double) BeforeValley / (double) Total;
+    // Allow some erroneous / good kmers.
+    ErrorProb = std::min(1 - 1e-3, ErrorProb);
+    ErrorProb = std::max(1e-3, ErrorProb);
+
+    TRACE("Total: " << Total << ". Before: " << BeforeValley);
+    TRACE("p: " << ErrorProb);
+
+    std::vector<double> x = {3.0, 3.0, (double) MaxCov_, CovSd, 1.0, 0.0},
+            lb = {0.0, 0.0, 0.0, (double) (MaxCov_ - Valley_), 0.0, -6.0},
+            ub = {2000.0, 2000.0, (double) (2 * MaxCov_), (double) SecondValley, 2000.0, 6.0};
+
+    INFO("Fitting coverage model");
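The 1.4826 factor above is the standard consistency constant that makes the median absolute deviation (MAD) estimate a normal standard deviation; the surrounding loops are just a histogram-weighted median/MAD. The same idea on a plain sample, as a self-contained editor's sketch (the data is invented, not from the patch):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static double Median(std::vector<double> v) {
    // Partial sort is enough to place the middle element correctly.
    std::nth_element(v.begin(), v.begin() + v.size() / 2, v.end());
    return v[v.size() / 2];
}

int main() {
    std::vector<double> sample = {22, 24, 25, 25, 26, 27, 29, 80};  // one outlier
    double med = Median(sample);
    std::vector<double> dev;
    for (double x : sample)
        dev.push_back(std::fabs(x - med));
    // 1.4826 * MAD is a robust stand-in for the standard deviation,
    // matching the CovSd computation above.
    std::printf("median %.1f, robust sd %.2f\n", med, 1.4826 * Median(dev));
    return 0;
}

Unlike the plain standard deviation, which the single outlier inflates to roughly 19, the MAD-based estimate stays near 3; that robustness is why Fit() uses it to seed the optimizer.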
+    // Ensure that there will be at least 2 iterations.
+    double PrevErrProb = 2;
+    const double ErrProbThr = 1e-8;
+    auto GoodCov = cov_;
+    GoodCov.resize(std::min(cov_.size(), 5 * MaxCopy * MaxCov_ / 4));
+    converged_ = true;
+    unsigned it = 1;
+    while (fabs(PrevErrProb - ErrorProb) > ErrProbThr) {
+        // Recalculate the vector of posterior error probabilities
+        std::vector<double> z = EStep(x, ErrorProb, GoodCov.size());
+
+        // Recalculate the probability of error
+        PrevErrProb = ErrorProb;
+        ErrorProb = 0;
+        for (size_t i = 0; i < GoodCov.size(); ++i)
+            ErrorProb += z[i] * (double) GoodCov[i];
+        ErrorProb /= (double) Total;
+
+        bool LastIter = fabs(PrevErrProb - ErrorProb) <= ErrProbThr;
+
+        nlopt::opt opt(nlopt::LN_NELDERMEAD, 6);
+        CovModelLogLikeEMData data = {GoodCov, z};
+        opt.set_max_objective(CovModelLogLikeEM, &data);
+        if (!LastIter)
+            opt.set_maxeval(5 * 6 * it);
+        opt.set_xtol_rel(1e-8);
+        opt.set_ftol_rel(1e-8);
+
+        double fMin;
+        nlopt::result Results = nlopt::FAILURE;
+        try {
+            Results = opt.optimize(x, fMin);
+        } catch (nlopt::roundoff_limited&) {
+        }
+
+        VERBOSE_POWER_T2(it, 1, "... iteration " << it);
+        TRACE("Results: ");
+        TRACE("Converged: " << Results << " " << "F: " << fMin);
+
+        double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
+        TRACE("zp: " << zp << " p: " << ErrorProb << " shape: " << shape << " u: " << u << " sd: " << sd <<
+              " scale: " << scale << " shape2: " << shape2);
+
+        it += 1;
+    }
+
+    double delta = x[5] / sqrt(1 + x[5] * x[5]);
+    mean_coverage_ = x[2] + x[3] * delta * sqrt(2 / M_PI);
+    sd_coverage_ = x[3] * sqrt(1 - 2 * delta * delta / M_PI);
+    INFO("Fitted mean coverage: " << mean_coverage_ << ". Fitted coverage std. dev: " << sd_coverage_);
+
+    // Now let us check whether we have sane results
+    for (size_t i = 0; i < x.size(); ++i)
+        if (!isfinite(x[i])) {
+            converged_ = false;
+            break;
+        }
+
+    if (!isfinite(ErrorProb))
+        converged_ = false;
+
+    // See if we can deduce a proper threshold
+
+    // First, check whether initial estimate of Valley was sane.
+    ErrorThreshold_ = 0;
+    if (converged_ && Valley_ > x[2] && x[2] > 2) {
+        Valley_ = (size_t) math::round(x[2] / 2.0);
+        WARN("Valley value was estimated improperly, reset to " << Valley_);
+    }
+
+    // If the model converged, then use it to estimate the thresholds.
+    if (converged_) {
+        std::vector<double> z = EStep(x, ErrorProb, GoodCov.size());
+
+        INFO("Probability of erroneous kmer at valley: " << z[Valley_]);
+        converged_ = false;
+        for (size_t i = 0; i < z.size(); ++i)
+            if (z[i] > strong_probability_threshold_) //0.999
+                LowThreshold_ = std::min(i + 1, Valley_);
+            else if (z[i] < probability_threshold_) {//0.05?
+                ErrorThreshold_ = std::max(i + 1, Valley_);
+                converged_ = true;
+                break;
+            }
+
+#if 0
+        for (size_t i = 0; i < z.size(); ++i) {
+            double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
+            double pe = ErrorProb * perr(i + 1, scale, shape);
+            double pg = (1 - ErrorProb) * pgood(i + 1, zp, u, sd, shape2);
+
+            fprintf(stderr, "%e %e %e %e\n", pe, pg, z[i], perr(i + 1, scale, shape));
+        }
+#endif
+    }
+
+    // See if we have a sane ErrorThreshold_, falling back to something conservative if not.
+    if (converged_) {
+        INFO("Preliminary threshold calculated as: " << ErrorThreshold_);
+        ErrorThreshold_ = (Valley_ < mean_coverage_ ?
+                           std::min(Valley_ + (size_t) (mean_coverage_ - (double) Valley_) / 2, ErrorThreshold_) :
+                           Valley_);
+        INFO("Threshold adjusted to: " << ErrorThreshold_);
+    } else {
+        ErrorThreshold_ = Valley_;
+        LowThreshold_ = 1;
+        WARN("Failed to determine erroneous kmer threshold. 
Threshold set to: " << ErrorThreshold_); + } + + // Now the bonus: estimate the genome size! + GenomeSize_ = 0; + for (size_t i = ErrorThreshold_ - 1; i < GoodCov.size(); ++i) + GenomeSize_ += GoodCov[i]; + GenomeSize_ /= 2; + + INFO("Estimated genome size (ignoring repeats): " << GenomeSize_); +} + +} diff -Nru spades-3.10.1+dfsg/src/common/modules/coverage_model/kmer_coverage_model.hpp spades-3.11.1+dfsg/src/common/modules/coverage_model/kmer_coverage_model.hpp --- spades-3.10.1+dfsg/src/common/modules/coverage_model/kmer_coverage_model.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/coverage_model/kmer_coverage_model.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -0,0 +1,46 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include +#include + +namespace coverage_model { + +class KMerCoverageModel { + const std::vector& cov_; + size_t MaxCov_, Valley_, ErrorThreshold_, LowThreshold_, GenomeSize_; + double probability_threshold_, strong_probability_threshold_, mean_coverage_, sd_coverage_; + bool converged_; + +public: + KMerCoverageModel(const std::vector& cov, double probability_threshold, + double strong_probability_threshold) + : cov_(cov), LowThreshold_(0), probability_threshold_(probability_threshold), + strong_probability_threshold_(strong_probability_threshold), + mean_coverage_(0.0), sd_coverage_(0.0), converged_(false) {} + + void Fit(); + + size_t GetErrorThreshold() const { return ErrorThreshold_; } + + size_t GetLowThreshold() const { return LowThreshold_; } + + size_t GetGenomeSize() const { return GenomeSize_; } + + double GetMeanCoverage() const { return mean_coverage_; } + + double GetSdCoverage() const { return sd_coverage_; } + + bool converged() const { return converged_; } + +private: + size_t EstimateValley() const; +}; + +} diff -Nru spades-3.10.1+dfsg/src/common/modules/genome_consistance_checker.cpp spades-3.11.1+dfsg/src/common/modules/genome_consistance_checker.cpp --- spades-3.10.1+dfsg/src/common/modules/genome_consistance_checker.cpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/genome_consistance_checker.cpp 2017-09-28 09:05:09.000000000 +0000 @@ -1,174 +1,315 @@ #include "modules/genome_consistance_checker.hpp" +#include "modules/path_extend/paired_library.hpp" #include "assembly_graph/core/graph.hpp" #include +#include #include + namespace debruijn_graph { using omnigraph::MappingRange; using namespace std; //gap or overlap size. WITHOUT SIGN! -static size_t gap(const Range &a, const Range &b) { - return max(a.end_pos, b.start_pos) - min (a.end_pos, b.start_pos); +size_t AbsGap(const Range &a, const Range &b) { + return max(a.end_pos, b.start_pos) - min(a.end_pos, b.start_pos); } -bool GenomeConsistenceChecker::consequent(const Range &mr1, const Range &mr2) const{ - if (mr1.end_pos > mr2.start_pos + absolute_max_gap_) - return false; - if (mr1.end_pos + absolute_max_gap_ < mr2.start_pos) - return false; - return true; -} -bool GenomeConsistenceChecker::consequent(const MappingRange &mr1, const MappingRange &mr2) const { +bool GenomeConsistenceChecker::Consequent(const MappingRange &mr1, const MappingRange &mr2) const { //do not want to think about handling gaps near 0 position. 
- if (!consequent(mr1.initial_range, mr2.initial_range) || !consequent(mr1.mapped_range, mr2.mapped_range)) - return false; - size_t initial_gap = gap(mr1.initial_range, mr2.initial_range); - size_t mapped_gap = gap(mr1.mapped_range, mr2.mapped_range); - size_t max_gap = max(initial_gap, mapped_gap); - if ( max_gap > relative_max_gap_* double (max (min(mr1.initial_range.size(), mr1.mapped_range.size()), min(mr2.initial_range.size(), mr2.mapped_range.size())))) + size_t max_gap = max(AbsGap(mr1.initial_range, mr2.initial_range), + AbsGap(mr1.mapped_range, mr2.mapped_range)); + + if (max_gap > absolute_max_gap_) return false; - return true; + size_t len = max(min(mr1.initial_range.size(), mr1.mapped_range.size()), + min(mr2.initial_range.size(), mr2.mapped_range.size())); + return max_gap <= size_t(math::round(relative_max_gap_* double(len))); } PathScore GenomeConsistenceChecker::CountMisassemblies(const BidirectionalPath &path) const { - PathScore straight = CountMisassembliesWithStrand(path, "0"); - PathScore reverse = CountMisassembliesWithStrand(path, "1"); + PathScore score = InternalCountMisassemblies(path); + if (path.Size() == 0) { + WARN ("0 length path in GCChecker!!!"); + return PathScore(0,0,0); + } size_t total_length = path.LengthAt(0); //TODO: constant; - if (total_length > std::max(straight.mapped_length, reverse.mapped_length) * 2) { - if (total_length > 10000) { + if (total_length > score.mapped_length * 2) { + if (total_length > SIGNIFICANT_LENGTH_LOWER_LIMIT) { INFO ("For path length " << total_length <<" mapped less than half of the path, skipping"); } return PathScore(0,0,0); } else { - if (straight.mapped_length > reverse.mapped_length) { - return straight; - } else { - return reverse; - } + return score; } } -vector > GenomeConsistenceChecker::ConstructEdgeOrder() const { - vector > to_sort; - for(auto e: storage_) { - if (excluded_unique_.find(e) == excluded_unique_.end() ) { - set mappings = gp_.edge_pos.GetEdgePositions(e, "fxd0"); - if (mappings.size() > 1) { - INFO("edge " << e << "smth strange"); - } else if (mappings.size() == 0) { - continue; - } else { - to_sort.push_back(make_pair(e, *mappings.begin())); - } +MappingPath GenomeConsistenceChecker::ConstructEdgeOrder(const string& chr_name) const { + vector> to_sort; + DEBUG ("constructing edge order for chr " << chr_name); + for (auto e: storage_) { + set mappings = gp_.edge_pos.GetEdgePositions(e, chr_name); + VERIFY_MSG(mappings.size() <= 1, "Presumably unique edge " << e << " with multiple mappings!"); + if (!mappings.empty()) { + to_sort.push_back(make_pair(e, *mappings.begin())); } } - sort(to_sort.begin(), to_sort.end(), [](const pair & a, const pair & b) -> bool - { + DEBUG("Sorting " << to_sort << " positions:"); + sort(to_sort.begin(), to_sort.end(), + [](const pair & a, const pair & b) { return a.second.initial_range.start_pos < b.second.initial_range.start_pos; - } - ); - return to_sort; + }); + return MappingPathT(to_sort); } +void GenomeConsistenceChecker::ReportEdge(EdgeId e, double w) const{ + INFO("Edge " << gp_.g.int_id(e) << " weight " << w << " len " << gp_.g.length(e) << " cov " << gp_.g.coverage(e)); + if (!genome_info_.Multiplicity(e)) { + INFO(" no chromosome position"); + } else { + auto info = genome_info_.UniqueChromosomeIdx(e); + INFO ("Chromosome " << info.first << " index " << info.second); + } +} -void GenomeConsistenceChecker::SpellGenome() { +void GenomeConsistenceChecker::ReportVariants(vector> &sorted_w) const { + sort(sorted_w.rbegin(), sorted_w.rend()); size_t count = 0; - 
auto to_sort = ConstructEdgeOrder(); - vector starts; - vector ends; - for(size_t i = 0; i 0 && to_sort[i].second.initial_range.start_pos - to_sort[i-1].second.initial_range.end_pos > storage_.GetMinLength() ) { - INFO ("Large gap " << to_sort[i].second.initial_range.start_pos - to_sort[i-1].second.initial_range.end_pos ); - starts.push_back(to_sort[i].second.initial_range.start_pos); - ends.push_back(to_sort[i-1].second.initial_range.end_pos); - } - if (i == 0) { - starts.push_back(to_sort[i].second.initial_range.start_pos); - } - if (i == to_sort.size() - 1){ - ends.push_back(to_sort[i].second.initial_range.end_pos); - } - INFO("edge " << gp_.g.int_id(to_sort[i].first) << " length "<< gp_.g.length(to_sort[i].first) << - " coverage " << gp_.g.coverage(to_sort[i].first) << " mapped to " << to_sort[i].second.mapped_range.start_pos - << " - " << to_sort[i].second.mapped_range.end_pos << " init_range " << to_sort[i].second.initial_range.start_pos << " - " << to_sort[i].second.initial_range.end_pos ); - genome_spelled_[to_sort[i].first] = count; + double additional_weight = 0; + size_t reporting = 4; + for (const auto pair: sorted_w) { + if (count == 0) { + INFO("First candidate:"); + } + if (count < reporting) { + ReportEdge(pair.second, pair.first); + } else { + additional_weight += pair.first; + } count++; } - vector lengths; - size_t total_len = 0; - for (size_t i = 0; i < starts.size(); i++) { - lengths.push_back(ends[i] - starts[i]); - total_len += lengths[i]; + if (reporting < sorted_w.size()) { + INFO("Additional weight " << additional_weight << " of " << sorted_w.size() - reporting << + " candidates"); + } + if (sorted_w.size() == 0) { + INFO("No uniqueness info"); } - sort(lengths.begin(), lengths.end()); - reverse(lengths.begin(), lengths.end()); - size_t cur = 0; - size_t i = 0; - while (cur < total_len / 2 && i < lengths.size()) { - cur += lengths[i]; - i++; +} + +void GenomeConsistenceChecker::ReportPathEndByLongLib(const path_extend::BidirectionalPathSet &covering_paths, + EdgeId current_edge) const { + vector> sorted_w; + for (const auto & cov_path: covering_paths) { + double w = cov_path->GetWeight(); + map next_weigths; + if (math::gr(w, 1.0)) { + for (size_t p_ind = 0; p_ind < cov_path->Size(); p_ind++) { + if (cov_path->At(p_ind) == current_edge) { + for (size_t p_ind2 = p_ind + 1; p_ind2 < cov_path->Size(); p_ind2++) { + if (gp_.g.length(cov_path->At(p_ind2)) >= storage_.min_length() ) { + next_weigths[cov_path->At(p_ind2)] += w; + } + } + break; + } + } + } + for (const auto &p: next_weigths) { + sorted_w.push_back(make_pair(p.second, p.first)); + } } - INFO("Assuming gaps of length > " << storage_.GetMinLength() << " unresolvable.."); - if (lengths.size() > 0) - INFO("Rough estimates on N50/L50:" << lengths[i - 1] << " / " << i - 1 << " with len " << total_len); + INFO("Looking on long reads, last long edge: "); + ReportVariants(sorted_w); } -PathScore GenomeConsistenceChecker::CountMisassembliesWithStrand(const BidirectionalPath &path, const string strand) const { - if (strand == "1") { - return (CountMisassembliesWithStrand(*path.GetConjPath(), "0")); +void GenomeConsistenceChecker::ReportPathEndByPairedLib(const shared_ptr paired_lib, + EdgeId current_edge) const { + vector> sorted_w; + set result; + paired_lib->FindJumpEdges(current_edge, result, std::numeric_limits::min(), std::numeric_limits::max(), + storage_.min_length()); + for (const auto e: result) { + double w = paired_lib->CountPairedInfo(current_edge, e, std::numeric_limits::min(), + 
std::numeric_limits::max()); + if (math::gr(w, 1.0)) + sorted_w.push_back(make_pair(w, e)); + } + INFO("Looking on lib IS " << paired_lib->GetIS()); + ReportVariants(sorted_w); +} + +void GenomeConsistenceChecker::CheckPathEnd(const BidirectionalPath &path) const { + for (int i = (int)path.Size() - 1; i >= 0; --i) { + if (storage_.IsUnique(path.At(i))) { + EdgeId current_edge = path.At(i); + if (genome_info_.Multiplicity(current_edge)) { + const auto &chr_info = genome_info_.UniqueChromosomeInfo(current_edge); + size_t index = chr_info.UniqueEdgeIdx(current_edge); + if (index == 0 || index == chr_info.size()) { + DEBUG("Path length " << path.Length() << " ended at the chromosome " << chr_info.name() + << (index == 0 ? " start": " end")); + return; + } + } + INFO("Path length " << path.Length() << " ended, last unique: "); + ReportEdge(current_edge, -1.0); + for (size_t lib_index = 0; lib_index < reads_.lib_count(); ++lib_index) { + const auto &lib = reads_[lib_index]; + if (lib.is_paired()) { + shared_ptr paired_lib; + if (lib.is_mate_pair()) + paired_lib = path_extend::MakeNewLib(gp_.g, lib, gp_.paired_indices[lib_index]); + else if (lib.type() == io::LibraryType::PairedEnd) + paired_lib = path_extend::MakeNewLib(gp_.g, lib, gp_.clustered_indices[lib_index]); + ReportPathEndByPairedLib(paired_lib, current_edge); + } else if (lib.is_long_read_lib()) { + ReportPathEndByLongLib(long_reads_cov_map_[lib_index].GetCoveringPaths(current_edge), current_edge); + } + } + return; + } } - PathScore res(0, 0, 0); - EdgeId prev; - size_t prev_in_genome = std::numeric_limits::max(); - size_t prev_in_path = std::numeric_limits::max(); - MappingRange prev_range; - for (int i = 0; i < (int) path.Size(); i++) { - if (genome_spelled_.find(path.At(i)) != genome_spelled_.end()) { - size_t cur_in_genome = genome_spelled_[path.At(i)]; - MappingRange cur_range = *gp_.edge_pos.GetEdgePositions(path.At(i), "fxd0").begin(); - if (prev_in_genome != std::numeric_limits::max()) { - if (cur_in_genome == prev_in_genome + 1) { - int dist_in_genome = (int) cur_range.initial_range.start_pos - (int) prev_range.initial_range.end_pos; - int dist_in_path = (int) path.LengthAt(prev_in_path) - (int) path.LengthAt(i) + (int) cur_range.mapped_range.start_pos - (int) prev_range.mapped_range.end_pos; - DEBUG("Edge " << prev.int_id() << " position in genome ordering: " << prev_in_genome); - DEBUG("Gap in genome / gap in path: " << dist_in_genome << " / " << dist_in_path); - if (size_t(abs(dist_in_genome - dist_in_path)) > absolute_max_gap_ && (dist_in_genome * (1 + relative_max_gap_) < dist_in_path || dist_in_path * (1 + relative_max_gap_) < dist_in_genome)) { +} - res.wrong_gap_size ++; +size_t GenomeConsistenceChecker::GetSupportingPathCount(EdgeId e1, EdgeId e2, size_t lib_index) const { + auto covering_paths = long_reads_cov_map_[lib_index].GetCoveringPaths(e1); + size_t res = 0; + for (const auto & cov_path: covering_paths) { + double w = cov_path->GetWeight(); + if (math::gr(w, 1.0)) { + for (size_t p_ind = 0; p_ind < cov_path->Size(); p_ind++) { + if (cov_path->At(p_ind) == e1) { + for (size_t p_ind2 = p_ind + 1; p_ind2 < cov_path->Size(); p_ind2++) { + if (storage_.IsUnique(cov_path->At(p_ind2))) { + if (e2 == cov_path->At(p_ind2)) + res += size_t(w); + break; + } } - } else { - if (path.At(i) != circular_edge_ && path.At(prev_in_path) != circular_edge_) - res.misassemblies++; - else - INFO("Skipping fake(circular) misassembly"); + break; } } - res.mapped_length += cur_range.mapped_range.size(); - prev = path.At(i); - 
prev_in_genome = cur_in_genome; - prev_range = cur_range; - prev_in_path = i; } } - if (prev_in_path != std::numeric_limits::max()) - DEBUG("Edge " << prev.int_id() << " position in genome ordering: " << prev_in_genome); return res; } -void GenomeConsistenceChecker::RefillPos() { - RefillPos("0"); - RefillPos("1"); + +void GenomeConsistenceChecker::PrintMisassemblyInfo(EdgeId e1, EdgeId e2) const { + VERIFY(genome_info_.Multiplicity(e1)); + VERIFY(genome_info_.Multiplicity(e2)); + const auto &chr_info1 = genome_info_.UniqueChromosomeInfo(e1); + const auto &chr_info2 = genome_info_.UniqueChromosomeInfo(e2); + size_t ind1 = chr_info1.UniqueEdgeIdx(e1); + size_t ind2 = chr_info2.UniqueEdgeIdx(e2); +//FIXME: checks, compliment_strands; + EdgeId true_next = chr_info1.EdgeAt((chr_info1.UniqueEdgeIdx(e1) + 1) % chr_info1.size()); + EdgeId true_prev = chr_info2.EdgeAt((chr_info2.UniqueEdgeIdx(e2) + chr_info2.size() - 1) % chr_info2.size()); + INFO("Next genomic edge " << true_next.int_id() << " len " << gp_.g.length(true_next) << " prev " << true_prev.int_id() << " len " << gp_.g.length(true_prev)); + if (chr_info1.name() == chr_info2.name() && ind1 < ind2) { + INFO("Same chromosome large forward jump misassembly"); + } else if (chr_info1.name() == chr_info2.name() && ind1 > ind2) { + INFO("Backward jump misassembly"); + } else if (chr_info1.name().substr(1) == chr_info2.name().substr(1)) { + string revers = (ind1 + ind2 + 2 > chr_info1.size() ? " backwards " : " forward " ); + INFO("Inversion" + revers + "misassembly, chr edge size " << chr_info1.size()); + } else if (ind1 + 1 == chr_info1.size() || ind2 == 0) { + string start_end = (ind2 == 0 ? " start " : " end "); + INFO("Chromosome " + start_end + " misassembly "); + } else { + INFO("Something else misassembly"); + } + for (size_t lib_index = 0; lib_index < reads_.lib_count(); ++lib_index) { + const auto &lib = reads_[lib_index]; + if (lib.is_paired()) { + shared_ptr paired_lib; + if (lib.is_mate_pair()) + paired_lib = path_extend::MakeNewLib(gp_.g, lib, gp_.paired_indices[lib_index]); + else if (lib.type() == io::LibraryType::PairedEnd) + paired_lib = path_extend::MakeNewLib(gp_.g, lib, gp_.clustered_indices[lib_index]); + INFO("for lib " << lib_index << " IS" << paired_lib->GetIS()); + INFO("Misassembly weight regardless of dists: " << paired_lib->CountPairedInfo(e1, e2, -1000000, 1000000)); + INFO("Next weight " << paired_lib->CountPairedInfo(e1, true_next, -1000000, 1000000)); + INFO("Prev weight " << paired_lib->CountPairedInfo(true_prev, e2, -1000000, 1000000)); + } else if (lib.is_long_read_lib()) { + INFO("for lib " << lib_index << " of long reads: "); + INFO("Misassembly weight " << GetSupportingPathCount(e1, e2 ,lib_index)); + INFO("Next weight " << GetSupportingPathCount(e1, true_next ,lib_index) ); + INFO("Prev weight " << GetSupportingPathCount(true_prev, e2 ,lib_index) ); + + } + } } +void GenomeConsistenceChecker::ClassifyPosition(size_t prev_pos, size_t cur_pos, + const BidirectionalPath & path, PathScore &res) const{ + EdgeId cur_e = path.At(cur_pos); + const auto& chr_info = genome_info_.UniqueChromosomeInfo(cur_e); + size_t cur_in_genome = chr_info.UniqueEdgeIdx(cur_e); + string cur_chr = chr_info.name(); + MappingRange cur_range = gp_.edge_pos.GetUniqueEdgePosition(cur_e, cur_chr); + EdgeId prev_e = path.At(prev_pos); + const auto& prev_chr_info = genome_info_.UniqueChromosomeInfo(prev_e); + size_t prev_in_genome = prev_chr_info.UniqueEdgeIdx(prev_e); + string prev_chr = prev_chr_info.name(); + MappingRange prev_range = 
gp_.edge_pos.GetUniqueEdgePosition(prev_e, prev_chr); + + res.mapped_length += cur_range.mapped_range.size(); + if (cur_in_genome == prev_in_genome + 1 && cur_chr == prev_chr) { + int dist_in_genome = (int) cur_range.initial_range.start_pos - (int) prev_range.initial_range.end_pos; + int dist_in_path = (int) path.LengthAt(prev_pos) - (int) path.LengthAt(cur_pos) + + (int) cur_range.mapped_range.start_pos - (int) prev_range.mapped_range.end_pos; + DEBUG("Edge " << prev_e.int_id() << " position in genome ordering: " << prev_in_genome); + DEBUG("Gap in genome / gap in path: " << dist_in_genome << " / " << dist_in_path); + if (size_t(abs(dist_in_genome - dist_in_path)) > absolute_max_gap_ && + (dist_in_genome * (1 + relative_max_gap_) < dist_in_path || + dist_in_path * (1 + relative_max_gap_) < dist_in_genome)) { + res.wrong_gap_size ++; + } + } else { + if (cur_chr == prev_chr && (circular_edges_.find(prev_e) != circular_edges_.end() || + circular_edges_.find(cur_e) != circular_edges_.end())) { + INFO("Skipping fake(circular) misassembly"); + } else if (cur_in_genome > prev_in_genome && cur_chr == prev_chr + && prev_range.initial_range.end_pos + SIGNIFICANT_LENGTH_LOWER_LIMIT > cur_range.initial_range.start_pos) { + INFO("Local misassembly between edges: "<::max(); + for (int i = 0; i < (int) path.Size(); i++) { +//const method, so at instead of [] + EdgeId e = path.At(i); + if (genome_info_.Multiplicity(e)) { + if (prev_pos != std::numeric_limits::max()) { + ClassifyPosition(prev_pos, i, path, res); + } + prev_pos = i; + } } + return res; } -void GenomeConsistenceChecker::FindBestRangeSequence(const set& old_mappings, vector& used_mappings) const { - vector to_process (old_mappings.begin(), old_mappings.end()); +vector GenomeConsistenceChecker::FindBestRangeSequence(const set& mappings) const { + vector to_process(mappings.begin(), mappings.end()); sort(to_process.begin(), to_process.end(), [](const MappingRange & a, const MappingRange & b) -> bool { return a.mapped_range.start_pos < b.mapped_range.start_pos; @@ -177,9 +318,9 @@ //max weight path in orgraph of mappings TRACE("constructing mapping graph" << sz << " vertices"); vector> consecutive_mappings(sz); - for(size_t i = 0; i < sz; i++) { + for (size_t i = 0; i < sz; i++) { for (size_t j = i + 1; j < sz; j++) { - if (consequent(to_process[i], to_process[j])) { + if (Consequent(to_process[i], to_process[j])) { consecutive_mappings[i].push_back(j); } else { if (to_process[j].mapped_range.start_pos > to_process[i].mapped_range.end_pos + absolute_max_gap_) { @@ -189,88 +330,217 @@ } } vector scores(sz), prev(sz); - for(size_t i = 0; i < sz; i++) { + for (size_t i = 0; i < sz; i++) { scores[i] = to_process[i].initial_range.size(); prev[i] = std::numeric_limits::max(); } - for(size_t i = 0; i < sz; i++) { + for (size_t i = 0; i < sz; i++) { for (size_t j = 0; j < consecutive_mappings[i].size(); j++) { TRACE(consecutive_mappings[i][j]); - if (scores[consecutive_mappings[i][j]] < scores[i] + to_process[consecutive_mappings[i][j]].initial_range.size()) { - scores[consecutive_mappings[i][j]] = scores[i] + to_process[consecutive_mappings[i][j]].initial_range.size(); + if (scores[consecutive_mappings[i][j]] < scores[i] + + to_process[consecutive_mappings[i][j]].initial_range.size()) { + scores[consecutive_mappings[i][j]] = scores[i] + + to_process[consecutive_mappings[i][j]].initial_range.size(); prev[consecutive_mappings[i][j]] = i; } } } size_t cur_max = 0; size_t cur_i = 0; - for(size_t i = 0; i < sz; i++) { + for (size_t i = 0; i < sz; i++) { 
if (scores[i] > cur_max) { cur_max = scores[i]; cur_i = i; } } - used_mappings.clear(); + + vector answer; while (cur_i != std::numeric_limits::max()) { - used_mappings.push_back(to_process[cur_i]); + answer.push_back(to_process[cur_i]); cur_i = prev[cur_i]; } - reverse(used_mappings.begin(), used_mappings.end()); -}; + reverse(answer.begin(), answer.end()); + return answer; +} -void GenomeConsistenceChecker::RefillPos(const string &strand, const EdgeId &e) { - set old_mappings = gp_.edge_pos.GetEdgePositions(e, strand); - TRACE("old mappings sz " << old_mappings.size() ); - size_t total_mapped = 0; - for (auto mp:old_mappings) { - total_mapped += mp.initial_range.size(); - } - if (total_mapped > (double) gp_.g.length(e) * 1.5) { - INFO ("Edge " << gp_.g.int_id(e) << "is not unique, excluding"); - excluded_unique_.insert(e); - return; - } -//TODO: support non-unique edges; - if (total_mapped < (double) gp_.g.length(e) * 0.5) { - DEBUG ("Edge " << gp_.g.int_id(e) << "is not mapped on strand "<< strand <<", not used"); - return; +map GenomeConsistenceChecker::EdgeLabels() const { + INFO("Constructing reference labels"); + map answer; + size_t count = 0; + for (const auto &chr: genome_info_.Chromosomes()) { + const auto &chr_info = genome_info_.ChrInfo(chr); + for (size_t pos = 0; pos < chr_info.size(); ++pos) { + EdgeId e = chr_info.EdgeAt(pos); + auto mr = gp_.edge_pos.GetUniqueEdgePosition(e, chr); + VERIFY(!answer.count(e)); + answer[e] += chr + + "order: " + to_string(count) + + "\n mapped range: " + + to_string(mr.mapped_range.start_pos) + " : " + + to_string(mr.mapped_range.end_pos) + + "\n init range: " + + to_string(mr.initial_range.start_pos) + " : " + + to_string(mr.initial_range.end_pos) + "\n"; + } + } + return answer; +} + +void GenomeConsistenceChecker::Fill() { + gp_.edge_pos.clear(); + if (!gp_.edge_pos.IsAttached()) { + gp_.edge_pos.Attach(); + } + + //FIXME set the parameters to something more reasonable + EdgesPositionHandler tmp_edge_pos(gp_.g, 0, 0); + visualization::position_filler::PosFiller pos_filler(gp_.g, MapperInstance(gp_), tmp_edge_pos); + + for (const auto &chr: gp_.genome.GetChromosomes()) { + pos_filler.Process(chr.sequence, "0_" + chr.name); + pos_filler.Process(ReverseComplement(chr.sequence), "1_" + chr.name); } - TRACE(total_mapped << " " << gp_.g.length(e)); - string new_strand = "fxd" + strand; - vector used_mappings; - FindBestRangeSequence(old_mappings, used_mappings); - size_t cur_i = 0; - MappingRange new_mapping; - new_mapping = used_mappings[cur_i]; - size_t used_mapped = new_mapping.initial_range.size(); - TRACE ("Edge " << gp_.g.int_id(e) << " length "<< gp_.g.length(e)); - TRACE ("new_mapping mp_range "<< new_mapping.mapped_range.start_pos << " - " << new_mapping.mapped_range.end_pos - << " init_range " << new_mapping.initial_range.start_pos << " - " << new_mapping.initial_range.end_pos ); - while (cur_i < used_mappings.size() - 1) { - cur_i ++; - used_mapped += used_mappings[cur_i].initial_range.size(); - new_mapping = new_mapping.Merge(used_mappings[cur_i]); - TRACE("new_mapping mp_range "<< new_mapping.mapped_range.start_pos << " - " << new_mapping.mapped_range.end_pos - << " init_range " << new_mapping.initial_range.start_pos << " - " << new_mapping.initial_range.end_pos ); + for (auto e: storage_) { + FillPos(e, tmp_edge_pos); + } + + vector theoretic_lens; + for (const auto &prefix: vector{"0_", "1_"}) { + for (const auto &chr: gp_.genome.GetChromosomes()) { + string label = prefix + chr.name; + INFO("Spelling label " << label); + auto 
mapping_path = ConstructEdgeOrder(label); + genome_info_.AddInfo(ChromosomeInfo(label, mapping_path)); + utils::push_back_all(theoretic_lens, MappedRegions(mapping_path)); + } } -//used less that 0.9 of aligned length - if (total_mapped * 10 >= used_mapped * 10 + gp_.g.length(e)) { + + TheoreticLenStats(theoretic_lens); +} + +void GenomeConsistenceChecker::TheoreticLenStats(vector theoretic_lens) const { + size_t total_len = std::accumulate(theoretic_lens.begin(), theoretic_lens.end(), + 0, std::plus()); + + std::sort(theoretic_lens.begin(), theoretic_lens.end()); + std::reverse(theoretic_lens.begin(), theoretic_lens.end()); + size_t cur = 0; + size_t i = 0; + while (cur < total_len / 2) { + cur += theoretic_lens[i]; + i++; + } + INFO("Assuming gaps of length > " << storage_.min_length() << " unresolvable.."); + if (theoretic_lens.size() > 0) + INFO("Rough estimates on N50/L50:" << theoretic_lens[i - 1] << " / " << i - 1 << " with len " << total_len); +} + +map +GenomeConsistenceChecker::TotalAlignedLengths(const EdgesPositionHandler &tmp_edge_pos, EdgeId e) const { + map chr2len; + for (const auto &edge_pos: tmp_edge_pos.GetEdgePositions(e)) { + chr2len[edge_pos.contigId] += edge_pos.mr.initial_range.size(); + } + return chr2len; +} + +vector GenomeConsistenceChecker::MappedRegions(const GenomeConsistenceChecker::MappingPathT &mapping_path) const { + vector mapped_regions; + if (mapping_path.size() == 0) + return mapped_regions; + size_t pos = mapping_path.front().second.initial_range.start_pos; + for (size_t i = 0; i < mapping_path.size(); i++) { + auto current_range = mapping_path[i].second; + INFO("Pos: " << i << " init_range " << current_range.initial_range + << " mapped to edge " << gp_.g.str(mapping_path[i].first) + << " range " << current_range.mapped_range); + + size_t curr_start = current_range.initial_range.start_pos; + if (i > 0) { + auto prev_range = mapping_path[i - 1].second; + size_t prev_end = prev_range.initial_range.end_pos; + if (curr_start - prev_end > unresolvable_len_) { + INFO ("Large gap " << current_range.initial_range.start_pos - + prev_range.initial_range.end_pos); + mapped_regions.push_back(prev_end - pos); + pos = curr_start; + } + } + } + mapped_regions.push_back(mapping_path.back().second.initial_range.end_pos - pos); + return mapped_regions; +} + +void GenomeConsistenceChecker::FillPos(EdgeId e, const EdgesPositionHandler &tmp_edge_pos) { + size_t total_mapped; + string chr = ChromosomeByUniqueEdge(e, tmp_edge_pos, total_mapped); + if (chr.empty()) + return; + + auto mapping_info = Merge(FindBestRangeSequence(tmp_edge_pos.GetEdgePositions(e, chr))); + + //FIXME what is the logic here? + //used less that 0.9 of aligned length + VERIFY(total_mapped >= mapping_info.second); + if ((total_mapped - mapping_info.second) * 10 >= gp_.g.length(e)) { INFO ("Edge " << gp_.g.int_id(e) << " length "<< gp_.g.length(e) << "is potentially misassembled! 
mappings: "); - for (auto mp:old_mappings) { - INFO("mp_range "<< mp.mapped_range.start_pos << " - " << mp.mapped_range.end_pos << " init_range " << mp.initial_range.start_pos << " - " << mp.initial_range.end_pos ); + for (auto mp : tmp_edge_pos.GetEdgePositions(e, chr)) { + INFO("mp_range "<< mp.mapped_range.start_pos << " - " << mp.mapped_range.end_pos + << " init_range " << mp.initial_range.start_pos << " - " << mp.initial_range.end_pos ); if (mp.initial_range.start_pos < absolute_max_gap_) { INFO ("Fake(linear order) misassembly on edge "<< e.int_id()); - if (strand == "0") { - circular_edge_ = e; - } + circular_edges_.insert(e); } } + } + gp_.edge_pos.AddEdgePosition(e, chr, mapping_info.first); +} +pair GenomeConsistenceChecker::Merge(const vector &mappings) const { + VERIFY(mappings.size() > 0); + + MappingRange mr = mappings.front(); + size_t total_mapped = mr.initial_range.size(); + for (size_t i = 1; i < mappings.size(); ++i) { + total_mapped += mappings[i].initial_range.size(); + //FIXME why do we need merge? + mr = mr.Merge(mappings[i]); } - gp_.edge_pos.AddEdgePosition(e, new_strand, new_mapping); + return make_pair(mr, total_mapped); } +string GenomeConsistenceChecker::ChromosomeByUniqueEdge(const EdgeId &e, + const EdgesPositionHandler &tmp_edge_pos, + size_t &total) const { + DEBUG("Positioning edge " << gp_.g.str(e)); + map total_al_lens = TotalAlignedLengths(tmp_edge_pos, e); + total = 0; + for (size_t c : utils::value_set(total_al_lens)) + total += c; + + if (total > size_t(math::round((double) gp_.g.length(e) * 1.5))) { + INFO("Edge " << gp_.g.int_id(e) <<" was not unique due to the references, excluding "); + return ""; + } + + string chr = ""; + size_t max_l = 0; + for (const auto &p : total_al_lens) { + if (p.second > max_l) { + max_l = p.second; + chr = p.first; + } + } + + DEBUG("Most likely chromosome " << chr << ". Mapped bp: " << max_l); + //TODO: support non-unique edges; + if (max_l < size_t(math::round((double) gp_.g.length(e) * 0.5))) { + DEBUG("Too small a portion mapped. 
Edge not used"); + return ""; + } + return chr; +}; } diff -Nru spades-3.10.1+dfsg/src/common/modules/genome_consistance_checker.hpp spades-3.11.1+dfsg/src/common/modules/genome_consistance_checker.hpp --- spades-3.10.1+dfsg/src/common/modules/genome_consistance_checker.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/genome_consistance_checker.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -14,13 +14,15 @@ #include "pipeline/graph_pack.hpp" #include "visualization/position_filler.hpp" #include "assembly_graph/paths/bidirectional_path.hpp" +#include "assembly_graph/paths/mapping_path.hpp" #include "assembly_graph/graph_support/scaff_supplementary.hpp" +#include "modules/path_extend/pe_utils.hpp" namespace debruijn_graph { - using path_extend::BidirectionalPath; using path_extend::ScaffoldingUniqueEdgeStorage; +using omnigraph::MappingPath; struct PathScore{ size_t misassemblies; @@ -28,52 +30,201 @@ size_t mapped_length; PathScore(size_t m, size_t w, size_t ml): misassemblies(m), wrong_gap_size(w), mapped_length(ml) {} }; -class GenomeConsistenceChecker { -private: +class ChromosomeInfo { + std::string name_; + std::vector path_; + std::multimap edge_idxs_; + +public: + ChromosomeInfo() {} + + explicit ChromosomeInfo(const string &name, const MappingPath &mapping_path) : + name_(name), + path_(mapping_path.simple_path()) { + for (size_t i = 0; i < path_.size(); ++i) { + edge_idxs_.insert(std::make_pair(path_[i], i)); + } + } + + size_t Multiplicity(EdgeId e) const { + return edge_idxs_.count(e); + } + + size_t IsUnique(EdgeId e) const { + return Multiplicity(e) == 1; + } + + EdgeId EdgeAt(size_t idx) const { + VERIFY(idx < path_.size()); + return path_[idx]; + } + + vector EdgeIdxs(EdgeId e) const { + return utils::get_all(edge_idxs_, e); + } + + size_t UniqueEdgeIdx(EdgeId e) const { + vector idxs = EdgeIdxs(e); + VERIFY(idxs.size() == 1); + return idxs.front(); + } + + const std::string& name() const { + return name_; + } + + size_t size() const { + return path_.size(); + } +}; + +class GenomeInfo { + std::map chr_infos_; +public: + void AddInfo(ChromosomeInfo info) { + VERIFY(!chr_infos_.count(info.name())); + chr_infos_[info.name()] = std::move(info); + } + + const ChromosomeInfo& ChrInfo(const string &name) const { + return utils::get(chr_infos_, name); + } + + vector ChromosomesByEdge(EdgeId e) const { + vector answer; + for (const auto& chr_info: chr_infos_) + if (chr_info.second.Multiplicity(e)) + answer.push_back(chr_info.first); + return answer; + } + + size_t Multiplicity(EdgeId e) const { + size_t ans = 0; + for (const auto& chr_info: chr_infos_) + ans += chr_info.second.Multiplicity(e); + return ans; + } + + bool IsUnique(EdgeId e) const { + return Multiplicity(e) == 1; + } + + bool InUniqueChromosome(EdgeId e) const { + return ChromosomesByEdge(e).size() == 1; + } + + const ChromosomeInfo& UniqueChromosomeInfo(EdgeId e) const { + auto chr_names = ChromosomesByEdge(e); + VERIFY(chr_names.size() == 1); + return ChrInfo(chr_names.front()); + } + + pair UniqueChromosomeIdx(EdgeId e) const { + VERIFY(IsUnique(e)); + auto chrs = ChromosomesByEdge(e); + VERIFY(chrs.size() == 1); + return std::make_pair(chrs.front(), ChrInfo(chrs.front()).UniqueEdgeIdx(e)); + } + + vector Chromosomes() const { + vector answer; + utils::push_back_all(answer, utils::key_set(chr_infos_)); + return answer; + } +}; + +class GenomeConsistenceChecker { + typedef omnigraph::MappingPath MappingPathT; const conj_graph_pack &gp_; - //EdgesPositionHandler &position_handler_; - 
Sequence genome_; + const size_t absolute_max_gap_; + const double relative_max_gap_; + const size_t unresolvable_len_; + const ScaffoldingUniqueEdgeStorage &storage_; - size_t absolute_max_gap_; - double relative_max_gap_; - set excluded_unique_; - EdgeId circular_edge_; -//map from unique edges to their order in genome spelling; - mutable map genome_spelled_; - bool consequent(const Range &mr1, const Range &mr2) const; - bool consequent(const MappingRange &mr1, const MappingRange &mr2) const ; + const vector &long_reads_cov_map_; + static const size_t SIGNIFICANT_LENGTH_LOWER_LIMIT = 10000; + GenomeInfo genome_info_; + //Edges containing zero point for each reference + //TODO: do we need circular/linear chromosomes support? + set circular_edges_; + + io::DataSet reads_; + bool Consequent(const MappingRange &mr1, const MappingRange &mr2) const ; + + void PrintMisassemblyInfo(EdgeId e1, EdgeId e2) const; + + void ClassifyPosition(size_t prev_pos, size_t cur_pos, const BidirectionalPath & path, PathScore &res) const; + + PathScore InternalCountMisassemblies(const BidirectionalPath &path) const; - PathScore CountMisassembliesWithStrand(const BidirectionalPath &path, const string strand) const; //constructs longest sequence of consequetive ranges, stores result in used_mappings - void FindBestRangeSequence(const set& old_mappings, vector& used_mappings) const; -//Refills genomic positions uniting alingments separated with small gaps - void RefillPos(); - void RefillPos(const string &strand); - void RefillPos(const string &strand, const EdgeId &e); -DECL_LOGGER("GenomeConsistenceChecker"); + vector FindBestRangeSequence(const set& mappings) const; + + string ChromosomeByUniqueEdge(const EdgeId &e, + const EdgesPositionHandler &tmp_edge_pos, + size_t &total) const; + + pair Merge(const vector& mappings) const; + void FillPos(EdgeId e, const EdgesPositionHandler &tmp_edge_pos); + void ReportPathEndByPairedLib(const shared_ptr paired_lib, EdgeId current_edge) const; + + void ReportPathEndByLongLib(const path_extend::BidirectionalPathSet &covering_paths, EdgeId current_edge) const; + + void ReportEdge(EdgeId e, double w) const; + + void ReportVariants(std::vector> &sorted_w) const; + + size_t GetSupportingPathCount(EdgeId e1, EdgeId e2, size_t lib_index) const; + + void TheoreticLenStats(vector theoretic_lens) const; + + map TotalAlignedLengths(const EdgesPositionHandler &tmp_edge_pos, EdgeId e) const;; + + MappingPathT ConstructEdgeOrder(const std::string &chr_name) const; + + //returns lengths of mapped regions, divided by "unresolvable_len_" + vector MappedRegions(const MappingPathT &mapping_path) const; + + bool IsCloseToEnd(MappingRange range, const ChromosomeInfo &chr_info) const { + auto last_range = gp_.edge_pos.GetUniqueEdgePosition(chr_info.EdgeAt(chr_info.size() - 1), chr_info.name()); + return range.initial_range.end_pos + SIGNIFICANT_LENGTH_LOWER_LIMIT > last_range.initial_range.end_pos; + } + + bool IsCloseToStart(MappingRange range, const ChromosomeInfo &) const { + return range.initial_range.start_pos <= SIGNIFICANT_LENGTH_LOWER_LIMIT; + } + + DECL_LOGGER("GenomeConsistenceChecker"); public: - GenomeConsistenceChecker(const conj_graph_pack &gp, const ScaffoldingUniqueEdgeStorage &storage, size_t max_gap, double relative_max_gap /*= 0.2*/) : gp_(gp), - genome_(gp.genome.GetSequence()), storage_(storage), - absolute_max_gap_(max_gap), relative_max_gap_(relative_max_gap), excluded_unique_(), circular_edge_() { - if (!gp.edge_pos.IsAttached()) { - gp.edge_pos.Attach(); - } - 
gp.edge_pos.clear(); - visualization::position_filler::FillPos(gp_, gp_.genome.GetSequence(), "0"); - visualization::position_filler::FillPos(gp_, !gp_.genome.GetSequence(), "1"); - RefillPos(); + GenomeConsistenceChecker(const conj_graph_pack &gp, + size_t max_gap, + double relative_max_gap /*= 0.2*/, + size_t unresolvable_len, + const ScaffoldingUniqueEdgeStorage &storage, + const vector &long_reads_cov_map, + const io::DataSet reads) : + gp_(gp), + absolute_max_gap_(max_gap), + relative_max_gap_(relative_max_gap), + unresolvable_len_(unresolvable_len), + storage_(storage), + long_reads_cov_map_(long_reads_cov_map), + reads_(reads) { + //Fixme call outside + Fill(); } + + void Fill(); + PathScore CountMisassemblies(const BidirectionalPath &path) const; - vector > ConstructEdgeOrder() const; -//spells genome in language of long unique edges from storage; - void SpellGenome(); + void CheckPathEnd(const BidirectionalPath &path) const; + map EdgeLabels() const; }; - } diff -Nru spades-3.10.1+dfsg/src/common/modules/graph_construction.hpp spades-3.11.1+dfsg/src/common/modules/graph_construction.hpp --- spades-3.10.1+dfsg/src/common/modules/graph_construction.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/graph_construction.hpp 2017-09-28 09:05:09.000000000 +0000 @@ -18,17 +18,19 @@ #include "io/reads/io_helper.hpp" #include "assembly_graph/core/graph.hpp" -#include "utils/debruijn_graph/debruijn_graph_constructor.hpp" -#include "utils/debruijn_graph/early_simplification.hpp" +#include "assembly_graph/construction/debruijn_graph_constructor.hpp" +#include "assembly_graph/construction/early_simplification.hpp" -#include "utils/perfcounter.hpp" +#include "utils/perf/perfcounter.hpp" #include "io/dataset_support/read_converter.hpp" #include "assembly_graph/handlers/edges_position_handler.hpp" #include "assembly_graph/graph_support/coverage_filling.hpp" -#include "utils/indices/storing_traits.hpp" -#include "utils/indices/edge_index_builders.hpp" -#include "utils/openmp_wrapper.h" +#include "utils/ph_map/storing_traits.hpp" +#include "assembly_graph/index/edge_index_builders.hpp" +#include "utils/parallel/openmp_wrapper.h" +#include "utils/extension_index/kmer_extension_index_builder.hpp" + namespace debruijn_graph { @@ -37,7 +39,7 @@ }; template<> -struct CoverageCollector { +struct CoverageCollector { template static void CollectCoverage(Info edge_info) { edge_info.edge_id->IncCoverage(edge_info.count); @@ -45,7 +47,7 @@ }; template<> -struct CoverageCollector { +struct CoverageCollector { template static void CollectCoverage(Info edge_info) { edge_info.edge_id->IncCoverage(edge_info.count); @@ -110,10 +112,8 @@ } } -#include "utils/indices/kmer_extension_index_builder.hpp" - template -ReadStatistics ConstructGraphUsingExtentionIndex(const config::debruijn_config::construction params, +utils::ReadStatistics ConstructGraphUsingExtentionIndex(const config::debruijn_config::construction params, io::ReadStreamList& streams, Graph& g, Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) { size_t k = g.k(); @@ -124,19 +124,19 @@ TRACE("... in parallel"); // FIXME: output_dir here is damn ugly! 
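ConstructGraphUsingExtentionIndex below builds a k-mer extension index from the read streams, clips early tips, and then "condenses" the graph, i.e. merges non-branching k-mer chains into single edges. A toy illustration of that condensing step (editor-written; the real code works on a compressed extension index via DeBruijnGraphExtentionConstructor, not on std::map):

#include <iostream>
#include <map>
#include <string>

int main() {
    const size_t k = 4;
    std::string read = "ACGTACGGACGT";  // toy "read" containing a repeat
    std::multimap<std::string, std::string> out;  // (k-1)-mer -> successor
    std::map<std::string, int> indeg, outdeg;
    for (size_t i = 0; i + k <= read.size(); ++i) {
        std::string a = read.substr(i, k - 1), b = read.substr(i + 1, k - 1);
        out.emplace(a, b);
        ++outdeg[a]; ++indeg[b];
        indeg.insert({a, 0}); outdeg.insert({b, 0});  // register every node in both maps
    }
    // Interior nodes of a non-branching chain have exactly one way in and one
    // way out; condensed edges start from every other node.
    for (auto n = indeg.begin(); n != indeg.end(); ++n) {
        const std::string& node = n->first;
        if (n->second == 1 && outdeg.at(node) == 1)
            continue;
        auto range = out.equal_range(node);
        for (auto it = range.first; it != range.second; ++it) {
            std::string path = node;
            std::string cur = it->second;
            while (indeg.at(cur) == 1 && outdeg.at(cur) == 1) {  // walk through the chain
                path += cur.back();
                cur = out.find(cur)->second;
            }
            path += cur.back();
            std::cout << "condensed edge: " << path << "\n";
        }
    }
    return 0;
}

On this read it prints ACGT twice (once per traversal of the repeat) plus ACGGACG and CGTACG, i.e. four condensed edges instead of nine k-mer-level edges.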
-    typedef DeBruijnExtensionIndex<> ExtensionIndex;
-    typedef typename ExtensionIndexHelper<ExtensionIndex>::DeBruijnExtensionIndexBuilderT ExtensionIndexBuilder;
+    typedef utils::DeBruijnExtensionIndex<> ExtensionIndex;
+    typedef typename utils::ExtensionIndexHelper<ExtensionIndex>::DeBruijnExtensionIndexBuilderT ExtensionIndexBuilder;
     ExtensionIndex ext((unsigned) k, index.inner_index().workdir());

     //fixme hack
-    ReadStatistics stats = ExtensionIndexBuilder().BuildExtensionIndexFromStream(ext, streams, (contigs_stream == 0) ? 0 : &(*contigs_stream), params.read_buffer_size);
+    utils::ReadStatistics stats = ExtensionIndexBuilder().BuildExtensionIndexFromStream(ext, streams, (contigs_stream == 0) ? 0 : &(*contigs_stream), params.read_buffer_size);

     EarlyClipTips(k, params, stats.max_read_length_, ext);

     INFO("Condensing graph");
     VERIFY(!index.IsAttached());
     DeBruijnGraphExtentionConstructor<Graph> g_c(g, ext);
-    g_c.ConstructGraph(100, 10000, 1.2, params.keep_perfect_loops);//TODO move these parameters to config
+    g_c.ConstructGraph(params.keep_perfect_loops);

     INFO("Building index from graph")
     //todo pass buffer size
@@ -147,7 +147,7 @@
 }

 template
-ReadStatistics ConstructGraph(const config::debruijn_config::construction &params,
+utils::ReadStatistics ConstructGraph(const config::debruijn_config::construction &params,
                       Streams& streams, Graph& g,
                       Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
     if (params.con_mode == config::construction_mode::extention) {
@@ -162,11 +162,11 @@
 }

 template
-ReadStatistics ConstructGraphWithCoverage(const config::debruijn_config::construction &params,
+utils::ReadStatistics ConstructGraphWithCoverage(const config::debruijn_config::construction &params,
                                   Streams& streams, Graph& g,
                                   Index& index, FlankingCoverage& flanking_cov,
                                   io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
-    ReadStatistics rs = ConstructGraph(params, streams, g, index, contigs_stream);
+    utils::ReadStatistics rs = ConstructGraph(params, streams, g, index, contigs_stream);

     typedef typename Index::InnerIndex InnerIndex;
     typedef typename EdgeIndexHelper<InnerIndex>::CoverageAndGraphPositionFillingIndexBuilderT IndexBuilder;
diff -Nru spades-3.10.1+dfsg/src/common/modules/graph_read_correction.hpp spades-3.11.1+dfsg/src/common/modules/graph_read_correction.hpp
--- spades-3.10.1+dfsg/src/common/modules/graph_read_correction.hpp	2017-02-28 14:55:36.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/modules/graph_read_correction.hpp	2017-09-28 09:05:09.000000000 +0000
@@ -29,9 +29,9 @@
     optional<EdgeId> UniqueAlternativeEdge(EdgeId tip, bool outgoing_tip) {
         vector<EdgeId> edges;
         if (outgoing_tip) {
-            push_back_all(edges, gp_.g.OutgoingEdges(gp_.g.EdgeStart(tip)));
+            utils::push_back_all(edges, gp_.g.OutgoingEdges(gp_.g.EdgeStart(tip)));
         } else {
-            push_back_all(edges, gp_.g.IncomingEdges(gp_.g.EdgeEnd(tip)));
+            utils::push_back_all(edges, gp_.g.IncomingEdges(gp_.g.EdgeEnd(tip)));
         }
         restricted::set<EdgeId> edges_set(edges.begin(), edges.end());
         edges_set.erase(tip);
diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/extension_chooser.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/extension_chooser.hpp
--- spades-3.10.1+dfsg/src/common/modules/path_extend/extension_chooser.hpp	2017-02-28 14:55:36.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/modules/path_extend/extension_chooser.hpp	2017-09-28 09:05:10.000000000 +0000
@@ -222,14 +222,13 @@
         return wc_ != nullptr;
     }

-    const WeightCounter& wc() const {
-        VERIFY(wc_);
-        return *wc_;
+    shared_ptr<WeightCounter> wc() const {
+        return wc_;
     }

 protected:
     bool HasIdealInfo(EdgeId e1, EdgeId e2, size_t
dist) const { - return math::gr(wc_->lib().IdealPairedInfo(e1, e2, (int) dist), 0.); + return math::gr(wc_->PairedLibrary().IdealPairedInfo(e1, e2, (int) dist), 0.); } bool HasIdealInfo(const BidirectionalPath& p, EdgeId e, size_t gap) const { @@ -286,6 +285,158 @@ } }; + +class SimpleCoverageExtensionChooser: public ExtensionChooser { + const SSCoverageStorage& coverage_storage_; + //less than 1 + double coverage_delta_; + //larger than 1 + double inverted_coverage_delta_; + + double min_upper_coverage_; + +public: + SimpleCoverageExtensionChooser(const SSCoverageStorage& coverage_storage, const Graph& g, + double coverage_delta, double min_upper_coverage = 0) : + ExtensionChooser(g), coverage_storage_(coverage_storage), + coverage_delta_(coverage_delta), + inverted_coverage_delta_(0), + min_upper_coverage_(min_upper_coverage) { + VERIFY(math::le(coverage_delta_, 1.0)); + VERIFY(!math::eq(coverage_delta_, 0.0)); + inverted_coverage_delta_ = 1.0 / coverage_delta_; + } + + EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const override { + if (edges.size() != 2) + return EdgeContainer(); + + size_t index = path.Size() - 1; + while (index > 0) { + if (g_.IncomingEdgeCount(g_.EdgeStart(path[index])) == 2) + break; + index--; + } + + if (index == 0) { + return EdgeContainer(); + } + DEBUG("Split found at " << index); + EdgeId path_edge_at_split = path[index - 1]; + + return Filter(path, edges, math::ls(coverage_storage_.GetCoverage(path_edge_at_split), coverage_storage_.GetCoverage(path_edge_at_split, true))); + } + +private: + EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges, bool reverse) const { + DEBUG("COVERAGE extension chooser"); + VERIFY(edges.size() == 2); + if (!IsEnoughCoverage(edges.front().e_, edges.back().e_, reverse)) { + DEBUG("Candidates are not covered enough: e1 = " << coverage_storage_.GetCoverage(edges.front().e_, reverse) << + ", e2 = " << coverage_storage_.GetCoverage(edges.back().e_, reverse)); + return EdgeContainer(); + } + + if (IsCoverageSimilar(edges.front().e_, edges.back().e_, reverse)) { + DEBUG("Candidates coverage is too similar: e1 = " << coverage_storage_.GetCoverage(edges.front().e_, reverse) << + ", e2 = " << coverage_storage_.GetCoverage(edges.back().e_, reverse)); + return EdgeContainer(); + } + + size_t index = path.Size() - 1; + while (index > 0) { + if (g_.IncomingEdgeCount(g_.EdgeStart(path[index])) == 2) + break; + index--; + } + + EdgeContainer result; + if (index > 0) { + DEBUG("Split found at " << index); + EdgeId path_edge_at_split = path[index - 1]; + EdgeId other_edge_at_split = GetOtherEdgeAtSplit(g_.EdgeEnd(path_edge_at_split), path_edge_at_split); + VERIFY(other_edge_at_split != EdgeId()); + + if (IsCoverageSimilar(path_edge_at_split, other_edge_at_split, reverse)) { + DEBUG("Path edge and alternative is too similar: path = " << coverage_storage_.GetCoverage(path_edge_at_split, reverse) << + ", other = " << coverage_storage_.GetCoverage(other_edge_at_split, reverse)); + + return EdgeContainer(); + } + if (!IsEnoughCoverage(path_edge_at_split, other_edge_at_split, reverse)) { + DEBUG("Path edge and alternative coverage is too low: path = " << coverage_storage_.GetCoverage(path_edge_at_split, reverse) << + ", other = " << coverage_storage_.GetCoverage(other_edge_at_split, reverse)); + + return EdgeContainer(); + } + + EdgeId candidate1 = edges.front().e_; + EdgeId candidate2 = edges.back().e_; + + if (math::gr(coverage_storage_.GetCoverage(path_edge_at_split, reverse), 
coverage_storage_.GetCoverage(other_edge_at_split, reverse))) { + DEBUG("path coverage is high, edge " << g_.int_id(path_edge_at_split) << ", path cov = " + << coverage_storage_.GetCoverage(path_edge_at_split, reverse) << ", other " << coverage_storage_.GetCoverage(other_edge_at_split, reverse)); + + result.emplace_back(math::gr(coverage_storage_.GetCoverage(candidate1, reverse), coverage_storage_.GetCoverage(candidate2, reverse)) ? candidate1 : candidate2, 0); + } else { + DEBUG("path coverage is low, edge " << g_.int_id(path_edge_at_split) << ", path cov = " + << coverage_storage_.GetCoverage(path_edge_at_split, reverse) << ", other " << coverage_storage_.GetCoverage(other_edge_at_split, reverse)); + + result.emplace_back(math::ls(coverage_storage_.GetCoverage(candidate1, reverse), coverage_storage_.GetCoverage(candidate2, reverse)) ? candidate1 : candidate2, 0); + } + + if (!IsCoverageSimilar(path_edge_at_split, result.front().e_, reverse)) { + DEBUG("Coverage is NOT similar: path = " << coverage_storage_.GetCoverage(path_edge_at_split, reverse) << + ", candidate = " << coverage_storage_.GetCoverage(result.front().e_, reverse)) + result.clear(); + } + else { + DEBUG("Coverage is similar: path = " << coverage_storage_.GetCoverage(path_edge_at_split, reverse) << + ", candidate = " << coverage_storage_.GetCoverage(result.front().e_, reverse)) + DEBUG("Coverage extension chooser helped, adding " << g_.int_id(result.front().e_)); + } + } + + VERIFY(result.size() <= 1); + return result; + } + + bool IsEnoughCoverage(EdgeId e1, EdgeId e2, bool reverse) const { + double cov1 = coverage_storage_.GetCoverage(e1, reverse); + double cov2 = coverage_storage_.GetCoverage(e2, reverse); + return math::ge(max(cov1, cov2), min_upper_coverage_) || math::eq(min(cov1, cov2), 0.0); + } + + bool IsCoverageSimilar(EdgeId e1, EdgeId e2, bool reverse) const { + double cov1 = coverage_storage_.GetCoverage(e1, reverse); + double cov2 = coverage_storage_.GetCoverage(e2, reverse); + + if (math::eq(cov2, 0.0) || math::eq(cov1, 0.0)) { + return false; + } + + double diff = cov1 / cov2; + if (math::ls(diff, 1.0)) + return math::gr(diff, coverage_delta_); + else + return math::ls(diff, inverted_coverage_delta_); + } + + EdgeId GetOtherEdgeAtSplit(VertexId split, EdgeId e) const { + VERIFY(g_.IncomingEdgeCount(split) == 2); + for (auto other : g_.IncomingEdges(split)) { + if (e != other) + return other; + } + return EdgeId(); + } + + DECL_LOGGER("SimpleCoverageExtensionChooser"); + +}; + + + class ExcludingExtensionChooser: public ExtensionChooser { PathAnalyzer analyzer_; double prior_coeff_; @@ -346,7 +497,7 @@ return edges; } std::set to_exclude; - path.Print(); + path.PrintDEBUG(); EdgeContainer result = edges; ExcludeEdges(path, result, to_exclude); result = FindFilteredEdges(path, result, to_exclude); @@ -509,8 +660,6 @@ }; class ScaffoldingExtensionChooser : public ExtensionChooser { - -protected: typedef ExtensionChooser base; double raw_weight_threshold_; double cl_weight_threshold_; @@ -539,7 +688,7 @@ void GetDistances(EdgeId e1, EdgeId e2, std::vector& dist, std::vector& w) const { - wc_->lib().CountDistances(e1, e2, dist, w); + wc_->PairedLibrary().CountDistances(e1, e2, dist, w); } void CountAvrgDists(const BidirectionalPath& path, EdgeId e, std::vector> & histogram) const { @@ -580,7 +729,7 @@ set FindCandidates(const BidirectionalPath& path) const { set jumping_edges; - const auto& lib = wc_->lib(); + const auto& lib = wc_->PairedLibrary(); //todo lib (and FindJumpEdges) knows its var so it can be counted 
there int is_scatter = int(math::round(lib.GetIsVar() * is_scatter_coeff_)); for (int i = (int) path.Size() - 1; i >= 0 && path.LengthAt(i) - g_.length(path.At(i)) <= lib.GetISMax(); --i) { @@ -630,7 +779,6 @@ } class LongReadsUniqueEdgeAnalyzer { -private: DECL_LOGGER("LongReadsUniqueEdgeAnalyzer") public: LongReadsUniqueEdgeAnalyzer(const Graph& g, const GraphCoverageMap& cov_map, @@ -812,51 +960,51 @@ bool uneven_depth_; }; -class SimpleScaffolding { -public: - SimpleScaffolding(const Graph& g) : g_(g) {} - - BidirectionalPath FindMaxCommonPath(const vector& paths, - size_t max_diff_len) const { - BidirectionalPath max_end(g_); - for (auto it1 = paths.begin(); it1 != paths.end(); ++it1) { - BidirectionalPath* p1 = *it1; - for (size_t i = 0; i < p1->Size(); ++i) { - if (p1->Length() - p1->LengthAt(i) > max_diff_len) { - break; - } - bool contain_all = true; - for (size_t i1 = i + 1; i1 <= p1->Size() && contain_all; ++i1) { - BidirectionalPath subpath = p1->SubPath(i, i1); - for (auto it2 = paths.begin(); it2 != paths.end() && contain_all; ++it2) { - BidirectionalPath* p2 = *it2; - vector positions2 = p2->FindAll(subpath.At(0)); - bool contain = false; - for (size_t ipos2 = 0; ipos2 < positions2.size(); ++ipos2) { - size_t pos2 = positions2[ipos2]; - if (p2->Length() - p2->LengthAt(pos2) <= max_diff_len - && EqualEnds(subpath, 0, *p2, pos2, false)) { - contain = true; - break; - } - } - if (!contain) { - contain_all = false; - } - } - if (contain_all && (i1 - i) >= max_end.Size()) { - max_end.Clear(); - max_end.PushBack(subpath); - } - } - } - } - return max_end; - } - -private: - const Graph& g_; -}; +//class SimpleScaffolding { +//public: +// SimpleScaffolding(const Graph& g) : g_(g) {} +// +// BidirectionalPath FindMaxCommonPath(const vector& paths, +// size_t max_diff_len) const { +// BidirectionalPath max_end(g_); +// for (auto it1 = paths.begin(); it1 != paths.end(); ++it1) { +// BidirectionalPath* p1 = *it1; +// for (size_t i = 0; i < p1->Size(); ++i) { +// if (p1->Length() - p1->LengthAt(i) > max_diff_len) { +// break; +// } +// bool contain_all = true; +// for (size_t i1 = i + 1; i1 <= p1->Size() && contain_all; ++i1) { +// BidirectionalPath subpath = p1->SubPath(i, i1); +// for (auto it2 = paths.begin(); it2 != paths.end() && contain_all; ++it2) { +// BidirectionalPath* p2 = *it2; +// vector positions2 = p2->FindAll(subpath.At(0)); +// bool contain = false; +// for (size_t ipos2 = 0; ipos2 < positions2.size(); ++ipos2) { +// size_t pos2 = positions2[ipos2]; +// if (p2->Length() - p2->LengthAt(pos2) <= max_diff_len +// && EqualEnds(subpath, 0, *p2, pos2, false)) { +// contain = true; +// break; +// } +// } +// if (!contain) { +// contain_all = false; +// } +// } +// if (contain_all && (i1 - i) >= max_end.Size()) { +// max_end.Clear(); +// max_end.PushBack(subpath); +// } +// } +// } +// } +// return max_end; +// } +// +//private: +// const Graph& g_; +//}; class LongReadsExtensionChooser : public ExtensionChooser { public: @@ -875,8 +1023,7 @@ cov_map_(read_paths_cov_map), unique_edge_analyzer_(g, cov_map_, filtering_threshold, unique_edge_priority_threshold, - max_repeat_length, uneven_depth), - simple_scaffolding_(g) + max_repeat_length, uneven_depth) { } @@ -889,13 +1036,12 @@ if (edges.empty()) { return edges; }DEBUG("We in Filter of LongReadsExtensionChooser"); - path.Print(); + path.PrintDEBUG(); map weights_cands; for (auto it = edges.begin(); it != edges.end(); ++it) { weights_cands.insert(make_pair(it->e_, 0.0)); } set filtered_cands; - map support_paths_ends; auto 
support_paths = cov_map_.GetCoveringPaths(path.Back()); DEBUG("Found " << support_paths.size() << " covering paths!!!"); for (auto it = support_paths.begin(); it != support_paths.end(); ++it) { @@ -912,10 +1058,6 @@ EdgeId next = (*it)->At(positions[i] + 1); weights_cands[next] += (*it)->GetWeight(); filtered_cands.insert(next); - if (support_paths_ends.count(next) == 0){ - support_paths_ends[next] = BidirectionalPathSet(); - } - support_paths_ends[next].insert(new BidirectionalPath((*it)->SubPath(positions[i] + 1))); } } } @@ -974,7 +1116,6 @@ size_t min_significant_overlap_; const GraphCoverageMap& cov_map_; LongReadsUniqueEdgeAnalyzer unique_edge_analyzer_; - SimpleScaffolding simple_scaffolding_; DECL_LOGGER("LongReadsExtensionChooser"); }; diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/loop_traverser.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/loop_traverser.hpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/loop_traverser.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/loop_traverser.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -22,23 +22,22 @@ namespace path_extend { class LoopTraverser { - const Graph& g_; - GraphCoverageMap& covMap_; - size_t long_edge_limit_; - size_t component_size_limit_; - size_t shortest_path_limit_; + const GraphCoverageMap& cov_map_; + const size_t long_edge_limit_; + const size_t component_size_limit_; + const size_t shortest_path_limit_; static const size_t DIJKSTRA_LIMIT = 3000; -private: - bool AnyTipsInComponent(const GraphComponent& component) const{ - for(auto e : component.edges()) { + static const size_t BASIC_N_CNT = 100; + + bool AnyTipsInComponent(const GraphComponent& component) const { + for (auto e : component.edges()) if (g_.IncomingEdgeCount(g_.EdgeStart(e)) == 0 || g_.OutgoingEdgeCount(g_.EdgeEnd(e)) == 0) return true; - } return false; } - EdgeId FindStart(const set& component_set) const{ + EdgeId FindStart(const set& component_set) const { EdgeId result; for (auto it = component_set.begin(); it != component_set.end(); ++it) { for (auto eit = g_.in_begin(*it); eit != g_.in_end(*it); ++eit) { @@ -72,19 +71,19 @@ bool IsEndInsideComponent(const BidirectionalPath &path, const set &component_set) { - if (component_set.count(g_.EdgeStart(path.Front())) == 0) { + if (component_set.count(g_.EdgeStart(path.Front())) == 0) return false; - } - for (size_t i = 0; i < path.Size(); ++i) { + + for (size_t i = 0; i < path.Size(); ++i) if (component_set.count(g_.EdgeEnd(path.At(i))) == 0) return false; - } + return true; } bool IsEndInsideComponent(const BidirectionalPath &path, EdgeId component_entrance, - const set &component_set, + const set &component_set, bool conjugate = false) { int i = path.FindLast(component_entrance); VERIFY_MSG(i != -1, "Component edge is not found in the path") @@ -92,106 +91,96 @@ if ((size_t) i == path.Size() - 1) { if (conjugate) return component_set.count(g_.conjugate(g_.EdgeEnd(path.Back()))) > 0; - else - return component_set.count(g_.EdgeEnd(path.Back())) > 0; + return component_set.count(g_.EdgeEnd(path.Back())) > 0; } if (conjugate) return IsEndInsideComponent(path.SubPath((size_t) i + 1).Conjugate(), component_set); - else - return IsEndInsideComponent(path.SubPath((size_t) i + 1), component_set); + return IsEndInsideComponent(path.SubPath((size_t) i + 1), component_set); } bool TraverseLoop(EdgeId start, EdgeId end, const set& component_set) { DEBUG("start " << g_.int_id(start) << " end " << g_.int_id(end)); - BidirectionalPathSet 
coveredStartPaths = - covMap_.GetCoveringPaths(start); - BidirectionalPathSet coveredEndPaths = - covMap_.GetCoveringPaths(end); - - for (auto it_path = coveredStartPaths.begin(); - it_path != coveredStartPaths.end(); ++it_path) { - if ((*it_path)->FindAll(end).size() > 0) { + BidirectionalPathSet start_cover_paths = cov_map_.GetCoveringPaths(start); + BidirectionalPathSet end_cover_paths = cov_map_.GetCoveringPaths(end); + + for (auto path_ptr : start_cover_paths) + if (path_ptr->FindAll(end).size() > 0) return false; - } - } - if (coveredStartPaths.size() < 1 or coveredEndPaths.size() < 1) { - DEBUG("TraverseLoop STRANGE SITUATION: start " << coveredStartPaths.size() << " end " << coveredEndPaths.size()); + + if (start_cover_paths.size() < 1 || end_cover_paths.size() < 1) { + DEBUG("TraverseLoop STRANGE SITUATION: start " << start_cover_paths.size() << " end " << end_cover_paths.size()); return false; } - if (coveredStartPaths.size() > 1 or coveredEndPaths.size() > 1) { + if (start_cover_paths.size() > 1 || end_cover_paths.size() > 1) { DEBUG("Ambiguous situation in path joining, quitting"); return false; } - BidirectionalPath* startPath = *coveredStartPaths.begin(); - BidirectionalPath* endPath = *coveredEndPaths.begin(); - if ((*startPath) == endPath->Conjugate()){ + BidirectionalPath& start_path = **start_cover_paths.begin(); + BidirectionalPath& end_path = **end_cover_paths.begin(); + + //TODO isn't it enough to check pointer equality? + if (start_path == end_path.Conjugate()){ return false; } //Checking that paths ends are within component - if (!IsEndInsideComponent(*startPath, start, component_set) || - !IsEndInsideComponent(*endPath->GetConjPath(), g_.conjugate(end), component_set, true)) { + if (!IsEndInsideComponent(start_path, start, component_set) || + !IsEndInsideComponent(*end_path.GetConjPath(), g_.conjugate(end), component_set, true)) { DEBUG("Some path goes outside of the component") return false; } - size_t commonSize = startPath->CommonEndSize(*endPath); - size_t nLen = 0; - DEBUG("Str " << startPath->Size() << ", end" << endPath->Size()); - if (commonSize == 0 && !startPath->Empty() > 0 && !endPath->Empty()) { + size_t common_size = start_path.CommonEndSize(end_path); + DEBUG("Str " << start_path.Size() << ", end" << end_path.Size()); + if (common_size == 0 && !start_path.Empty() && !end_path.Empty()) { DEBUG("Estimating gap size"); - VertexId lastVertex = g_.EdgeEnd(startPath->Back()); - VertexId firstVertex = g_.EdgeStart(endPath->Front()); + VertexId last_vertex = g_.EdgeEnd(start_path.Back()); + VertexId first_vertex = g_.EdgeStart(end_path.Front()); - if (firstVertex == lastVertex) { - nLen = 0; - } else { - DijkstraHelper::BoundedDijkstra dijkstra(DijkstraHelper::CreateBoundedDijkstra(g_, shortest_path_limit_, - DIJKSTRA_LIMIT)); - dijkstra.Run(lastVertex); - vector shortest_path = dijkstra.GetShortestPathTo(g_.EdgeStart(endPath->Front())); + if (first_vertex != last_vertex) { + auto dijkstra = DijkstraHelper::CreateBoundedDijkstra(g_, shortest_path_limit_, DIJKSTRA_LIMIT); + dijkstra.Run(last_vertex); + vector shortest_path = dijkstra.GetShortestPathTo(first_vertex); if (shortest_path.empty()) { DEBUG("Failed to find closing path"); return false; - } else if (!IsEndInsideComponent(BidirectionalPath(g_, shortest_path), component_set)) { + } + if (!IsEndInsideComponent(BidirectionalPath(g_, shortest_path), component_set)) { DEBUG("Closing path is outside the component"); return false; - } else { - nLen = CumulativeLength(g_, shortest_path); } } } - if (commonSize 
< endPath->Size()){ - startPath->PushBack(endPath->At(commonSize), (int) nLen); - } - for (size_t i = commonSize + 1; i < endPath->Size(); ++i) { - startPath->PushBack(endPath->At(i), endPath->GapAt(i), endPath->TrashPreviousAt(i), endPath->TrashCurrentAt(i)); - } + start_path.PushBack(end_path.SubPath(common_size), Gap(int(g_.k() + BASIC_N_CNT))); + DEBUG("travers"); - startPath->Print(); - endPath->Print(); + start_path.PrintDEBUG(); + end_path.PrintDEBUG(); DEBUG("conj"); - endPath->GetConjPath()->Print(); - endPath->Clear(); + end_path.GetConjPath()->PrintDEBUG(); + end_path.Clear(); return true; } bool ContainsLongEdges(const GraphComponent& component) const { - for(auto e : component.edges()) { - if(g_.length(e) > long_edge_limit_) { + for (auto e : component.edges()) + if (g_.length(e) > long_edge_limit_) return true; - } - } return false; } public: - LoopTraverser(const Graph& g, GraphCoverageMap& coverageMap, size_t long_edge_limit, size_t component_size_limit, size_t shortest_path_limit) : - g_(g), covMap_(coverageMap), long_edge_limit_(long_edge_limit), component_size_limit_(component_size_limit), shortest_path_limit_(shortest_path_limit) { + LoopTraverser(const Graph& g, GraphCoverageMap& coverage_map, + size_t long_edge_limit, size_t component_size_limit, + size_t shortest_path_limit) : + g_(g), cov_map_(coverage_map), + long_edge_limit_(long_edge_limit), + component_size_limit_(component_size_limit), + shortest_path_limit_(shortest_path_limit) { } size_t TraverseAllLoops() { diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/overlap_analysis.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/overlap_analysis.hpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/overlap_analysis.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/overlap_analysis.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -1,11 +1,10 @@ #pragma once #include "utils/logger/logger.hpp" -#include "utils/range.hpp" +#include "sequence/range.hpp" #include "ssw/ssw_cpp.h" namespace debruijn_graph { -using omnigraph::Range; struct OverlapInfo { Range r1; @@ -89,6 +88,7 @@ /*mismatch_penalty*/3, /*gap_opening_penalty*/4, /*gap_extending_penalty*/3) { + DEBUG("Considered max overlap " << flank_length); } diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/path_extender.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/path_extender.hpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/path_extender.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/path_extender.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -14,6 +14,7 @@ #pragma once #include "extension_chooser.hpp" +#include "assembly_graph/paths/bidirectional_path_container.hpp" #include "path_filter.hpp" #include "overlap_analysis.hpp" #include "assembly_graph/graph_support/scaff_supplementary.hpp" @@ -21,347 +22,430 @@ namespace path_extend { -class ShortLoopResolver { -public: - ShortLoopResolver(const Graph& g) - : g_(g) { } +inline BidirectionalPath OptimizedConjugate(const BidirectionalPath &path) { + return path.GetConjPath() ? *path.GetConjPath() : path.Conjugate(); +} - virtual ~ShortLoopResolver() { } +//TODO think about symmetry and what if it breaks? 
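The reworked TraverseLoop above only commits to joining the two flanking paths once a bounded Dijkstra run confirms that a closing path exists and stays inside the component; note that the computed path length is no longer used for the gap size, since the join is now always made with Gap(g_.k() + BASIC_N_CNT). A minimal self-contained sketch of that bounded containment check, using a toy adjacency list rather than SPAdes' DijkstraHelper (all names below are illustrative, not the assembler's API):

    #include <cstddef>
    #include <functional>
    #include <queue>
    #include <set>
    #include <utility>
    #include <vector>

    // Toy graph: vertex -> (neighbor, edge length) pairs.
    using AdjList = std::vector<std::vector<std::pair<size_t, size_t>>>;

    // True iff some path from src to dst has total length <= length_limit and
    // never leaves component_set (mirrors the "closing path is outside the
    // component" rejection in TraverseLoop).
    bool HasBoundedClosingPath(const AdjList &g, size_t src, size_t dst,
                               size_t length_limit,
                               const std::set<size_t> &component_set) {
        using Entry = std::pair<size_t, size_t>; // (distance, vertex)
        std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry>> pq;
        std::set<size_t> visited;
        pq.push({0, src});
        while (!pq.empty()) {
            auto [dist, v] = pq.top();
            pq.pop();
            if (dist > length_limit || !component_set.count(v) || visited.count(v))
                continue;
            if (v == dst)
                return true;
            visited.insert(v);
            for (auto [u, len] : g[v])
                pq.push({dist + len, u});
        }
        return false;
    }

Only when this check succeeds (or the two vertices coincide) does the traverser splice end_path onto start_path across the N stretch.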
+class OverlapFindingHelper { + const Graph &g_; + const GraphCoverageMap &coverage_map_; + const size_t min_edge_len_; + const size_t max_diff_; + const bool try_extend_; + + //TODO think of the cases when (gap + length) < 0 + //Changes second argument on success + void TryExtendToEnd(const BidirectionalPath &path, size_t &pos) const { + if (pos < path.Size() && + path.GapAt(pos).gap + path.LengthAt(pos) <= max_diff_) + pos = path.Size(); + } + + //Changes second argument on success + void TryExtendToStart(const BidirectionalPath &path, size_t &pos) const { + if (pos > 0 && path.Length() - path.LengthAt(pos) <= max_diff_) + pos = 0; + } + + pair ComparePaths(const BidirectionalPath &path1, + const BidirectionalPath &path2, + size_t start2) const { + TRACE("Comparing paths " << path1.GetId() << " and " << path2.GetId()); + //TODO change to edit distance? + int shift1 = 0; + //path1 is always matched from the start + const size_t start1 = 0; + size_t end1 = start1; + size_t end2 = start2; - virtual void ResolveShortLoop(BidirectionalPath& path) const = 0; + for (size_t i = start1; i < path1.Size(); ++i) { + if (abs(shift1) > int(max_diff_)) + break; -protected: - DECL_LOGGER("PathExtender") - const Graph& g_; + bool match = false; + size_t j = end2; + int shift2 = 0; + for (; j < path2.Size(); ++j) { + if (end1 == 0) { + //Force first match to start with pos2 + if (j > start2) { + break; + } + } - void UndoCycles(BidirectionalPath& p, EdgeId next_edge) const { - if (p.Size() <= 2) { - return; - } - EdgeId first_edge = p.Back(); - EdgeId second_edge = next_edge; - while (p.Size() > 2) { - if (p.At(p.Size() - 1) == first_edge && p.At(p.Size() - 2) == second_edge) { - p.PopBack(2); + if (abs(shift2) > int(max_diff_)) + break; + if (path1.At(i) == path2.At(j) && + (end1 == 0 || + abs(shift1 + path1.GapAt(i).gap - shift2 - path2.GapAt(j).gap) <= int(max_diff_))) { + match = true; + break; + } else { + shift2 += path2.ShiftLength(j); + } + } + if (match) { + end1 = i+1; + end2 = j+1; + shift1 = 0; } else { - return;; + shift1 += path1.ShiftLength(i); } } - } - void MakeCycleStep(BidirectionalPath& path, EdgeId e) const { - if (path.Size() == 0) { - return; + //Extending the ends of the paths if possible + if (try_extend_ && end1 > 0) { + TryExtendToEnd(path1, end1); + TryExtendToEnd(path2, end2); + //no need to extend path1 left + VERIFY(start1 == 0); + TryExtendToStart(path2, start2); } - EdgeId pathEnd = path.Back(); - path.PushBack(e); - path.PushBack(pathEnd); + + return make_pair(Range(start1, end1), Range(start2, end2)); } -}; -class CovShortLoopResolver : public ShortLoopResolver { public: - CovShortLoopResolver(const conj_graph_pack& gp) - : ShortLoopResolver(gp.g), gp_(gp) { + OverlapFindingHelper(const Graph &g, + const GraphCoverageMap &coverage_map, + size_t min_edge_len, + size_t max_diff) : + g_(g), + coverage_map_(coverage_map), + min_edge_len_(min_edge_len), + max_diff_(max_diff), + //had to enable try_extend, otherwise equality lost symmetry + try_extend_(max_diff_ > 0) { + } + + bool IsSubpath(const BidirectionalPath &path, + const BidirectionalPath &other) const { + for (size_t j = 0; j < other.Size(); ++j) { + auto range_pair = ComparePaths(path, other, j); + if (range_pair.first.end_pos == path.Size()) { + return true; + } + } + return false; + } + //NB! 
Equality is not transitive if max_diff is > 0 + bool IsEqual(const BidirectionalPath &path, + const BidirectionalPath &other) const { + auto ends_pair = CommonPrefix(path, other); + return ends_pair.first == path.Size() + && ends_pair.second == other.Size(); } - void ResolveShortLoop(BidirectionalPath& path) const override { - DEBUG("resolve short loop by coverage"); - path.Print(); - - pair edges; - if (path.Size() >= 1 && GetLoopAndExit(g_, path.Back(), edges)) { - DEBUG("Coverage Short Loop Resolver"); - UndoCycles(path, edges.first); - EdgeId e1 = path.Back(); - EdgeId e2 = edges.first; - EdgeId e_out = edges.second; - auto prob_e_in = g_.IncomingEdges(g_.EdgeEnd(e2)); - EdgeId e_in = *prob_e_in.begin(); - size_t count = 0; - for (auto edge = prob_e_in.begin(); edge != prob_e_in.end(); ++edge) { - if (*edge != e2) - e_in = *edge; - count++; - } - if (count != 2) { - return; - } - double in_cov = gp_.flanking_cov.GetOutCov(e_in); //g_.coverage(e_in); - double out_cov = gp_.flanking_cov.GetInCov(e_out); //g_.coverage(e_out); - double cov = (in_cov + out_cov) / 2.0; - //what are time variables??? - double time1 = math::round(gp_.g.coverage(e1) / cov); - double time2 = math::round(gp_.g.coverage(e2) / cov); - size_t time = (size_t) std::max(0.0, std::min(time1 - 1.0, time2)); - for (size_t i = 0; i < time; ++i) { - MakeCycleStep(path, edges.first); - } - path.PushBack(edges.second); - DEBUG("loop with start " << g_.int_id(e_in) - <<" e1 " << g_.int_id(e1) - << " e2 " << g_.int_id(e2) - << " out " <Subscribe(&coverage_map); + for (size_t i = 0; i < path->Size(); ++i) { + coverage_map.BackEdgeAdded(path->At(i), path, path->GapAt(i)); + } +} + +inline BidirectionalPath* AddPath(PathContainer &paths, + const BidirectionalPath &path, + GraphCoverageMap &coverage_map) { + BidirectionalPath* p = new BidirectionalPath(path); + BidirectionalPath* conj_p = new BidirectionalPath(OptimizedConjugate(path)); + SubscribeCoverageMap(p, coverage_map); + SubscribeCoverageMap(conj_p, coverage_map); + paths.AddPair(p, conj_p); + return p; +} + +class ShortLoopEstimator { +public: + //Path must end with forward cycle edge, contain at least 2 edges and must not contain backward cycle edges + //Returns 0 (for no loops), 1 (for a single loop) or 2 (for many loops) + virtual size_t EstimateSimpleCycleCount(const BidirectionalPath& path, EdgeId backward_edge, EdgeId exit_edge) const = 0; + virtual ~ShortLoopEstimator() {}; +}; + +class ShortLoopResolver { public: - SimpleLoopResolver(Graph& g) : ShortLoopResolver(g) { } + static const size_t BASIC_N_CNT = 100; - void ResolveShortLoop(BidirectionalPath& path) const override { - pair edges; - if (path.Size() >= 1 && GetLoopAndExit(g_, path.Back(), edges)) { + ShortLoopResolver(const Graph& g, shared_ptr loop_estimator) + : g_(g), loop_estimator_(loop_estimator) { } + + void ResolveShortLoop(BidirectionalPath& path) const { + EdgeId back_cycle_edge; + EdgeId loop_exit; + if (path.Size() >=1 && GetLoopAndExit(g_, path.Back(), back_cycle_edge, loop_exit)) { DEBUG("Resolving short loop..."); - EdgeId e = path.Back(); - path.PushBack(edges.first); - path.PushBack(e); - path.PushBack(edges.second); + MakeBestChoice(path, back_cycle_edge, loop_exit); DEBUG("Resolving short loop done"); } } -protected: - DECL_LOGGER("PathExtender") -}; - -class LoopResolver : public ShortLoopResolver { - static const size_t ITER_COUNT = 10; - const WeightCounter& wc_; - private: - bool CheckLoopPlausible(EdgeId froward_loop_edge, EdgeId backward_loop_edge) const { - size_t single_loop_length 
= 2 * g_.length(froward_loop_edge) + g_.length(backward_loop_edge); - return single_loop_length <= wc_.get_libptr()->GetISMax(); - } - -public: - LoopResolver(const Graph& g, const WeightCounter& wc) - : ShortLoopResolver(g), - wc_(wc) { } - //This code works only if loop wasn't fairly resolved - // - //Weird interface; need comments - void MakeBestChoice(BidirectionalPath& path, pair& edges) const { - UndoCycles(path, edges.first); - BidirectionalPath experiment(path); - double max_weight = wc_.CountWeight(experiment, edges.second); - double diff = max_weight - wc_.CountWeight(experiment, edges.first); - size_t maxIter = 0; - for (size_t i = 1; i <= ITER_COUNT; ++i) { - double weight = wc_.CountWeight(experiment, edges.first); - if (weight > 0) { - MakeCycleStep(experiment, edges.first); - weight = wc_.CountWeight(experiment, edges.second); - double weight2 = wc_.CountWeight(experiment, edges.first); - if (weight > max_weight || (weight == max_weight && weight - weight2 > diff) - || (weight == max_weight && weight - weight2 == diff && i == 1)) { - max_weight = weight; - maxIter = i; - diff = weight - weight2; - } - } - } + DECL_LOGGER("PathExtender") + const Graph& g_; + shared_ptr loop_estimator_; - if (!CheckLoopPlausible(path.Back(), edges.first) && maxIter > 0) { - MakeCycleStep(path, edges.first); - path.PushBack(edges.second, int(g_.k() + 100)); + void UndoCycles(BidirectionalPath& p, EdgeId next_edge) const { + if (p.Size() <= 2) { + return; } - else { - for (size_t i = 0; i < maxIter; ++i) { - MakeCycleStep(path, edges.first); + EdgeId first_edge = p.Back(); + EdgeId second_edge = next_edge; + while (p.Size() > 2) { + if (p.At(p.Size() - 1) == first_edge && p.At(p.Size() - 2) == second_edge) { + p.PopBack(2); + } else { + return;; } - path.PushBack(edges.second); } - } - void ResolveShortLoop(BidirectionalPath& path) const override { - pair edges; - if (path.Size() >=1 && GetLoopAndExit(g_, path.Back(), edges)) { - DEBUG("Resolving short loop..."); - MakeBestChoice(path, edges); - DEBUG("Resolving short loop done"); + //edges -- first edge is loop's back edge, second is loop exit edge + void MakeBestChoice(BidirectionalPath& path, EdgeId back_cycle_edge, EdgeId loop_exit) const { + EdgeId forward_cycle_edge = path.Back(); + UndoCycles(path, back_cycle_edge); + + //Expects 0 (for no loops), 1 (for a single loop) or 2 (for many loops, will insert back_cycle_edge and Ns) + size_t loop_count = loop_estimator_->EstimateSimpleCycleCount(path, back_cycle_edge, loop_exit); + if (loop_count > 0) { + path.PushBack(back_cycle_edge); + if (loop_count == 1) { + DEBUG("Single loop"); + path.PushBack(forward_cycle_edge); + path.PushBack(loop_exit); + } + else { + DEBUG("Multiple cycles"); + path.PushBack(loop_exit, Gap(int(g_.k() + BASIC_N_CNT))); + } + } + else { + path.PushBack(loop_exit); } } - }; -class GapJoiner { - +class CoverageLoopEstimator : public ShortLoopEstimator { public: - static const int INVALID_GAP = -1000000; - GapJoiner(const Graph& g) - : g_(g) { } + CoverageLoopEstimator(const Graph& g, const FlankingCoverage& flanking_cov) + : g_(g), flanking_cov_(flanking_cov) { + + } - virtual Gap FixGap( EdgeId source, EdgeId sink, int initial_gap) const = 0; + //Path must end with forward cycle edge, contain at least 2 edges and must not contain backward cycle edges + //Returns 0 (for no loops), 1 (for a single loop) or 2 (for many loops) + size_t EstimateSimpleCycleCount(const BidirectionalPath& path, EdgeId backward_edge, EdgeId exit_edge) const override { + VERIFY(path.Size() > 1); 
+ EdgeId forward_edge = path.Back(); + EdgeId incoming_edge = path[path.Size() - 2]; + double in_cov = flanking_cov_.GetOutCov(incoming_edge); + double out_cov = flanking_cov_.GetInCov(exit_edge); + double avg_coverage = (in_cov + out_cov) / 2.0; + + double fwd_count = math::round(g_.coverage(forward_edge) / avg_coverage); + double back_count = math::round(g_.coverage(backward_edge) / avg_coverage); + size_t result = (size_t) math::round(std::max(0.0, std::min(fwd_count - 1.0, back_count))); + + DEBUG("loop with start " << g_.int_id(incoming_edge) + <<" e1 " << g_.int_id(forward_edge) + << " e2 " << g_.int_id(backward_edge) + << " out " < 1); + VERIFY(wc_ != nullptr); + EdgeId forward_cycle_edge = path.Back(); + + size_t result = 0; + double lopp_edge_weight = wc_->CountWeight(path, backward_edge); + if (math::gr(lopp_edge_weight, weight_threshold_)) { + //Paired information on loop back edges exits => at leat one iteration + //Looking for paired information supporting more than 1 cycle + if (NoSelfPairedInfo(backward_edge, forward_cycle_edge)) { + //More likely to be a single cycle + DEBUG("Single loop"); + result = 1; + } + else { + DEBUG("Multiple cycles"); + //More likely to be a 2 or more cycles + result = 2; } } - DEBUG("Perfect overlap is not found, inintial: " << initial_gap); - return Gap(initial_gap); + return result; } -}; - -class HammingGapJoiner: public GapJoiner { - const double min_gap_score_; - const size_t short_overlap_threshold_; - const size_t basic_overlap_length_; - vector DiffPos(const Sequence& s1, const Sequence& s2) const { - VERIFY(s1.size() == s2.size()); - vector < size_t > answer; - for (size_t i = 0; i < s1.size(); ++i) - if (s1[i] != s2[i]) - answer.push_back(i); - return answer; - } +private: - size_t HammingDistance(const Sequence& s1, const Sequence& s2) const { - VERIFY(s1.size() == s2.size()); - size_t dist = 0; - for (size_t i = 0; i < s1.size(); ++i) { - if (s1[i] != s2[i]) { - dist++; + bool NoSelfPairedInfo(EdgeId back_cycle_edge, EdgeId forward_cycle_edge) const { + size_t is = wc_->PairedLibrary().GetISMax(); + int forward_len = (int) g_.length(forward_cycle_edge); + bool exists_pi = true; + + BidirectionalPath cycle(g_, back_cycle_edge); + while (cycle.Length() < is + g_.length(back_cycle_edge)) { + auto w = wc_->CountWeight(cycle, back_cycle_edge, std::set(), forward_len); + if (math::gr(w, weight_threshold_)) { + //Paired information found within loop + DEBUG("Found PI with back weight " << w << ", weight threshold " << weight_threshold_); + exists_pi = false; + break; } + cycle.PushBack(back_cycle_edge, Gap(forward_len)); } - return dist; - } -// double ScoreGap(const Sequence& s1, const Sequence& s2, int gap, int initial_gap) const { -// VERIFY(s1.size() == s2.size()); -// return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size() -// - (double) abs(gap - initial_gap) / (double) (2 * g_.k()); -// } - - - double ScoreGap(const Sequence& s1, const Sequence& s2) const { - VERIFY(s1.size() == s2.size()); - return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size(); + return exists_pi; } +}; +class CombinedLoopEstimator: public ShortLoopEstimator { public: - - //todo review parameters in usages - HammingGapJoiner(const Graph& g, - double min_gap_score, - size_t short_overlap_threshold, - size_t basic_overlap_length): - GapJoiner(g), - min_gap_score_(min_gap_score), - short_overlap_threshold_(short_overlap_threshold), - basic_overlap_length_(basic_overlap_length) - { - DEBUG("HammingGapJoiner params: \n min_gap_score " << 
min_gap_score_ << - "\n short_overlap_threshold " << short_overlap_threshold_ << - "\n basic_overlap_length " << basic_overlap_length_); - } - - //estimated_gap is in k-mers - Gap FixGap(EdgeId source, EdgeId sink, int estimated_gap) const override { - - size_t corrected_start_overlap = basic_overlap_length_; - if (estimated_gap < 0) { - corrected_start_overlap -= estimated_gap; + CombinedLoopEstimator(const Graph& g, + const FlankingCoverage& flanking_cov, + shared_ptr wc, + double weight_threshold = 0.0) + : pi_estimator_(g, wc, weight_threshold), + cov_estimator_(g, flanking_cov) {} + + //Path must end with forward cycle edge, contain at least 2 edges and must not contain backward cycle edges + //Returns 0 (for no loops), 1 (for a single loop) or 2 (for many loops) + size_t EstimateSimpleCycleCount(const BidirectionalPath& path, EdgeId backward_edge, EdgeId exit_edge) const override { + size_t result = pi_estimator_.EstimateSimpleCycleCount(path, backward_edge, exit_edge); + if (result == 1) { + //Verify using coverage + if (cov_estimator_.EstimateSimpleCycleCount(path, backward_edge, exit_edge) > 1) + result = 2; } + return result; + } - corrected_start_overlap = min(corrected_start_overlap, - g_.k() + min(g_.length(source), g_.length(sink))); +private: + PairedInfoLoopEstimator pi_estimator_; + CoverageLoopEstimator cov_estimator_; +}; - DEBUG("Corrected max overlap " << corrected_start_overlap); - double best_score = min_gap_score_; - int fixed_gap = INVALID_GAP; - double overlap_coeff = 0.3; - size_t min_overlap = 1ul; - if (estimated_gap < 0) { - size_t estimated_overlap = g_.k() - estimated_gap; - min_overlap = max(size_t(math::round(overlap_coeff * double(estimated_overlap))), 1ul); - } - //todo better usage of estimated overlap - DEBUG("Min overlap " << min_overlap); - - for (size_t l = corrected_start_overlap; l >= min_overlap; --l) { - //TRACE("Sink: " << g_.EdgeNucls(sink).Subseq(g_.length(sink) + g_.k() - l).str()); - //TRACE("Source: " << g_.EdgeNucls(source).Subseq(0, l)); - double score = 0; - score = ScoreGap(g_.EdgeNucls(source).Subseq(g_.length(source) + g_.k() - l), - g_.EdgeNucls(sink).Subseq(0, l)); - if (math::gr(score, best_score)) { - TRACE("Curr overlap " << l); - TRACE("Score: " << score); - best_score = score; - fixed_gap = int(g_.k() - l); - } +//TODO move to gap_closing.hpp +typedef omnigraph::GapDescription GapDescription; +class GapAnalyzer { - if (l == short_overlap_threshold_ && fixed_gap != INVALID_GAP) { - //look at "short" overlaps only if long overlaps couldn't be found - DEBUG("Not looking at short overlaps"); - break; - } - } +public: + static const int INVALID_GAP = GapDescription::INVALID_GAP; + GapAnalyzer(const Graph& g) + : g_(g) { } - if (fixed_gap != INVALID_GAP) { - DEBUG("Found candidate gap length with score " << best_score); - DEBUG("Estimated gap: " << estimated_gap << - ", fixed gap: " << fixed_gap << " (overlap " << g_.k() - fixed_gap<< ")"); - } - return Gap(fixed_gap); - } + virtual GapDescription FixGap(const GapDescription &gap) const = 0; -private: - DECL_LOGGER("HammingGapJoiner"); + virtual ~GapAnalyzer() { } +protected: + const Graph& g_; }; -//deprecated! 
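Both the HammingGapJoiner deleted above and the HammingGapAnalyzer that replaces it below score a candidate overlap of length l as 1 - HammingDistance/l between the source suffix and the sink prefix, scanning l from the longest allowed overlap downwards and keeping the best score above the threshold. The scan, distilled onto plain strings (toy code; SPAdes operates on Sequence objects and additionally stops descending into short overlaps once a longer one has been found):

    #include <algorithm>
    #include <cstddef>
    #include <string>

    // Hamming-based overlap score: the fraction of matching positions.
    double ScoreOverlap(const std::string &suffix, const std::string &prefix) {
        size_t mismatches = 0;
        for (size_t i = 0; i < suffix.size(); ++i)
            if (suffix[i] != prefix[i])
                ++mismatches;
        return 1.0 - double(mismatches) / double(suffix.size());
    }

    // Scans overlap lengths downwards; returns the best-scoring length above
    // min_score (ties favor the longer overlap), or 0 if none qualifies.
    size_t BestOverlap(const std::string &left, const std::string &right,
                       size_t min_overlap, size_t max_overlap, double min_score) {
        size_t best_len = 0;
        double best_score = min_score;
        max_overlap = std::min({max_overlap, left.size(), right.size()});
        for (size_t l = max_overlap; l > 0 && l >= min_overlap; --l) {
            double score = ScoreOverlap(left.substr(left.size() - l),
                                        right.substr(0, l));
            if (score > best_score) {
                best_score = score;
                best_len = l;
            }
        }
        return best_len;
    }

For example, BestOverlap("ACGTACGT", "TACGTTTT", 1, 8, 0.9) returns 5: the 5-mer suffix TACGT matches the 5-mer prefix exactly, while every longer overlap scores below the threshold.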
-//fixme reduce code duplication with HammingGapJoiner -class LikelihoodHammingGapJoiner: public GapJoiner { - static const size_t DEFAULT_PADDING_LENGTH = 10; +class HammingGapAnalyzer: public GapAnalyzer { const double min_gap_score_; const size_t short_overlap_threshold_; const size_t basic_overlap_length_; - vector DiffPos(const Sequence& s1, const Sequence& s2) const { - VERIFY(s1.size() == s2.size()); - vector < size_t > answer; - for (size_t i = 0; i < s1.size(); ++i) - if (s1[i] != s2[i]) - answer.push_back(i); - return answer; - } + static constexpr double MIN_OVERLAP_COEFF = 0.05; size_t HammingDistance(const Sequence& s1, const Sequence& s2) const { VERIFY(s1.size() == s2.size()); @@ -374,80 +458,65 @@ return dist; } -// double ScoreGap(const Sequence& s1, const Sequence& s2, int gap, int initial_gap) const { -// VERIFY(s1.size() == s2.size()); -// return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size() -// - (double) abs(gap - initial_gap) / (double) (2 * g_.k()); -// } - - //FIXME use GC content, change match prob and use partition of tip sequence into bad and good part double ScoreGap(const Sequence& s1, const Sequence& s2) const { - static double match_prob = 0.9; - static double log_match_prob = log2(match_prob); - static double log_mismatch_prob = log2(1. - match_prob); VERIFY(s1.size() == s2.size()); - size_t n = s1.size(); - size_t mismatches = HammingDistance(s1, s2); - VERIFY(mismatches <= n); - return 2.*double(n) + double(n - mismatches) * log_match_prob + double(mismatches) * log_mismatch_prob; + return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size(); } public: //todo review parameters in usages - LikelihoodHammingGapJoiner(const Graph& g, + HammingGapAnalyzer(const Graph& g, double min_gap_score, size_t short_overlap_threshold, size_t basic_overlap_length): - GapJoiner(g), + GapAnalyzer(g), min_gap_score_(min_gap_score), short_overlap_threshold_(short_overlap_threshold), basic_overlap_length_(basic_overlap_length) { - DEBUG("LikelihoodHammingGapJoiner params: \n min_gap_score " << min_gap_score_ << + DEBUG("HammingGapAnalyzer params: \n min_gap_score " << min_gap_score_ << "\n short_overlap_threshold " << short_overlap_threshold_ << "\n basic_overlap_length " << basic_overlap_length_); } - //estimated_gap is in k-mers - Gap FixGap(EdgeId source, EdgeId sink, int estimated_gap) const override { + GapDescription FixGap(const GapDescription &gap) const override { + VERIFY_MSG(gap.no_trim(), "Trims not supported yet"); - size_t corrected_start_overlap = basic_overlap_length_; - if (estimated_gap < 0) { - corrected_start_overlap -= estimated_gap; + size_t max_overlap = basic_overlap_length_; + if (gap.estimated_dist() < 0) { + max_overlap -= gap.estimated_dist(); } - corrected_start_overlap = min(corrected_start_overlap, - g_.k() + min(g_.length(source), g_.length(sink))); + max_overlap = min(max_overlap, + g_.k() + min(g_.length(gap.left()), g_.length(gap.right()))); - DEBUG("Corrected max overlap " << corrected_start_overlap); + DEBUG("Corrected max overlap " << max_overlap); double best_score = min_gap_score_; - int fixed_gap = INVALID_GAP; + int fixed_gap = GapDescription::INVALID_GAP; - double overlap_coeff = 0.3; - size_t min_overlap = 1ul; - if (estimated_gap < 0) { - size_t estimated_overlap = g_.k() - estimated_gap; - min_overlap = max(size_t(math::round(overlap_coeff * double(estimated_overlap))), 1ul); + size_t min_overlap = 1; + if (gap.estimated_dist() < 0) { + min_overlap = max(min_overlap, size_t(math::round(MIN_OVERLAP_COEFF * 
double(-gap.estimated_dist())))); } //todo better usage of estimated overlap DEBUG("Min overlap " << min_overlap); - for (size_t l = corrected_start_overlap; l >= min_overlap; --l) { + for (size_t l = max_overlap; l >= min_overlap; --l) { //TRACE("Sink: " << g_.EdgeNucls(sink).Subseq(g_.length(sink) + g_.k() - l).str()); //TRACE("Source: " << g_.EdgeNucls(source).Subseq(0, l)); double score = 0; - score = ScoreGap(g_.EdgeNucls(source).Subseq(g_.length(source) + g_.k() - l), - g_.EdgeNucls(sink).Subseq(0, l)); + score = ScoreGap(g_.EdgeNucls(gap.left()).Subseq(g_.length(gap.left()) + g_.k() - l), + g_.EdgeNucls(gap.right()).Subseq(0, l)); if (math::gr(score, best_score)) { TRACE("Curr overlap " << l); TRACE("Score: " << score); best_score = score; - fixed_gap = int(g_.k() - l); + fixed_gap = -int(l); } - if (l == short_overlap_threshold_ && fixed_gap != INVALID_GAP) { + if (l == short_overlap_threshold_ && fixed_gap != GapDescription::INVALID_GAP) { //look at "short" overlaps only if long overlaps couldn't be found DEBUG("Not looking at short overlaps"); break; @@ -456,165 +525,144 @@ if (fixed_gap != INVALID_GAP) { DEBUG("Found candidate gap length with score " << best_score); - DEBUG("Estimated gap: " << estimated_gap << - ", fixed gap: " << fixed_gap << " (overlap " << g_.k() - fixed_gap<< ")"); + DEBUG("Estimated gap: " << gap.estimated_dist() << + ", fixed gap: " << fixed_gap << " (overlap " << (-fixed_gap) << ")"); + + auto answer = gap; + answer.set_estimated_dist(fixed_gap); + return answer; + } else { + return GapDescription(); } - return Gap(fixed_gap); } private: - DECL_LOGGER("LikelihoodHammingGapJoiner"); + DECL_LOGGER("HammingGapAnalyzer"); }; -//if I was in LA -class LAGapJoiner: public GapJoiner { +//LA stands for Local Alignment +//TODO if current setting will work -- get rid of flank_*_coefficient params +class LAGapAnalyzer: public GapAnalyzer { public: - LAGapJoiner(const Graph& g, size_t min_la_length, + LAGapAnalyzer(const Graph& g, size_t min_la_length, double flank_multiplication_coefficient, - double flank_addition_coefficient) : - GapJoiner(g), min_la_length_(min_la_length), flank_addition_coefficient_( - flank_addition_coefficient), flank_multiplication_coefficient_( - flank_multiplication_coefficient) { - DEBUG("flank_multiplication_coefficient - " << flank_multiplication_coefficient_); - DEBUG("flank_addition_coefficient_ - " << flank_addition_coefficient_ ); - } - - Gap FixGap(EdgeId source, EdgeId sink, int initial_gap) const override { - - DEBUG("Overlap doesn't exceed " << size_t(abs(initial_gap) * ESTIMATED_GAP_MULTIPLIER) + GAP_ADDITIONAL_COEFFICIENT); - SWOverlapAnalyzer overlap_analyzer( - size_t(abs(initial_gap) * ESTIMATED_GAP_MULTIPLIER) + GAP_ADDITIONAL_COEFFICIENT); - - auto overlap_info = overlap_analyzer.AnalyzeOverlap(g_, source, - sink); + int flank_addition_coefficient) : + GapAnalyzer(g), + min_la_length_(min_la_length), + flank_multiplication_coefficient_(flank_multiplication_coefficient), + flank_addition_coefficient_(flank_addition_coefficient) { + DEBUG("flank_multiplication_coefficient - " << flank_multiplication_coefficient_); + DEBUG("flank_addition_coefficient - " << flank_addition_coefficient_ ); + } + + GapDescription FixGap(const GapDescription &gap) const override { + VERIFY_MSG(gap.no_trim(), "Trims not supported yet"); + //estimated_gap is in k-mers + + size_t estimated_overlap = gap.estimated_dist() < 0 ? 
size_t(abs(gap.estimated_dist())) : 0; + SWOverlapAnalyzer overlap_analyzer(size_t(math::round(double(estimated_overlap) * ESTIMATED_GAP_MULTIPLIER)) + + GAP_ADDITIONAL_COEFFICIENT); + auto overlap_info = overlap_analyzer.AnalyzeOverlap(g_, gap.left(), gap.right()); DEBUG(overlap_info); if (overlap_info.size() < min_la_length_) { DEBUG("Low alignment size"); - return Gap(INVALID_GAP); + return GapDescription(); } size_t max_flank_length = max(overlap_info.r2.start_pos, - g_.length(source) + g_.k() - overlap_info.r1.end_pos); + g_.length(gap.left()) + g_.k() - overlap_info.r1.end_pos); DEBUG("Max flank length - " << max_flank_length); - if ((double) max_flank_length * flank_multiplication_coefficient_ - + flank_addition_coefficient_ > (double) overlap_info.size()) { + if (int(math::round(double(max_flank_length) * flank_multiplication_coefficient_)) + + flank_addition_coefficient_ > int(overlap_info.size())) { DEBUG("Too long flanks for such alignment"); - return Gap(INVALID_GAP); + return GapDescription(); } if (math::ls(overlap_info.identity(), IDENTITY_RATIO)) { DEBUG("Low identity score"); - return Gap(INVALID_GAP); - } - - if (g_.k() + 1 > overlap_info.r1.end_pos) { - DEBUG("Save kmers. Don't want to have edges shorter than k"); - return Gap(INVALID_GAP); + return GapDescription(); } - if (overlap_info.r2.start_pos > g_.length(sink)) { - DEBUG("Save kmers. Don't want to have edges shorter than k"); - return Gap(INVALID_GAP); + if (overlap_info.r1.end_pos <= g_.k() || overlap_info.r2.start_pos >= g_.length(gap.right())) { + DEBUG("Less than k+1 nucleotides were left of one of the edges"); + return GapDescription(); } - return Gap( - (int) (-overlap_info.r1.size() - overlap_info.r2.start_pos - + g_.k()), - (uint32_t) (g_.length(source) + g_.k() - - overlap_info.r1.end_pos), - (uint32_t) overlap_info.r2.start_pos); + //TODO Is it ok to have a non-symmetric overlap gap description + return GapDescription(gap.left(), gap.right(), + -int(overlap_info.r2.size()), + g_.length(gap.left()) + g_.k() - overlap_info.r1.end_pos, + overlap_info.r2.start_pos); } private: - DECL_LOGGER("LAGapJoiner"); + DECL_LOGGER("LAGapAnalyzer"); const size_t min_la_length_; - const double flank_addition_coefficient_; const double flank_multiplication_coefficient_; - constexpr static double IDENTITY_RATIO = 0.9; - constexpr static double ESTIMATED_GAP_MULTIPLIER = 2.0; - const size_t GAP_ADDITIONAL_COEFFICIENT = 30; + const int flank_addition_coefficient_; + + static constexpr double IDENTITY_RATIO = 0.9; + static constexpr double ESTIMATED_GAP_MULTIPLIER = 2.0; + static constexpr size_t GAP_ADDITIONAL_COEFFICIENT = 30; }; -class CompositeGapJoiner: public GapJoiner { +class CompositeGapAnalyzer: public GapAnalyzer { public: - CompositeGapJoiner(const Graph& g, - const vector>& joiners, - size_t may_overlap_threhold, - int must_overlap_threhold, - size_t artificail_gap) : - GapJoiner(g), - joiners_(joiners), - may_overlap_threshold_(may_overlap_threhold), - must_overlap_threshold_(must_overlap_threhold), - artificial_gap_(artificail_gap) - { } - - Gap FixGap(EdgeId source, EdgeId sink, int estimated_gap) const override { - DEBUG("Trying to fix estimated gap " << estimated_gap << - " between " << g_.str(source) << " and " << g_.str(sink)); + CompositeGapAnalyzer(const Graph& g, + const vector>& joiners, + size_t may_overlap_threshold, + int must_overlap_threshold, + size_t artificial_gap) : + GapAnalyzer(g), + joiners_(joiners), + may_overlap_threshold_(may_overlap_threshold), + 
must_overlap_threshold_(must_overlap_threshold), + artificial_gap_(artificial_gap) + { } + + GapDescription FixGap(const GapDescription &gap) const override { + VERIFY_MSG(gap.right_trim() == 0 && gap.left_trim() == 0, "Not supported yet"); + DEBUG("Trying to fix estimated gap " << gap.estimated_dist() << + " between " << g_.str(gap.left()) << " and " << g_.str(gap.right())); - if (estimated_gap > int(g_.k() + may_overlap_threshold_)) { + if (gap.estimated_dist() > int(may_overlap_threshold_)) { DEBUG("Edges are supposed to be too far to check overlaps"); - return Gap(estimated_gap); + return gap; } for (auto joiner : joiners_) { - Gap gap = joiner->FixGap(source, sink, estimated_gap); - if (gap.gap_ != GapJoiner::INVALID_GAP) { - return gap; + GapDescription fixed_gap = joiner->FixGap(gap); + if (fixed_gap != GapDescription()) { + return fixed_gap; } } //couldn't find decent overlap - if (estimated_gap < must_overlap_threshold_) { + if (gap.estimated_dist() < must_overlap_threshold_) { DEBUG("Estimated gap looks unreliable"); - return Gap(INVALID_GAP); + return GapDescription(); } else { DEBUG("Overlap was not found"); - return Gap(max(estimated_gap, int(g_.k() + artificial_gap_))); + auto answer = gap; + answer.set_estimated_dist(max(gap.estimated_dist(), int(artificial_gap_))); + return answer; } } private: - vector> joiners_; + vector> joiners_; const size_t may_overlap_threshold_; const int must_overlap_threshold_; const size_t artificial_gap_; - DECL_LOGGER("CompositeGapJoiner"); + DECL_LOGGER("CompositeGapAnalyzer"); }; -//FIXME move to tests -//Just for test. Look at overlap_analysis_tests -inline Gap MimicLAGapJoiner(Sequence& s1, Sequence& s2) { - const int INVALID_GAP = -1000000; - constexpr static double IDENTITY_RATIO = 0.9; - - SWOverlapAnalyzer overlap_analyzer_(10000); - auto overlap_info = overlap_analyzer_.AnalyzeOverlap(s1, s2); - size_t min_la_length_ = 4; - if (overlap_info.size() < min_la_length_) { - DEBUG("Low alignment size"); - return Gap(INVALID_GAP); - } - if (overlap_info.identity() < IDENTITY_RATIO) { - DEBUG("Low identity score"); - return Gap(INVALID_GAP); - } - std::cout << overlap_info; - - return Gap( - (int) (-overlap_info.r1.size() - overlap_info.r2.start_pos), - (uint32_t) (s1.size() - overlap_info.r1.end_pos), - (uint32_t) overlap_info.r2.start_pos); -} - - //Detects a cycle as a minsuffix > IS present earlier in the path. Overlap is allowed. 
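CompositeGapAnalyzer::FixGap above is a chain of responsibility: each registered analyzer in turn gets a chance to resolve the gap, and if all of them decline, the original estimate is either kept (floored at the artificial gap) or discarded as unreliable. The control flow in miniature, with std::function stubs standing in for the GapAnalyzer hierarchy (names and thresholds are illustrative only):

    #include <algorithm>
    #include <functional>
    #include <optional>
    #include <vector>

    // A stub "analyzer": returns a fixed gap on success, std::nullopt otherwise.
    using GapFixer = std::function<std::optional<int>(int /*estimated_gap*/)>;

    std::optional<int> FixGapChain(int estimated_gap,
                                   const std::vector<GapFixer> &fixers,
                                   int may_overlap_threshold,
                                   int must_overlap_threshold,
                                   int artificial_gap) {
        // Edges estimated too far apart: trust the estimate, skip overlap search.
        if (estimated_gap > may_overlap_threshold)
            return estimated_gap;

        for (const auto &fix : fixers)
            if (auto fixed = fix(estimated_gap))
                return fixed; // the first analyzer that succeeds wins

        // No decent overlap found anywhere in the chain.
        if (estimated_gap < must_overlap_threshold)
            return std::nullopt; // the estimate demanded an overlap; give up
        return std::max(estimated_gap, artificial_gap); // padded with Ns downstream
    }

In the real code the same decision is expressed with GapDescription values, where an empty GapDescription() plays the role of std::nullopt.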
class InsertSizeLoopDetector { protected: @@ -722,11 +770,11 @@ VERIFY (last_cycle_pos == start_cycle_pos); DEBUG("find cycle " << last_cycle_pos); DEBUG("path"); - path.Print(); + path.PrintDEBUG(); DEBUG("last subpath"); - path.SubPath(last_cycle_pos).Print(); + path.SubPath(last_cycle_pos).PrintDEBUG(); DEBUG("cycle"); - cycle->Print(); + cycle->PrintDEBUG(); DEBUG("last_cycle_pos " << last_cycle_pos << " path size " << path.Size()); VERIFY(last_cycle_pos <= (int)path.Size()); DEBUG("last cycle pos + cycle " << last_cycle_pos + (int)cycle->Size()); @@ -748,254 +796,58 @@ visited_cycles_coverage_map_.Subscribe(p); visited_cycles_coverage_map_.Subscribe(cp); DEBUG("add cycle"); - p->Print(); - } -}; - -class RepeatDetector { -public: - RepeatDetector(const Graph& g, const GraphCoverageMap& cov_map, size_t max_repeat_len) - : g_(g), - cov_map_(cov_map), - used_paths_(), - repeat_len_(max_repeat_len){ - empty_ = new BidirectionalPath(g_); - } - ~RepeatDetector() { - delete empty_; - } - - BidirectionalPath* RepeatPath(const BidirectionalPath& p) { - if (p.Size() == 0) { - return empty_; - } - EdgeId last_e = p.Back(); - BidirectionalPathSet cov_paths = cov_map_.GetCoveringPaths(last_e); - DEBUG("cov paths for e " << g_.int_id(last_e) << " size " << cov_paths.size()); - size_t max_common_size = 0; - BidirectionalPath* result_p = empty_; - for (BidirectionalPath* cov_p : cov_paths) { - if (used_paths_.find(cov_p) == used_paths_.end() || cov_p == &p || cov_p == p.GetConjPath()) { - continue; - } - size_t common_size = MaxCommonSize(p, *cov_p); - DEBUG("max comon size with path " << cov_p->GetId() << " is " << common_size); - if (common_size == 0) { - continue; - } - VERIFY(common_size <= p.Size()); - if (p.LengthAt(p.Size() - common_size) > repeat_len_) { - DEBUG("repeat from " << (p.Size() - common_size) << " length " << p.LengthAt(p.Size() - common_size) << " repeat length " << repeat_len_); - max_common_size = max(common_size, max_common_size); - result_p = cov_p; - } - } - used_paths_.insert(&p); - DEBUG("max common size " << max_common_size); - return result_p; - } - size_t MaxCommonSize(const BidirectionalPath& p1, const BidirectionalPath& p2) const { - DEBUG("max coomon size ") - EdgeId last_e = p1.Back(); - vector positions2 = p2.FindAll(last_e); - DEBUG("pos size " << positions2.size()) - size_t max_common_size = 0; - for (size_t pos2 : positions2) { - size_t common_size = MaxCommonSize(p1, p1.Size() - 1, p2, pos2); - DEBUG("max common size from " << pos2 << " is " << common_size); - max_common_size = max(max_common_size, common_size); - } - return max_common_size; - } -private: - size_t MaxCommonSize(const BidirectionalPath& p1, size_t pos1, const BidirectionalPath& p2, size_t pos2) const { - int i1 = (int) pos1; - int i2 = (int) pos2; - while (i1 >= 0 && i2 >= 0 && - p1.At((size_t) i1) == p2.At((size_t) i2) && - p1.GapAt((size_t) i1) == p2.GapAt((size_t) i2)) { - i1--; - i2--; - } - if (i1 >=0 && i2>=0 && p1.At((size_t) i1) == p2.At((size_t) i2)) { - i1--; - i2--; - } - - VERIFY(i1 <= (int)pos1); - return std::max(size_t((int) pos1 - i1), (size_t)1); + p->PrintDEBUG(); } - const Graph& g_; - const GraphCoverageMap& cov_map_; - set used_paths_; - size_t repeat_len_; - BidirectionalPath* empty_; -}; - -class ContigsMaker { -public: - ContigsMaker(const Graph & g) - : g_(g) { } - - virtual ~ContigsMaker() { } - - virtual void GrowPath(BidirectionalPath& path, PathContainer* paths_storage = nullptr) = 0; - - virtual void GrowPathSimple(BidirectionalPath& path, PathContainer* 
paths_storage = nullptr) = 0; - - virtual void GrowAll(PathContainer & paths, PathContainer& paths_storage) = 0; - -protected: - const Graph& g_; - DECL_LOGGER("PathExtender") -}; - -struct UsedUniqueStorage { - set used_; - - const ScaffoldingUniqueEdgeStorage& unique_; - - UsedUniqueStorage(const ScaffoldingUniqueEdgeStorage& unique ):used_(), unique_(unique) {} - - void insert(EdgeId e) { - if (unique_.IsUnique(e)) { - used_.insert(e); - used_.insert(e->conjugate()); - } - } - - bool IsUsedAndUnique(EdgeId e) const { - return (unique_.IsUnique(e) && used_.find(e) != used_.end()); - } - - bool UniqueCheckEnabled() const { - return unique_.size() > 0; - } - - }; class PathExtender { public: - PathExtender(const Graph & g): - g_(g){ } + explicit PathExtender(const Graph &g): + g_(g) { } virtual ~PathExtender() { } virtual bool MakeGrowStep(BidirectionalPath& path, PathContainer* paths_storage = nullptr) = 0; - void AddUniqueEdgeStorage(shared_ptr used_storage) { - used_storage_ = used_storage; - } protected: - const Graph& g_; - shared_ptr used_storage_; + const Graph &g_; DECL_LOGGER("PathExtender") }; -class CompositeExtender : public ContigsMaker { +class CompositeExtender { public: - CompositeExtender(const Graph &g, GraphCoverageMap& cov_map, - size_t max_diff_len, - size_t max_repeat_length, - bool detect_repeats_online) - : ContigsMaker(g), - cover_map_(cov_map), - repeat_detector_(g, cover_map_, 2 * max_repeat_length), - extenders_(), - max_diff_len_(max_diff_len), - max_repeat_len_(max_repeat_length), - detect_repeats_online_(detect_repeats_online) { - } - CompositeExtender(const Graph & g, GraphCoverageMap& cov_map, - vector > pes, - const ScaffoldingUniqueEdgeStorage& unique, - size_t max_diff_len, - size_t max_repeat_length, - bool detect_repeats_online) - : ContigsMaker(g), + CompositeExtender(const Graph &g, GraphCoverageMap& cov_map, + UsedUniqueStorage &unique, + const vector> &pes, + size_t max_diff_len) + : g_(g), cover_map_(cov_map), - repeat_detector_(g, cover_map_, 2 * max_repeat_length), - extenders_(), - max_diff_len_(max_diff_len), - max_repeat_len_(max_repeat_length), - detect_repeats_online_(detect_repeats_online) { - extenders_ = pes; - used_storage_ = make_shared(UsedUniqueStorage(unique)); - for (auto ex: extenders_) { - ex->AddUniqueEdgeStorage(used_storage_); - } + used_storage_(unique), + extenders_(pes), + max_diff_len_(max_diff_len) { } - void AddExtender(shared_ptr pe) { - extenders_.push_back(pe); - pe->AddUniqueEdgeStorage(used_storage_); - } - - void GrowAll(PathContainer& paths, PathContainer& result) override { + void GrowAll(PathContainer& paths, PathContainer& result) { result.clear(); GrowAllPaths(paths, result); - LengthPathFilter filter(g_, 0); - filter.filter(result); + result.FilterEmptyPaths(); } - void GrowPath(BidirectionalPath& path, PathContainer* paths_storage) override { + void GrowPath(BidirectionalPath& path, PathContainer* paths_storage) { while (MakeGrowStep(path, paths_storage)) { } } - void GrowPathSimple(BidirectionalPath& path, PathContainer* paths_storage) override { - while (MakeGrowStep(path, paths_storage, false)) { } - } +private: + const Graph &g_; + GraphCoverageMap &cover_map_; + UsedUniqueStorage &used_storage_; + vector> extenders_; + size_t max_diff_len_; - bool MakeGrowStep(BidirectionalPath& path, PathContainer* paths_storage, - bool detect_repeats_online_local = true) { + bool MakeGrowStep(BidirectionalPath& path, PathContainer* paths_storage) { DEBUG("make grow step composite extender"); - if 
(detect_repeats_online_ && detect_repeats_online_local) { - BidirectionalPath *repeat_path = repeat_detector_.RepeatPath(path); - size_t repeat_size = repeat_detector_.MaxCommonSize(path, *repeat_path); - - if (repeat_size > 0) { - DEBUG("repeat with length " << repeat_size); - path.Print(); - repeat_path->Print(); - BidirectionalPath repeat = path.SubPath(path.Size() - repeat_size); - int begin_repeat = repeat_path->FindLast(repeat); - VERIFY(begin_repeat > -1); - size_t end_repeat = (size_t) begin_repeat + repeat_size; - DEBUG("not consistent subpaths "); - BidirectionalPath begin1 = path.SubPath(0, path.Size() - repeat_size); - begin1.Print(); - BidirectionalPath begin2 = repeat_path->SubPath(0, begin_repeat); - begin2.Print(); - int gpa_in_repeat_path = repeat_path->GapAt(begin_repeat); - BidirectionalPath end2 = repeat_path->SubPath(end_repeat); - BidirectionalPath begin1_conj = path.SubPath(0, path.Size() - repeat_size + 1).Conjugate(); - BidirectionalPath begin2_conj = repeat_path->SubPath(0, begin_repeat + 1).Conjugate(); - pair last = ComparePaths(0, 0, begin1_conj, begin2_conj, max_diff_len_); - DEBUG("last " << last.first << " last2 " << last.second); - path.Clear(); - repeat_path->Clear(); - int gap_len = repeat.GapAt(0); - - if (begin2.Size() == 0 || last.second != 0) { //TODO: incorrect: common edges, but then different ends - path.PushBack(begin1); - repeat_path->PushBack(begin2); - } else { - gap_len = gpa_in_repeat_path; - path.PushBack(begin2); - repeat_path->PushBack(begin1); - } - - path.PushBack(repeat.At(0), gap_len); - path.PushBack(repeat.SubPath(1)); - path.PushBack(end2); - DEBUG("new path"); - path.Print(); - return false; - } - } size_t current = 0; while (current < extenders_.size()) { @@ -1008,22 +860,6 @@ return false; } -private: - GraphCoverageMap& cover_map_; - RepeatDetector repeat_detector_; - vector > extenders_; - size_t max_diff_len_; - size_t max_repeat_len_; - bool detect_repeats_online_; - shared_ptr used_storage_; - - void SubscribeCoverageMap(BidirectionalPath * path) { - path->Subscribe(&cover_map_); - for (size_t i = 0; i < path->Size(); ++i) { - cover_map_.BackEdgeAdded(path->At(i), path, path->GapInfoAt(i)); - } - } - void GrowAllPaths(PathContainer& paths, PathContainer& result) { for (size_t i = 0; i < paths.size(); ++i) { VERBOSE_POWER_T2(i, 100, "Processed " << i << " paths from " << paths.size() << " (" << i * 100 / paths.size() << "%)"); @@ -1031,16 +867,17 @@ INFO("Processed " << i << " paths from " << paths.size() << " (" << i * 100 / paths.size() << "%)"); } //In 2015 modes do not use a seed already used in paths. - if (used_storage_->UniqueCheckEnabled()) { + //FIXME what is the logic here? 
+ if (used_storage_.UniqueCheckEnabled()) { bool was_used = false; for (size_t ind =0; ind < paths.Get(i)->Size(); ind++) { EdgeId eid = paths.Get(i)->At(ind); - if (used_storage_->IsUsedAndUnique(eid)) { + if (used_storage_.IsUsedAndUnique(eid)) { DEBUG("Used edge " << g_.int_id(eid)); was_used = true; break; } else { - used_storage_->insert(eid); + used_storage_.insert(eid); } } if (was_used) { @@ -1050,11 +887,12 @@ } if (!cover_map_.IsCovered(*paths.Get(i))) { + AddPath(result, *paths.Get(i), cover_map_); BidirectionalPath * path = new BidirectionalPath(*paths.Get(i)); BidirectionalPath * conjugatePath = new BidirectionalPath(*paths.GetConjugate(i)); + SubscribeCoverageMap(path, cover_map_); + SubscribeCoverageMap(conjugatePath, cover_map_); result.AddPair(path, conjugatePath); - SubscribeCoverageMap(path); - SubscribeCoverageMap(conjugatePath); size_t count_trying = 0; size_t current_path_len = 0; do { @@ -1063,9 +901,8 @@ GrowPath(*path, &result); GrowPath(*conjugatePath, &result); } while (count_trying < 10 && (path->Length() != current_path_len)); - path->CheckConjugateEnd(max_repeat_len_); DEBUG("result path " << path->GetId()); - path->Print(); + path->PrintDEBUG(); } } } @@ -1074,36 +911,23 @@ //All Path-Extenders inherit this one class LoopDetectingPathExtender : public PathExtender { - -protected: - bool investigate_short_loops_; - bool use_short_loop_cov_resolver_; - CovShortLoopResolver cov_loop_resolver_; + const bool use_short_loop_cov_resolver_; + ShortLoopResolver cov_loop_resolver_; InsertSizeLoopDetector is_detector_; - const GraphCoverageMap& cov_map_; - -public: - LoopDetectingPathExtender(const conj_graph_pack &gp, - const GraphCoverageMap &cov_map, - bool investigate_short_loops, - bool use_short_loop_cov_resolver, - size_t is) - : PathExtender(gp.g), - investigate_short_loops_(investigate_short_loops), - use_short_loop_cov_resolver_(use_short_loop_cov_resolver), - cov_loop_resolver_(gp), - is_detector_(gp.g, is), - cov_map_(cov_map) { + UsedUniqueStorage &used_storage_; - } - - bool isInvestigateShortLoops() const { - return investigate_short_loops_; - } +protected: + const bool investigate_short_loops_; + const GraphCoverageMap &cov_map_; - void setInvestigateShortLoops(bool investigateShortLoops) { - this->investigate_short_loops_ = investigateShortLoops; + bool TryUseEdge(BidirectionalPath &path, EdgeId e, const Gap &gap) { + bool success = used_storage_.TryUseEdge(path, e, gap); + if (success) { + DEBUG("Adding edge. 
PathId: " << path.GetId() << " path length: " << path.Length() - 1 << ", fixed gap : " + << gap.gap << ", trash length: " << gap.trash_previous << "-" << gap.trash_current); + } + return success; } bool DetectCycle(BidirectionalPath& path) { @@ -1120,13 +944,29 @@ return false; } - bool DetectCycleScaffolding(BidirectionalPath& path) { - return is_detector_.CheckCycledNonIS(path); + bool DetectCycleScaffolding(BidirectionalPath& path, EdgeId e) { + BidirectionalPath temp_path(path); + temp_path.PushBack(e); + return is_detector_.CheckCycledNonIS(temp_path); } virtual bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* paths_storage = nullptr) = 0; - virtual bool ResolveShortLoopByCov(BidirectionalPath& path) = 0; + virtual bool ResolveShortLoopByCov(BidirectionalPath& path) { + LoopDetector loop_detector(&path, cov_map_); + size_t init_len = path.Length(); + bool result = false; + while (path.Size() >= 1 && loop_detector.EdgeInShortLoop(path.Back())) { + cov_loop_resolver_.ResolveShortLoop(path); + if (init_len == path.Length()) { + return result; + } else { + result = true; + } + init_len = path.Length(); + } + return true; + } virtual bool ResolveShortLoopByPI(BidirectionalPath& path) = 0; @@ -1134,11 +974,30 @@ return false; } +public: + LoopDetectingPathExtender(const conj_graph_pack &gp, + const GraphCoverageMap &cov_map, + UsedUniqueStorage &unique, + bool investigate_short_loops, + bool use_short_loop_cov_resolver, + size_t is) + : PathExtender(gp.g), + use_short_loop_cov_resolver_(use_short_loop_cov_resolver), + cov_loop_resolver_(gp.g, make_shared(gp.g, gp.flanking_cov)), + is_detector_(gp.g, is), + used_storage_(unique), + investigate_short_loops_(investigate_short_loops), + cov_map_(cov_map) { + + } + + bool MakeGrowStep(BidirectionalPath& path, PathContainer* paths_storage) override { if (is_detector_.InExistingLoop(path)) { DEBUG("in existing loop"); return false; } + DEBUG("un ch enabled " << used_storage_.UniqueCheckEnabled()); bool result; LoopDetector loop_detector(&path, cov_map_); if (DetectCycle(path)) { @@ -1187,15 +1046,16 @@ class SimpleExtender: public LoopDetectingPathExtender { protected: - shared_ptr extensionChooser_; + ShortLoopResolver loop_resolver_; + double weight_threshold_; void FindFollowingEdges(BidirectionalPath& path, ExtensionChooser::EdgeContainer * result) { DEBUG("Looking for the following edges") result->clear(); vector edges; DEBUG("Pushing back") - push_back_all(edges, g_.OutgoingEdges(g_.EdgeEnd(path.Back()))); + utils::push_back_all(edges, g_.OutgoingEdges(g_.EdgeEnd(path.Back()))); result->reserve(edges.size()); for (auto iter = edges.begin(); iter != edges.end(); ++iter) { DEBUG("Adding edge w distance " << g_.int_id(*iter)); @@ -1208,14 +1068,17 @@ public: SimpleExtender(const conj_graph_pack &gp, - const GraphCoverageMap &cov_map, - shared_ptr ec, - size_t is, - bool investigate_short_loops, - bool use_short_loop_cov_resolver) : - LoopDetectingPathExtender(gp, cov_map, investigate_short_loops, use_short_loop_cov_resolver, is), - extensionChooser_(ec) { - } + const GraphCoverageMap &cov_map, + UsedUniqueStorage &unique, + shared_ptr ec, + size_t is, + bool investigate_short_loops, + bool use_short_loop_cov_resolver, + double weight_threshold = 0.0): + LoopDetectingPathExtender(gp, cov_map, unique, investigate_short_loops, use_short_loop_cov_resolver, is), + extensionChooser_(ec), + loop_resolver_(gp.g, make_shared(gp.g, gp.flanking_cov, extensionChooser_->wc(), weight_threshold)), + weight_threshold_(weight_threshold) {} 
std::shared_ptr GetExtensionChooser() const { return extensionChooser_; @@ -1225,30 +1088,13 @@ return extensionChooser_->WeightCounterBased(); } - bool ResolveShortLoopByCov(BidirectionalPath& path) override { - LoopDetector loop_detector(&path, cov_map_); - size_t init_len = path.Length(); - bool result = false; - while (path.Size() >= 1 && loop_detector.EdgeInShortLoop(path.Back())) { - cov_loop_resolver_.ResolveShortLoop(path); - if (init_len == path.Length()) { - return result; - } else { - result = true; - } - init_len = path.Length(); - } - return true; - } - bool ResolveShortLoopByPI(BidirectionalPath& path) override { if (extensionChooser_->WeightCounterBased()) { - LoopResolver loop_resolver(g_, extensionChooser_->wc()); LoopDetector loop_detector(&path, cov_map_); size_t init_len = path.Length(); bool result = false; while (path.Size() >= 1 && loop_detector.EdgeInShortLoop(path.Back())) { - loop_resolver.ResolveShortLoop(path); + loop_resolver_.ResolveShortLoop(path); if (init_len == path.Length()) { return result; } else { @@ -1263,7 +1109,7 @@ bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* paths_storage) override { ExtensionChooser::EdgeContainer candidates; - return FilterCandidates(path, candidates) and AddCandidates(path, paths_storage, candidates); + return FilterCandidates(path, candidates) && AddCandidates(path, paths_storage, candidates); } protected: @@ -1272,7 +1118,7 @@ return false; } DEBUG("Simple grow step"); - path.Print(); + path.PrintDEBUG(); FindFollowingEdges(path, &candidates); DEBUG("found candidates"); DEBUG(candidates.size()) @@ -1305,44 +1151,33 @@ EdgeId eid = candidates.back().e_; //In 2015 modes when trying to use already used unique edge, it is not added and path growing stops. //That allows us to avoid overlap removal hacks used earlier. 
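// In other words: the first time a unique edge is taken it is recorded in the
// shared UsedUniqueStorage; any later attempt to take it fails, the extender
// stops growing that path, and the edge ends up in exactly one resulting path
// (this is what TryUseEdge() above implements).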
- if (used_storage_->UniqueCheckEnabled()) { - if (used_storage_->IsUsedAndUnique(eid)) { - return false; - } else { - used_storage_->insert(eid); - } - } - path.PushBack(eid, candidates.back().d_); - DEBUG("push done"); - return true; + Gap gap(candidates.back().d_); + return TryUseEdge(path, eid, gap); } -protected: DECL_LOGGER("SimpleExtender") - }; class MultiExtender: public SimpleExtender { - -protected: size_t max_candidates_; public: - MultiExtender(const conj_graph_pack &gp, - const GraphCoverageMap &cov_map, - shared_ptr ec, - size_t is, - bool investigate_short_loops, - bool use_short_loop_cov_resolver, - size_t max_candidates = 0) : - SimpleExtender(gp, cov_map, ec, is, investigate_short_loops, use_short_loop_cov_resolver), + const GraphCoverageMap &cov_map, + UsedUniqueStorage &unique, + shared_ptr ec, + size_t is, + bool investigate_short_loops, + bool use_short_loop_cov_resolver, + double weight_threshold, + size_t max_candidates = 0) : + SimpleExtender(gp, cov_map, unique, ec, is, investigate_short_loops, use_short_loop_cov_resolver, weight_threshold), max_candidates_(max_candidates) { } protected: - virtual bool AddCandidates(BidirectionalPath& path, PathContainer* paths_storage, ExtensionChooser::EdgeContainer& candidates) override { + bool AddCandidates(BidirectionalPath& path, PathContainer* paths_storage, ExtensionChooser::EdgeContainer& candidates) override { if (candidates.size() == 0) return false; @@ -1358,7 +1193,7 @@ if (candidates.size() == 1) { DEBUG("push"); EdgeId eid = candidates.back().e_; - path.PushBack(eid, candidates.back().d_); + path.PushBack(eid, Gap(candidates.back().d_)); DEBUG("push done"); return true; } @@ -1375,13 +1210,13 @@ for (size_t i = 1; i < candidates.size(); ++i) { DEBUG("push other candidates " << i); BidirectionalPath *p = new BidirectionalPath(path); - p->PushBack(candidates[i].e_, candidates[i].d_); + p->PushBack(candidates[i].e_, Gap(candidates[i].d_)); BidirectionalPath *cp = new BidirectionalPath(p->Conjugate()); paths_storage->AddPair(p, cp); } DEBUG("push"); - path.PushBack(candidates.front().e_, candidates.front().d_); + path.PushBack(candidates.front().e_, Gap(candidates.front().d_)); DEBUG("push done"); res = true; @@ -1400,10 +1235,9 @@ class ScaffoldingPathExtender: public LoopDetectingPathExtender { -private: std::shared_ptr extension_chooser_; ExtensionChooser::EdgeContainer sources_; - std::shared_ptr gap_joiner_; + std::shared_ptr gap_analyzer_; bool avoid_rc_connections_; //When check_sink_ set to false we can scaffold not only tips @@ -1419,102 +1253,110 @@ } } - bool IsSink(EdgeId e) const { + bool IsSink(EdgeId e) const { return g_.OutgoingEdgeCount(g_.EdgeEnd(e)) == 0; } + Gap ConvertGapDescription(const GapDescription &gap) const { + if (gap == GapDescription()) { + return Gap::INVALID(); + } + return Gap(gap.estimated_dist() + int(g_.k()) + - int(gap.left_trim()) - int(gap.right_trim()), + uint32_t(gap.left_trim()), uint32_t(gap.right_trim())); + } + protected: - virtual bool GapSatisfies(int /*gap*/) const { + virtual bool CheckGap(const Gap &/*gap*/) const { return true; } - bool MakeSimpleGrowStepForChooser(BidirectionalPath& path, std::shared_ptr ec, bool must_overlap = false) { + bool ResolveShortLoopByCov(BidirectionalPath&) override { + return false; + } + + bool ResolveShortLoopByPI(BidirectionalPath&) override { + return false; + } + + //TODO fix awful design with virtual CheckGap and must_overlap flag! 
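// Concretely: CheckGap() defaults to "always accept" in this class and is only
// consulted when must_overlap is set, while RNAScaffoldingPathExtender overrides
// it to demand gap.overlap_after_trim(k) >= min_overlap_, so the flag and the
// virtual are two halves of one decision.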
+ bool MakeSimpleGrowStepForChooser(BidirectionalPath& path, std::shared_ptr ec, + bool must_overlap = false) { if (path.Size() < 1 || (check_sink_ && !IsSink(path.Back()))) { return false; } - DEBUG("scaffolding:"); + DEBUG("Simple grow step, growing path"); - path.Print(); + path.PrintDEBUG(); ExtensionChooser::EdgeContainer candidates = ec->Filter(path, sources_); DEBUG("scaffolding candidates " << candidates.size() << " from sources " << sources_.size()); - //DEBUG("Extension chooser threshold = " << ec->GetThreshold()) DEBUG("Candidate size = " << candidates.size()) - if (candidates.size() == 1) { - if (candidates[0].e_ == path.Back() - || (avoid_rc_connections_ && candidates[0].e_ == g_.conjugate(path.Back()))) { - return false; - } - BidirectionalPath temp_path(path); - temp_path.PushBack(candidates[0].e_); - if (this->DetectCycleScaffolding(temp_path)) { + if (candidates.size() != 1) { + DEBUG("scaffolding end"); + return false; + } + + EdgeId e = candidates.back().e_; + if (e == path.Back() + || (avoid_rc_connections_ && e == g_.conjugate(path.Back()))) { + return false; + } + + if (this->DetectCycleScaffolding(path, e)) { + return false; + } + + Gap gap; + //TODO is it ok that we either force joining or ignore its possibility + if (check_sink_) { + gap = ConvertGapDescription(gap_analyzer_->FixGap(GapDescription(path.Back(), e, + candidates.back().d_ - + int(g_.k())))); + + if (gap == Gap::INVALID()) { + DEBUG("Looks like wrong scaffolding. PathId: " + << path.GetId() << " path length: " << path.Length() + << ", estimated gap length: " << candidates.back().d_); return false; } - EdgeId eid = candidates.back().e_; - if (check_sink_) { - Gap gap = gap_joiner_->FixGap(path.Back(), candidates.back().e_, candidates.back().d_); - DEBUG("Gap after fixing " << gap.gap_ << " (was " << candidates.back().d_ << ")"); - if (gap.gap_ != GapJoiner::INVALID_GAP) { - DEBUG("Scaffolding. PathId: " << path.GetId() << " path length: " << path.Length() << - ", fixed gap length: " << gap.gap_ << ", trash length: " << gap.trash_previous_ << "-" << - gap.trash_current_); - - if (used_storage_->UniqueCheckEnabled()) { - if (used_storage_->IsUsedAndUnique(eid)) { - return false; - } else { - used_storage_->insert(eid); - } - } + DEBUG("Gap after fixing " << gap.gap << " (was " << candidates.back().d_ << ")"); - if (must_overlap && GapSatisfies(gap.gap_)) { - DEBUG("Overlap is not large enogh") - return false; - } - DEBUG("Overlap is good, success") - path.PushBack(eid, gap); - return true; - } - else { - DEBUG("Looks like wrong scaffolding. PathId: " << path.GetId() << " path length: " << - path.Length() << ", fixed gap length: " << candidates.back().d_ << ", fixed = " << gap.gap_); - return false; - } - } - else { - DEBUG("Gap joiners off"); - DEBUG("Scaffolding. 
PathId: " << path.GetId() << " path length: " << path.Length() - << ", fixed gap length: " << candidates.back().d_); - - if (used_storage_->UniqueCheckEnabled()) { - if (used_storage_->IsUsedAndUnique(eid)) { - return false; - } else { - used_storage_->insert(eid); - } - } - path.PushBack(candidates.back().e_, candidates.back().d_); - return true; + if (must_overlap && !CheckGap(gap)) { + DEBUG("Overlap is not large enough") + return false; } + } else { + DEBUG("Gap joiners off"); + VERIFY(candidates.back().d_ > int(g_.k())); + gap = Gap(candidates.back().d_); } - DEBUG("scaffolding end"); - return false; + + return TryUseEdge(path, e, NormalizeGap(gap)); + } + + Gap NormalizeGap(Gap gap) const { + VERIFY(gap != Gap::INVALID()); + if (gap.overlap_after_trim(g_.k()) > 0) + gap.trash_current += gap.overlap_after_trim(g_.k()); + return gap; } public: ScaffoldingPathExtender(const conj_graph_pack &gp, const GraphCoverageMap &cov_map, + UsedUniqueStorage &unique, std::shared_ptr extension_chooser, - std::shared_ptr gap_joiner, + std::shared_ptr gap_analyzer, size_t is, bool investigate_short_loops, bool avoid_rc_connections, bool check_sink = true): - LoopDetectingPathExtender(gp, cov_map, investigate_short_loops, false, is), + LoopDetectingPathExtender(gp, cov_map, unique, investigate_short_loops, false, is), extension_chooser_(extension_chooser), - gap_joiner_(gap_joiner), + gap_analyzer_(gap_analyzer), avoid_rc_connections_(avoid_rc_connections), check_sink_(check_sink) { @@ -1525,14 +1367,6 @@ return MakeSimpleGrowStepForChooser(path, extension_chooser_); } - bool ResolveShortLoopByCov(BidirectionalPath&) override { - return false; - } - - bool ResolveShortLoopByPI(BidirectionalPath&) override { - return false; - } - std::shared_ptr GetExtensionChooser() const { return extension_chooser_; } @@ -1548,21 +1382,22 @@ int min_overlap_; protected: - bool GapSatisfies(int gap) const override { - return gap > (int) g_.k() - min_overlap_; + bool CheckGap(const Gap &gap) const override { + return gap.overlap_after_trim(g_.k()) >= min_overlap_; } public: RNAScaffoldingPathExtender(const conj_graph_pack &gp, const GraphCoverageMap &cov_map, + UsedUniqueStorage &unique, std::shared_ptr extension_chooser, std::shared_ptr strict_extension_chooser, - std::shared_ptr gap_joiner, + std::shared_ptr gap_joiner, size_t is, bool investigate_short_loops, int min_overlap = 0): - ScaffoldingPathExtender(gp, cov_map, extension_chooser, gap_joiner, is, investigate_short_loops, true), + ScaffoldingPathExtender(gp, cov_map, unique, extension_chooser, gap_joiner, is, investigate_short_loops, true), strict_extension_chooser_(strict_extension_chooser), min_overlap_(min_overlap) {} diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/path_filter.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/path_filter.hpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/path_filter.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/path_filter.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -19,163 +19,58 @@ namespace path_extend { -class CopyOnWritePathFilter { - -protected: - const Graph& g; - -public: - CopyOnWritePathFilter(const Graph& g_): g(g_) { - } - - virtual bool predicate(BidirectionalPath& path) = 0; - - PathContainer filter(PathContainer& paths) { - PathContainer result; - - for (size_t i = 0; i < paths.size(); ++i) { - if (predicate(*paths.Get(i)) || predicate(*paths.GetConjugate(i))) { - result.AddPair(paths.Get(i), paths.GetConjugate(i)); - } - } - - return 
result; - } - -}; - - -class IdFilter: public CopyOnWritePathFilter { - -protected: - std::set ids; +typedef func::AbstractPredicate AbstractPathCondition; +class EmptyPathCondition: public AbstractPathCondition { public: + EmptyPathCondition() {} - IdFilter(const Graph& g_, std::set ids_): CopyOnWritePathFilter(g_), ids(ids_) { - } - - virtual bool predicate(BidirectionalPath& path) { - return ids.count(path.GetId()) > 0; - } -}; - - -class DuplicateFilter { - -protected: - const Graph& g; - -public: - DuplicateFilter(const Graph& g_): g(g_) { - } - - PathContainer filter(PathContainer& paths) { - PathContainer result; - - for (size_t i = 0; i < paths.size(); ++i) { - bool duplicate = false; - for (size_t j = 0; j < result.size(); ++j) { - if (result[j] == paths[j]) - duplicate = true; - } - if (!duplicate) { - result.AddPair(paths.Get(i), paths.GetConjugate(i)); - } - } - - return result; + bool Check(checked_type p) const override { + return p.Empty(); } - }; -class ErasingPathFilter { - -protected: - const Graph& g; - +class LengthPathCondition: public AbstractPathCondition { + size_t min_length_; public: - ErasingPathFilter(const Graph& g_): g(g_) { - } - - virtual bool predicate(BidirectionalPath& path) = 0; + LengthPathCondition(size_t min_length): min_length_(min_length) {} - void filter(PathContainer& paths) { - for (PathContainer::Iterator iter = paths.begin(); iter != paths.end(); ) { - if (predicate(*iter.get()) || predicate(*iter.getConjugate())) { - iter = paths.erase(iter); - } - else { - ++iter; - } - } + bool Check(checked_type p) const override { + return p.Length() <= min_length_; } - }; - -class CoveragePathFilter: public ErasingPathFilter { - -protected: - double minCoverage; +class CoveragePathCondition: public AbstractPathCondition { + const Graph& g_; + double cov_; public: - CoveragePathFilter(Graph& g_, double cov): ErasingPathFilter(g_), minCoverage(cov) { - - } + CoveragePathCondition(const Graph& g, double cov): g_(g), cov_(cov) {} - virtual bool predicate(BidirectionalPath& path) { - for (size_t i = 0; i < path.Size(); ++i) { - if (math::ls(g.coverage(path[i]), minCoverage)) { - return true; - } + bool Check(checked_type p) const override { + for (size_t i = 0; i < p.Size(); ++i) { + if (math::gr(g_.coverage(p[i]), cov_)) + return false; } - return false; + return true; } }; - -class LengthPathFilter: public ErasingPathFilter { - -protected: - size_t minLength; - -public: - LengthPathFilter(const Graph& g_, size_t len): ErasingPathFilter(g_), minLength(len) { - } - - virtual bool predicate(BidirectionalPath& path) { - return path.Length() <= minLength; - } -}; - - -class IsolatedPathFilter: public ErasingPathFilter { - -protected: - size_t min_length_; - - double min_cov_; - +class IsolatedPathCondition: public AbstractPathCondition { + const Graph& g_; public: - IsolatedPathFilter(const Graph& g_, size_t min_length, double min_cov = 10000000.0): - ErasingPathFilter(g_), - min_length_(min_length), - min_cov_(min_cov) { - } + IsolatedPathCondition(const Graph& g): g_(g) {} - virtual bool predicate(BidirectionalPath& path) { - if (path.Empty()) + bool Check(checked_type p) const override { + if (p.Empty()) return true; - if (path.Size() <= 2) { - auto v1 = g.EdgeStart(path.Front()); - auto v2 = g.EdgeEnd(path.Back()); - - return g.IncomingEdgeCount(v1) == 0 && - g.OutgoingEdgeCount(v2) == 0 && - path.Length() < min_length_ && - math::ls(path.Coverage(), min_cov_); + if (p.Size() <= 2) { + auto v1 = g_.EdgeStart(p.Front()); + auto v2 = g_.EdgeEnd(p.Back()); + + 
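// i.e. a short (<= 2 edge) path is isolated when nothing enters its first
// vertex and nothing leaves its last one; the old in-place length/coverage
// thresholds were dropped from this predicate (coverage/length cutoffs now come
// from the path_cleaning config sections instead).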
return g_.IncomingEdgeCount(v1) == 0 && + g_.OutgoingEdgeCount(v2) == 0; } return false; } diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/path_visualizer.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/path_visualizer.hpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/path_visualizer.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/path_visualizer.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -38,7 +38,7 @@ if (labels_.count(path->At(j)) > 0) { labels_[path->At(j)] += ", "; } - labels_[path->At(j)] += "(" + ToString(path->GetId()) + " : " + ToString(j) + ")"; + labels_[path->At(j)] += "(" + std::to_string(path->GetId()) + " : " + std::to_string(j) + ")"; } path = paths.GetConjugate(i); @@ -46,7 +46,7 @@ if (labels_.count(path->At(j)) > 0) { labels_[path->At(j)] += ", "; } - labels_[path->At(j)] += "(" + ToString(path->GetId()) + " : " + ToString(j) + ")"; + labels_[path->At(j)] += "(" + std::to_string(path->GetId()) + " : " + std::to_string(j) + ")"; } } } diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/pe_config_struct.cpp spades-3.11.1+dfsg/src/common/modules/path_extend/pe_config_struct.cpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/pe_config_struct.cpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/pe_config_struct.cpp 2017-09-28 09:05:10.000000000 +0000 @@ -10,6 +10,28 @@ namespace path_extend { +//convert string to vector of words separated by space +std::vector StringToVector(const std::string& s) { + std::string word = + ""; + std::vector res; + for (size_t i = 0; i < s.length(); ++i) { + if (s[i] == ' ') { + if (word != "") { + res.push_back(word); + word = ""; + } + } + else { + word += s[i]; + + } + } + if (word != "") { + res.push_back(word); + } + return res; +} void load(scaffolding_mode &sm, boost::property_tree::ptree const& pt, std::string const& key, bool complete) { if (complete || pt.find(key) != pt.not_found()) { @@ -76,14 +98,12 @@ load(so.use_la_gap_joiner , pt, "use_la_gap_joiner", complete); load(so.min_gap_score , pt, "min_gap_score", complete); - load(so.max_must_overlap , pt, "max_must_overlap", complete); load(so.max_can_overlap , pt, "max_can_overlap", complete); load(so.short_overlap , pt, "short_overlap", complete); load(so.artificial_gap , pt, "artificial_gap", complete); - load(so.use_old_score , pt, "use_old_score", complete); load(so.min_overlap_length, pt, "min_overlap_length", complete); - load(so.flank_addition_coefficient, pt, "flank_addition_coefficient", complete); load(so.flank_multiplication_coefficient, pt, "flank_multiplication_coefficient", complete); + load(so.flank_addition_coefficient, pt, "flank_addition_coefficient", complete); load(so.var_coeff , pt, "var_coeff", complete); load(so.basic_overlap_coeff, pt, "basic_overlap_coeff", complete); @@ -104,7 +124,11 @@ if (pf.enabled) { load(pf.min_length , pt, "min_length" , complete); load(pf.isolated_min_length , pt, "isolated_min_length" , complete); + load(pf.isolated_min_cov , pt, "isolated_min_cov" , complete); load(pf.min_length_for_low_covered , pt, "min_length_for_low_covered" , complete); + load(pf.rel_cutoff , pt, "rel_cutoff" , complete); + load(pf.rel_isolated_cutoff , pt, "rel_isolated_cutoff" , complete); + load(pf.rel_low_covered_cutoff , pt, "rel_low_covered_cutoff" , complete); load(pf.min_coverage , pt, "min_coverage" , complete); } } @@ -115,14 +139,34 @@ using config_common::load; load(gcc.max_gap , pt, "max_gap" , complete); 
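// Newly added in 3.11 below: use_main_storage, unresolvable_jump and
// unique_length join max_gap/relative_max_gap in the genome consistency
// checker configuration (declared in pe_config_struct.hpp further down).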
load(gcc.relative_max_gap , pt, "relative_max_gap" , complete); + load(gcc.use_main_storage , pt, "use_main_storage" , complete); + load(gcc.unresolvable_jump , pt, "unresolvable_jump" , complete); + load(gcc.unique_length , pt, "unique_length" , complete); + +} + +void load(pe_config::ParamSetT::OverlapRemovalOptionsT& ors, + boost::property_tree::ptree const& pt, bool complete) { + using config_common::load; + load(ors.enabled, pt, "enabled" , complete); + load(ors.end_start_only, pt, "end_start_only" , complete); + load(ors.cut_all, pt, "cut_all" , complete); +} + +void load(pe_config::ParamSetT::SimpleCoverageResolver& scr, + boost::property_tree::ptree const& pt, bool complete) +{ + using config_common::load; + load(scr.enabled , pt, "enabled" , complete); + load(scr.coverage_delta , pt, "coverage_delta" , complete); + load(scr.min_upper_coverage , pt, "min_upper_coverage" , complete); } void load(pe_config::ParamSetT& p, boost::property_tree::ptree const& pt, bool complete) { using config_common::load; load(p.sm, pt, "scaffolding_mode", complete); load(p.normalize_weight, pt, "normalize_weight", complete); - load(p.cut_all_overlaps, pt, "cut_all_overlaps", complete); - load(p.remove_overlaps, pt, "remove_overlaps", complete); + load(p.overlap_removal, pt, "overlap_removal", complete); load(p.multi_path_extend, pt, "multi_path_extend", complete); load(p.split_edge_length, pt, "split_edge_length", complete); load(p.extension_options, pt, "extension_options", complete); @@ -132,10 +176,20 @@ load(p.use_coordinated_coverage, pt, "use_coordinated_coverage", complete); load(p.scaffolding2015, pt, "scaffolding2015", complete); load(p.scaffold_graph_params, pt, "scaffold_graph", complete); - load(p.path_filtration, pt, "path_cleaning", complete); + + string path_cleaning_presets; + load(path_cleaning_presets, pt, "path_cleaning_presets", complete); + auto presets = StringToVector(path_cleaning_presets); + for (auto &key : presets) { + pe_config::ParamSetT::PathFiltrationT path_filtration; + std::string config_key = key == "default" ? 
"path_cleaning" : key + "_path_cleaning"; + load(path_filtration, pt, config_key, complete); + p.path_filtration[key] = path_filtration; + } load(p.genome_consistency_checker, pt, "genome_consistency_checker", complete); load(p.uniqueness_analyser, pt, "uniqueness_analyser", complete); load(p.loop_traversal, pt, "loop_traversal", complete); + load(p.simple_coverage_resolver, pt, "simple_coverage_resolver", complete); } void load(pe_config::LongReads& p, boost::property_tree::ptree const& pt, diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/pe_config_struct.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/pe_config_struct.hpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/pe_config_struct.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/pe_config_struct.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -108,8 +108,18 @@ size_t split_edge_length; bool multi_path_extend; - bool remove_overlaps; - bool cut_all_overlaps; + + struct OverlapRemovalOptionsT { + bool enabled; + bool end_start_only; + bool cut_all; + } overlap_removal; + + struct SimpleCoverageResolver { + bool enabled; + double coverage_delta; + double min_upper_coverage; + } simple_coverage_resolver; struct ExtensionOptionsT { bool use_default_single_threshold; @@ -134,19 +144,16 @@ bool use_la_gap_joiner; double min_gap_score; - double max_must_overlap; double max_can_overlap; int short_overlap; size_t artificial_gap; - bool use_old_score; - double var_coeff; double basic_overlap_coeff; size_t min_overlap_length; - double flank_addition_coefficient; double flank_multiplication_coefficient; + int flank_addition_coefficient; boost::optional min_overlap_for_rna_scaffolding; } scaffolder_options; @@ -154,11 +161,16 @@ struct PathFiltrationT { bool enabled; size_t min_length; + double rel_cutoff; size_t isolated_min_length; + double isolated_min_cov; + double rel_isolated_cutoff; size_t min_length_for_low_covered; + double rel_low_covered_cutoff; double min_coverage; - } path_filtration; + }; + std::map path_filtration; bool use_coordinated_coverage; @@ -191,6 +203,9 @@ struct GenomeConsistencyCheckerParamsT { size_t max_gap; double relative_max_gap; + bool use_main_storage; + size_t unresolvable_jump; + size_t unique_length; } genome_consistency_checker; struct LoopTraversalParamsT { diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/pe_resolver.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/pe_resolver.hpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/pe_resolver.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/pe_resolver.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -12,455 +12,261 @@ * Author: andrey */ -#ifndef PE_RESOLVER_HPP_ -#define PE_RESOLVER_HPP_ +#pragma once #include "path_extender.hpp" namespace path_extend { +typedef const BidirectionalPath * PathPtr; +typedef unordered_map> SplitsStorage; -class SimpleOverlapRemover { +inline void PopFront(BidirectionalPath * const path, size_t cnt) { + path->GetConjPath()->PopBack(cnt); +} + +class OverlapRemover { + const Graph &g_; + const PathContainer &paths_; + const OverlapFindingHelper helper_; + SplitsStorage splits_; + + bool AlreadyAdded(PathPtr ptr, size_t pos) const { + auto it = splits_.find(ptr); + return it != splits_.end() && it->second.count(pos); + } + + //TODO if situation start ==0 && end==p.Size is not interesting then code can be simplified + bool AlreadyAdded(const BidirectionalPath &p, size_t start, size_t end) const { + if (start == 
0 && AlreadyAdded(&p, end)) + return true; + if (end == p.Size() && AlreadyAdded(p.GetConjPath(), p.Size() - start)) + return true; + return false; + } + + //NB! This can only be launched over paths taken from path container! + size_t AnalyzeOverlaps(const BidirectionalPath &path, const BidirectionalPath &other, + bool end_start_only, bool retain_one_copy) const { + VERIFY(!retain_one_copy || !end_start_only); + auto range_pair = helper_.FindOverlap(path, other, end_start_only); + size_t overlap = range_pair.first.size(); + auto other_range = range_pair.second; -public: - SimpleOverlapRemover(const Graph& g, GraphCoverageMap& cm) - : g_(g), coverage_map_(cm) { - } - - void RemoveOverlaps(PathContainer& paths) const { - for (size_t i = 0; i < paths.size(); i++) { - FindAndRemovePathOverlap(paths, paths.Get(i)); - FindAndRemovePathOverlap(paths, paths.GetConjugate(i)); - } - } - - size_t NonUniqueCommon(BidirectionalPath * path, int pos1, int pos2) { - size_t answer = 0; - while (pos1 >= 0) { - if (path->At(pos1) == path->At(pos2)) { - pos1--; - pos2--; - answer++; - } else { - break; - } + if (overlap == 0) { + return 0; } - return answer; - } - size_t MaximumNonUniqueSuffix(BidirectionalPath * path) { - if (path->Size() == 0) { + //checking if region on the other path has not been already added + //TODO discuss if the logic is needed/correct. It complicates the procedure and prevents trivial parallelism. + if (retain_one_copy && + AlreadyAdded(other, + other_range.start_pos, + other_range.end_pos) && + /*forcing "cut_all" behavior on conjugate paths*/ + &other != path.GetConjPath() && + /*certain overkill*/ + &other != &path) { return 0; } - size_t answer = 0; - EdgeId back = path->Back(); - vector all_pos = path->FindAll(back); - for (size_t i = 0; i < all_pos.size() - 1; ++i) { - answer = std::max(answer, NonUniqueCommon(path, (int) all_pos[i], (int) path->Size() - 1)); - } - return answer; - } - - void CutNonUniqueSuffix(PathContainer& paths) { - vector> tmp_paths(paths.begin(), paths.end()); - for (auto it = tmp_paths.begin(); it != tmp_paths.end(); ++it) { - BidirectionalPath * path1 = it->first; - BidirectionalPath * path2 = it->second; - size_t longest_suffix1 = MaximumNonUniqueSuffix(path1); - path1->PopBack(longest_suffix1); - size_t longest_suffix2 = MaximumNonUniqueSuffix(path2); - path2->PopBack(longest_suffix2); + if (&other == &path) { + if (overlap == path.Size()) + return 0; + overlap = std::min(overlap, other_range.start_pos); } - } - void CutPseudoSelfConjugatePaths(PathContainer& paths) { - vector> tmp_paths(paths.begin(), paths.end()); - for (auto it = tmp_paths.begin(); it != tmp_paths.end(); ++it) { - BidirectionalPath * path1 = it->first; - BidirectionalPath * path2 = it->second; - bool ups = false; - if(path1 != path2) { - size_t last = 0; - while(last < path1->Size() && path1->operator [](last) == path2->operator [](last)) { - last++; - } - if(last > 0) { - AddOverlap(paths, path1, 0, last - 1); - path1->PopBack(last); - path2->PopBack(last); - } - } - if(ups) path1->Print(); + if (&other == path.GetConjPath()) { + overlap = std::min(overlap, other.Size() - other_range.end_pos); } + + DEBUG("First " << overlap << " edges of the path will be removed"); + DEBUG(path.str()); + DEBUG("Due to overlap with path"); + DEBUG(other.str()); + DEBUG("Range " << other_range); + + return overlap; } - void RemoveSimilarPaths(PathContainer& paths, size_t min_edge_len, size_t max_path_diff, bool del_only_equal, bool del_subpaths, bool del_begins, bool del_all, bool 
add_overlap_begins) const { - DEBUG("== Removing similar paths =="); - DEBUG("Min edge len " << min_edge_len << ", max path diff " << max_path_diff) - DEBUG("Only equal " << del_only_equal << ", subpaths " << del_subpaths << ", starts " << del_begins << ", all " << del_all << ", add starts " << add_overlap_begins); - std::vector edges = GetSortedEdges(); - for (size_t edgeIndex = 0; edgeIndex < edges.size(); ++edgeIndex) { - EdgeId edge = edges.at(edgeIndex); - BidirectionalPathSet cov_paths = coverage_map_.GetCoveringPaths(edge); - std::vector cov_vect(cov_paths.begin(), cov_paths.end()); - std::sort(cov_vect.begin(), cov_vect.end(), PathIdCompare); - for (size_t vect_i = 0; vect_i < cov_vect.size(); ++vect_i) { - BidirectionalPath* path1 = cov_vect.at(vect_i); - if (cov_paths.find(path1) == cov_paths.end()) { - continue; - } - for (size_t vect_i1 = vect_i + 1; vect_i1 < cov_vect.size(); ++vect_i1) { - BidirectionalPath* path2 = cov_vect.at(vect_i1); - if (path1 == path2 || path1 == path2->GetConjPath()) { - continue; - } - if (cov_paths.find(path2) == cov_paths.end()) - continue; - if ((*path1) == (*path2)) { - if (path2->IsOverlap()) { - path1->SetOverlap(true); - } - DEBUG("Removing path " << path2->GetId() << " because of path " << path1->GetId()); - path2->Print(); - path1->Print(); - path2->Clear(); - cov_paths = coverage_map_.GetCoveringPaths(edge); - continue; - } - if (g_.length(edge) <= min_edge_len || path1->IsOverlap() || path2->IsOverlap() || del_only_equal) { - continue; - } - CompareAndCut(paths, edge, path1, path2, max_path_diff, - del_subpaths, del_begins, del_all, add_overlap_begins); - cov_paths = coverage_map_.GetCoveringPaths(edge); - } + void MarkStartOverlaps(const BidirectionalPath &path, bool end_start_only, bool retain_one_copy) { + set overlap_poss; + for (PathPtr candidate : helper_.FindCandidatePaths(path)) { + size_t overlap = AnalyzeOverlaps(path, *candidate, + end_start_only, retain_one_copy); + if (overlap > 0) { + overlap_poss.insert(overlap); } } - DEBUG("== Emd removing similar paths =="); - } - -private: - - void SubscribeCoverageMap(BidirectionalPath* path) const { - path->Subscribe(&coverage_map_); - for (size_t i = 0; i < path->Size(); ++i) { - coverage_map_.BackEdgeAdded(path->At(i), path, path->GapAt(i)); + if (!overlap_poss.empty()) { + utils::insert_all(splits_[&path], overlap_poss); } } - void CompareAndCut(PathContainer& paths, EdgeId edge, BidirectionalPath* path1, BidirectionalPath* path2, - size_t max_path_diff, - bool del_subpaths, bool del_begins, - bool del_all, bool add_overlap_begins) const { - vector positions1 = path1->FindAll(edge); - vector positions2 = path2->FindAll(edge); - size_t i1 = 0; - size_t i2 = 0; - bool renewed = false; - while (i1 < positions1.size()) { - while (i2 < positions2.size()) { - DEBUG("CompareAndCutFromPos paths " << g_.int_id(edge)); - CompareAndCutFromPos(paths, path1, (int) positions1[i1], path2, - (int) positions2[i2], max_path_diff, - del_subpaths, del_begins, del_all, add_overlap_begins); - - if (positions1[i1] >= path1->Size() || path1->At(positions1[i1]) != edge || positions2[i2] >= path2->Size() || path2->At(positions2[i2]) != edge) { - vector new_positions1 = path1->FindAll(edge); - vector new_positions2 = path2->FindAll(edge); - - if (new_positions1.size() == positions1.size() && new_positions2.size() == positions2.size()) { - return; - } - else { - positions1 = new_positions1; - positions2 = new_positions2; - i1 = 0; - i2 = 0; - renewed = true; - break; - } - ++i2; - } - ++i2; - } - - if 
(renewed) { - renewed = false; + void InnerMarkOverlaps(bool end_start_only, bool retain_one_copy) { + for (auto path_pair: paths_) { + //TODO think if this "optimization" is necessary + if (path_pair.first->Size() == 0) continue; - } - ++i1; + MarkStartOverlaps(*path_pair.first, end_start_only, retain_one_copy); + MarkStartOverlaps(*path_pair.second, end_start_only, retain_one_copy); } } - void CompareAndCutFromPos(PathContainer& paths, BidirectionalPath* path1, int pos1, - BidirectionalPath* path2, int pos2, - size_t max_path_diff, - bool delete_subpaths, bool delete_begins, - bool delete_all, bool add_overlap_begins) const { - int last2 = pos2; - int last1 = pos1; - if (last1 >= (int) path1->Size() || last2 >= (int) path2->Size()) { - return; - } - vector other_path_end; - pair posRes = ComparePaths(last1, last2, *path1, *path2, max_path_diff); - last1 = posRes.first; - last2 = posRes.second; - BidirectionalPath* conj1 = path1->GetConjPath(); - BidirectionalPath* conj2 = path2->GetConjPath(); - size_t first1 = conj1->Size() - pos1 - 1; - size_t first2 = conj2->Size() - pos2 - 1; - posRes = ComparePaths(first1, first2, *conj1, *conj2, max_path_diff); - first2 = conj2->Size() - posRes.second - 1; - first1 = conj1->Size() - posRes.first - 1; - if ((int)path2->LengthAt(last2) - (int)g_.length(path2->At(last2)) < (int) max_path_diff) { - last2 = (int)path2->Size() - 1; - } - if ((int)path2->Length() - (int)path2->LengthAt(first2) < (int) max_path_diff) { - first2 = 0; - } - if ((int)path1->LengthAt(last1) - (int)g_.length(path1->At(last1)) < (int) max_path_diff) { - last1 = (int)path1->Size() - 1; - } - if ((int)path1->Length() - (int)path1->LengthAt(first1) < (int) max_path_diff) { - first1 = 0; - } - - CutOverlaps(paths, path1, first1, last1, path1->Size(), path2, - first2, last2, path2->Size(), delete_subpaths, - delete_begins, delete_all, add_overlap_begins); - } - - void AddOverlap(PathContainer& paths, BidirectionalPath* path1, size_t first1, size_t last1) const { - BidirectionalPath* overlap = new BidirectionalPath(path1->SubPath(first1, last1 + 1)); - BidirectionalPath* conj_overlap = new BidirectionalPath(overlap->Conjugate()); - SubscribeCoverageMap(overlap); - SubscribeCoverageMap(conj_overlap); - paths.AddPair(overlap, conj_overlap); - } - - bool CutOverlaps(PathContainer& paths, BidirectionalPath* path1, size_t first1, size_t last1, size_t size1, BidirectionalPath* path2, size_t first2, - size_t last2, size_t size2, bool del_subpaths, bool del_begins, bool del_all, bool add_overlap_begins) const { - if (first1 == 0 && last1 == size1 - 1 && del_subpaths) { - DEBUG("Removing path " << path1->GetId() << " because of path " << path2->GetId()); - path1->Print(); - path2->Print(); - path1->Clear(); - } else if (first2 == 0 && last2 == size2 - 1 && del_subpaths) { - DEBUG("Removing path " << path2->GetId() << " because of path " << path1->GetId()); - path2->Print(); - path1->Print(); - path2->Clear(); - } else if (first2 == 0 && first1 == 0 && del_begins) { - DEBUG("Path " << path1->GetId() << ", len " << path1->Length() << " and path " << path2->GetId() << ", len " << path2->Length() << " have similar starts"); - DEBUG("Path 1: " << last1 << " edges of length " << path1->Length() - path1->LengthAt(min(last1 + 1, path1->Size() - 1))); - DEBUG("Path 2: " << last2 << " edges of length " << path2->Length() - path2->LengthAt(min(last2 + 1, path2->Size() - 1))); - DEBUG("Path 1 has overlap start " << path1->HasOverlapedBegin() << ", path 2 has overlap start " << path2->HasOverlapedBegin()); 
- - if (add_overlap_begins) { - AddOverlap(paths, path1, first1, last1); - DEBUG("Detaching overlap " << path2->GetId() << " and " << path1->GetId()); - path2->Print(); - path1->Print(); - path1->GetConjPath()->PopBack(last1 + 1); - path2->GetConjPath()->PopBack(last2 + 1); - } else if (path1->Length() < path2->Length()) { - DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId()); - path1->Print(); - path2->Print(); - path1->GetConjPath()->PopBack(last1 + 1); - } else { - DEBUG("Detaching overlap from " << path2->GetId() << " because of "<< path1->GetId()); - path2->Print(); - path1->Print(); - path2->GetConjPath()->PopBack(last2 + 1); - } - } else if ((last1 == size1 - 1 && last2 == size2 - 1) && del_begins) { - DEBUG("Path " << path1->GetId() << ", len " << path1->Length() << " and path " << path2->GetId() << ", len " << path2->Length() << " have similar ends"); - DEBUG("Path 1: " << path1->Size() - first1 << " edges of length " << path1->LengthAt(first1)); - DEBUG("Path 2: " << path2->Size() - first2 << " edges of length " << path2->LengthAt(first2)); - DEBUG("Path 1 has overlap end " << path1->HasOverlapedEnd() << ", path 2 has overlap end " << path2->HasOverlapedEnd()); - - if (add_overlap_begins){ - AddOverlap(paths, path1, first1, last1); - DEBUG("Detaching overlap " << path2->GetId() << " and " << path1->GetId()); - path2->Print(); - path1->Print(); - path1->PopBack(last1 + 1 - first1); - path2->PopBack(last2 + 1 - first2); - } - if (path1->Length() < path2->Length()) { - DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId()); - path1->Print(); - path2->Print(); - path1->PopBack(last1 + 1 - first1); - } else { - DEBUG("Detaching overlap from " << path2->GetId() << " because of "<< path1->GetId()); - path2->Print(); - path1->Print(); - path2->PopBack(last2 + 1 - first2); - } - } else if (first2 == 0 && del_all) { - DEBUG("Detaching overlap from " << path2->GetConjPath()->GetId() << " because of "<< path1->GetId()); - DEBUG("Does it have overlap in the beginning: " << path2->HasOverlapedBegin()); - path2->Print(); - DEBUG(" >>>> ") - path1->Print(); - DEBUG(" ==== "); - path2->GetConjPath()->PopBack(last2 + 1); - } else if (last2 == size2 - 1 && del_all) { - DEBUG("Detaching overlap from " << path2->GetId() << " because of "<< path1->GetId()); - DEBUG("Does it have overlap in the end: " << path2->HasOverlapedEnd()); - path2->Print(); - DEBUG(" >>>> ") - path1->Print(); - DEBUG(" ==== "); - path2->PopBack(last1 + 1 - first1); - } else if (first1 == 0 && del_all) { - DEBUG("Detaching overlap from " << path1->GetConjPath()->GetId() << " because of "<< path2->GetId()); - DEBUG("Does it have overlap in the end: " << path1->HasOverlapedBegin()); - path1->Print(); - DEBUG(" >>>> ") - path2->Print(); - DEBUG(" ==== "); - path1->GetConjPath()->PopBack(last1 + 1); - } else if (last1 == size1 - 1 && del_all) { - DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId()); - DEBUG("Does it have overlap in the end: " << path1->HasOverlapedBegin()); - path1->Print(); - DEBUG(" >>>> ") - path2->Print(); - DEBUG(" ==== "); - path1->PopBack(last1 + 1 - first1); - } else { - return false; - } - return true; - } - - std::vector GetSortedEdges() const { - std::set edges_set; - for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) { - edges_set.insert(*iter); - edges_set.insert(g_.conjugate(*iter)); +public: + OverlapRemover(const Graph &g, + const PathContainer &paths, + GraphCoverageMap &coverage_map, + size_t 
min_edge_len,// = 0, + size_t max_diff) :// = 0) : + g_(g), + paths_(paths), + helper_(g, coverage_map, + min_edge_len, max_diff) { + } + + //Note that during start/end removal all repeat instance have to be cut +// void MarkOverlaps(bool end_start_only = false, bool retain_one_copy = true) { + void MarkOverlaps(bool end_start_only, bool retain_one_copy) { + VERIFY(!end_start_only || !retain_one_copy); + INFO("Marking start/end overlaps"); + InnerMarkOverlaps(/*end/start overlaps only*/ true, /*retain one copy*/ false); + if (!end_start_only) { + INFO("Marking remaining overlaps"); + InnerMarkOverlaps(/*end/start overlaps only*/ false, retain_one_copy); } - std::vector edges(edges_set.begin(), edges_set.end()); - std::sort(edges.begin(), edges.end(), EdgeLengthAndIdComparator(g_)); - return edges; } - bool HasAlreadyOverlapedEnd(BidirectionalPath * path) const { - return !path->IsOverlap() and path->HasOverlapedEnd(); + const SplitsStorage& overlaps() const { + return splits_; } - bool HasAlreadyOverlapedBegin(BidirectionalPath * path) const { - return !path->IsOverlap() and path->HasOverlapedBegin(); - } +private: + DECL_LOGGER("OverlapRemover"); +}; - bool IsSamePath(BidirectionalPath * path1, - BidirectionalPath * path2) const { - return *path2 == *path1 or *path2 == *path1->GetConjPath(); - } - - void RemoveOverlap(PathContainer& paths, BidirectionalPath* path1, - BidirectionalPath* path2, size_t overlap_size) const { - BidirectionalPath* conj2 = path2->GetConjPath(); - if (path1->IsOverlap() && overlap_size == path1->Size()) { - DEBUG("Detaching overlap from " << path2->GetConjPath()->GetId() << " because of "<< path1->GetId()); - path2->Print(); - path1->Print(); - conj2->PopBack(overlap_size); - path2->SetOverlapedBeginTo(path1); - } else if (path2->IsOverlap() && path2->Size() == overlap_size) { - DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId()); - path1->Print(); - path2->Print(); - path1->PopBack(overlap_size); - path1->SetOverlapedEndTo(path2); - } else if (overlap_size < path2->Size() - && overlap_size < path1->Size()) { - BidirectionalPath *overlap = new BidirectionalPath(g_, path1->Back()); - BidirectionalPath *conj_overlap = new BidirectionalPath(g_, g_.conjugate(path1->Back())); - SubscribeCoverageMap(overlap); - SubscribeCoverageMap(conj_overlap); - paths.AddPair(overlap, conj_overlap); - DEBUG("Detaching overlap " << path1->GetId() << " and " << conj2->GetId()); - path1->Print(); - conj2->Print(); - path1->PopBack(); - conj2->PopBack(); - - for (size_t i = 1; i < overlap_size; ++i) { - conj_overlap->PushBack(g_.conjugate(path1->Back())); - path1->PopBack(); - conj2->PopBack(); - } - overlap->SetOverlap(true); - path1->SetOverlapedEndTo(overlap); - path2->SetOverlapedBeginTo(overlap); +class PathSplitter { + const SplitsStorage splits_; + PathContainer &paths_; + GraphCoverageMap &coverage_map_; + + set TransformConjSplits(PathPtr p) const { + set path_splits; + size_t path_len = p->Size(); + auto it = splits_.find(p); + if (it != splits_.end()) { +// std::transform(it->second.begin(), it->second.end(), +// std::inserter(path_splits, path_splits.end()), +// [=] (size_t pos) {return path_len - pos;}); + for (size_t pos : it->second) { + path_splits.insert(path_len - pos); + } + } + return path_splits; + } + + set GatherAllSplits(const PathPair &pp) const { + VERIFY(pp.first->Size() == pp.second->Size()); + set path_splits = TransformConjSplits(pp.second); + auto it = splits_.find(pp.first); + if (it != splits_.end()) { + 
utils::insert_all(path_splits, it->second); + } + return path_splits; + } + + void SplitPath(BidirectionalPath * const p, const set &path_splits) { + size_t start_pos = 0; + for (size_t split_pos : path_splits) { + if (split_pos == 0) + continue; + if (split_pos == p->Size()) + break; + AddPath(paths_, p->SubPath(start_pos, split_pos), coverage_map_); + start_pos = split_pos; } + PopFront(p, start_pos); } - void FindAndRemovePathOverlap(PathContainer& all_paths, - BidirectionalPath* path1) const { - int last = (int) path1->Size() - 1; - if (last <= 0 or coverage_map_.GetCoverage(path1->At(last)) <= 1) { - return; - } - BidirectionalPathSet paths = - coverage_map_.GetCoveringPaths(path1->At(last)); - BidirectionalPath* overlap_path = NULL; - size_t overlap_size = 0; - for (auto path_iter = paths.begin(); path_iter != paths.end(); - ++path_iter) { - if (IsSamePath(*path_iter, path1)) { +public: + PathSplitter(const SplitsStorage &splits, + PathContainer &paths, + GraphCoverageMap &coverage_map) : + splits_(splits), + paths_(paths), + coverage_map_(coverage_map) {} + + void Split() { + vector tmp_paths(paths_.begin(), paths_.end()); + for (auto path_pair: tmp_paths) { + SplitPath(path_pair.first, GatherAllSplits(path_pair)); + } + } + +private: + DECL_LOGGER("PathSplitter"); +}; + +class PathDeduplicator { + const Graph& g_; + PathContainer &paths_; + const bool equal_only_; + const OverlapFindingHelper helper_; + + bool IsRedundant(PathPtr path) const { + TRACE("Checking if path redundant " << path->GetId()); + for (auto candidate : helper_.FindCandidatePaths(*path)) { + TRACE("Considering candidate " << candidate->GetId()); +// VERIFY(candidate != path && candidate != path->GetConjPath()); + if (candidate == path || candidate == path->GetConjPath()) continue; + if (equal_only_ ? helper_.IsEqual(*path, *candidate) : helper_.IsSubpath(*path, *candidate)) { + return true; } - size_t over_size = path1->OverlapEndSize(*path_iter); - if (over_size > overlap_size) { - overlap_size = over_size; - overlap_path = *path_iter; - } else if (over_size == overlap_size && - (overlap_path == NULL || (*path_iter)->GetId() < overlap_path->GetId())) { - overlap_path = *path_iter; - } - } - if (overlap_path == NULL) { - return; - } - if (overlap_size > 0) { - RemoveOverlap(all_paths, path1, overlap_path, overlap_size); } + return false; } - class EdgeLengthAndIdComparator { - public: - EdgeLengthAndIdComparator(const Graph& g) - : g_(g) { - } - bool operator()(const EdgeId& e1, const EdgeId& e2) const { - if (g_.length(e1) > g_.length(e2)) { - return true; - } - if (g_.length(e2) > g_.length(e1)) { - return false; +public: + + PathDeduplicator(const Graph &g, + PathContainer &paths, + GraphCoverageMap &coverage_map, + size_t min_edge_len, + size_t max_diff, + bool equal_only) : + g_(g), + paths_(paths), + equal_only_(equal_only), + helper_(g, coverage_map, min_edge_len, max_diff) {} + + //TODO use path container filtering? 
+ void Deduplicate() { + for (auto path_pair : paths_) { + auto path = path_pair.first; + if (IsRedundant(path)) { + TRACE("Clearing path " << path->str()); + path->Clear(); } - return e1.int_id() < e2.int_id(); } - private: - const Graph& g_; - }; + } - const Graph& g_; - GraphCoverageMap& coverage_map_; -protected: - DECL_LOGGER("PEResolver") +private: + DECL_LOGGER("PathDeduplicator"); }; +inline void Deduplicate(const Graph &g, PathContainer &paths, GraphCoverageMap &coverage_map, + size_t min_edge_len, size_t max_path_diff, + bool equal_only = false) { + //add sorting to guarantee survival of longest paths if max_path_diff used + //paths.SortByLength(false); + PathDeduplicator deduplicator(g, paths, coverage_map, min_edge_len, max_path_diff, equal_only); + deduplicator.Deduplicate(); + paths.FilterEmptyPaths(); +} + class PathExtendResolver { -protected: const Graph& g_; size_t k_; @@ -471,107 +277,59 @@ PathContainer MakeSimpleSeeds() const { std::set included; PathContainer edges; - for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) { - if (g_.int_id(*iter) <= 0 or InTwoEdgeCycle(*iter, g_)) + for (auto iter = g_.ConstEdgeBegin(/*canonical only*/true); !iter.IsEnd(); ++iter) { + EdgeId e = *iter; + if (g_.int_id(e) <= 0 || InTwoEdgeCycle(e, g_)) continue; - if (included.count(*iter) == 0) { - BidirectionalPath * first = new BidirectionalPath(g_, *iter); - BidirectionalPath * second = new BidirectionalPath(g_, g_.conjugate(*iter)); - edges.AddPair(first,second); - included.insert(*iter); - included.insert(g_.conjugate(*iter)); - } + edges.AddPair(new BidirectionalPath(g_, e), new BidirectionalPath(g_, g_.conjugate(e))); } return edges; } - PathContainer ExtendSeeds(PathContainer &seeds, ContigsMaker &pathExtender) const { + PathContainer ExtendSeeds(PathContainer &seeds, CompositeExtender &composite_extender) const { PathContainer paths; - pathExtender.GrowAll(seeds, paths); + composite_extender.GrowAll(seeds, paths); return paths; } - void RemoveEqualPaths(PathContainer &paths, GraphCoverageMap &coverage_map, - size_t min_edge_len) const { - - SimpleOverlapRemover remover(g_, coverage_map); - remover.RemoveSimilarPaths(paths, min_edge_len, min_edge_len, true, false, false, false, false); - } - - void RemoveRNAOverlaps(PathContainer& paths, GraphCoverageMap& coverage_map, - size_t min_edge_len, size_t max_path_diff) const { - - SimpleOverlapRemover remover(g_, coverage_map); - remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, true, false, false, false, false); - - remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, false, true, false, false, false); - - remover.RemoveOverlaps(paths); - - remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, true, false, false, false, false); - } - + //Paths should be deduplicated first! 
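// Deduplicate() clears every path that IsRedundant() flags: with equal_only_
// set, only exact copies of another path are dropped; otherwise any path that
// is a subpath of a surviving candidate is cleared too, and the emptied pairs
// are purged afterwards by paths.FilterEmptyPaths() in the free Deduplicate()
// helper below.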
void RemoveOverlaps(PathContainer &paths, GraphCoverageMap &coverage_map, size_t min_edge_len, size_t max_path_diff, - bool add_overlaps_begin, - bool cut_preudo_self_conjugate) const { - SimpleOverlapRemover remover(g_, coverage_map); - if (cut_preudo_self_conjugate) - remover.CutPseudoSelfConjugatePaths(paths); - - remover.CutNonUniqueSuffix(paths); - //writer.WritePathsToFASTA(paths, output_dir + "/before.fasta"); - //DEBUG("Removing subpaths"); - //delete not only eq, - remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, false, true, false, false, add_overlaps_begin); - //writer.WritePathsToFASTA(paths, output_dir + "/remove_similar.fasta"); - //DEBUG("Remove overlaps") - remover.RemoveOverlaps(paths); - //writer.WritePathsToFASTA(paths, output_dir + "/after_remove_overlaps.fasta"); - remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, true, false, false, false, add_overlaps_begin); - //writer.WritePathsToFASTA(paths, output_dir + "/remove_equal.fasta"); - //DEBUG("remove similar path. Max difference " << max_overlap); - remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, false, true, true, true, add_overlaps_begin); - DEBUG("end removing"); - } - - void RemoveMatePairEnds(PathContainer& paths, size_t min_edge_len) const { - DEBUG("remove mp ends"); - for (size_t i = 0; i < paths.size(); ++i) { - RemoveMatePairEnd(*paths.Get(i), min_edge_len); - RemoveMatePairEnd(*paths.GetConjugate(i), min_edge_len); - } + bool end_start_only, bool cut_all) const { + INFO("Removing overlaps"); + //VERIFY(min_edge_len == 0 && max_path_diff == 0); + if (!cut_all) { + INFO("Sorting paths"); + //sorting is currently needed to retain overlap instance in longest paths + paths.SortByLength(false); + } + + OverlapRemover overlap_remover(g_, paths, coverage_map, + min_edge_len, max_path_diff); + INFO("Marking overlaps"); + overlap_remover.MarkOverlaps(end_start_only, !cut_all); + + INFO("Splitting paths"); + PathSplitter splitter(overlap_remover.overlaps(), paths, coverage_map); + splitter.Split(); + //splits are invalidated after this point + + INFO("Deduplicating paths"); + Deduplicate(g_, paths, coverage_map, min_edge_len, max_path_diff); + INFO("Overlaps removed"); } void AddUncoveredEdges(PathContainer &paths, GraphCoverageMap &coverageMap) const { - std::set included; - for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) { - if (included.count(*iter) == 0 && !coverageMap.IsCovered(*iter)) { - BidirectionalPath* path = new BidirectionalPath(g_, *iter); - BidirectionalPath* conj = new BidirectionalPath(g_, g_.conjugate(*iter)); - path->Subscribe(&coverageMap); - conj->Subscribe(&coverageMap); - coverageMap.BackEdgeAdded(path->At(0), path, path->GapAt(0)); - coverageMap.BackEdgeAdded(conj->At(0), conj, conj->GapAt(0)); - paths.AddPair(path, conj); - included.insert(*iter); - included.insert(g_.conjugate(*iter)); + for (auto iter = g_.ConstEdgeBegin(true); !iter.IsEnd(); ++iter) { + EdgeId e = *iter; + if (!coverageMap.IsCovered(e)) { + AddPath(paths, BidirectionalPath(g_, e), coverageMap); } } } -private: - void RemoveMatePairEnd(BidirectionalPath& path, size_t min_edge_len) const { - int pos = int(path.Size()) - 1; - while (pos > 0 and g_.length(path.At(pos)) < min_edge_len) { - path.PopBack(); - pos--; - } - } protected: DECL_LOGGER("PEResolver") }; -} /* PE_RESOLVER_HPP_ */ - -#endif +} diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/pe_utils.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/pe_utils.hpp --- 
spades-3.10.1+dfsg/src/common/modules/path_extend/pe_utils.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/pe_utils.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -35,34 +35,19 @@ return false; } -inline bool InBuble(EdgeId e, const Graph& g) { - auto edges = g.OutgoingEdges(g.EdgeStart(e)); - auto endVertex = g.EdgeEnd(e); - for (auto it = edges.begin(); it != edges.end(); ++it) { - if ((g.EdgeEnd(*it) == endVertex) and (*it != e)) { - return true; - } - } - return false; -} - - // Handles all paths in PathContainer. // For each edge output all paths that _traverse_ this path. If path contains multiple instances - count them. Position of the edge is not reported. class GraphCoverageMap: public PathListener { - public: typedef BidirectionalPathMultiset MapDataT; - private: const Graph& g_; - std::unordered_map edge_coverage_; + std::unordered_map edge_coverage_; + const MapDataT empty_; - MapDataT * empty_; - - virtual void EdgeAdded(EdgeId e, BidirectionalPath * path, Gap /*gap*/) { + void EdgeAdded(EdgeId e, BidirectionalPath * path) { auto iter = edge_coverage_.find(e); if (iter == edge_coverage_.end()) { edge_coverage_.insert(std::make_pair(e, new MapDataT())); @@ -70,7 +55,7 @@ edge_coverage_[e]->insert(path); } - virtual void EdgeRemoved(EdgeId e, BidirectionalPath * path) { + void EdgeRemoved(EdgeId e, BidirectionalPath * path) { auto iter = edge_coverage_.find(e); if (iter != edge_coverage_.end()) { if (iter->second->count(path) == 0) { @@ -82,6 +67,14 @@ } } + void ProcessPath(BidirectionalPath * path, bool subscribe) { + if (subscribe) + path->Subscribe(this); + for (size_t i = 0; i < path->Size(); ++i) { + EdgeAdded(path->At(i), path); + } + } + size_t EdgeCount() const { size_t result = 0; for (auto e = g_.ConstEdgeBegin(); !e.IsEnd(); ++e) { @@ -91,54 +84,47 @@ } public: - GraphCoverageMap(const Graph& g) : g_(g), edge_coverage_() { - empty_ = new MapDataT(); + GraphCoverageMap(const GraphCoverageMap&) = delete; + GraphCoverageMap& operator=(const GraphCoverageMap&) = delete; + + GraphCoverageMap(GraphCoverageMap&&) = default; + GraphCoverageMap& operator=(GraphCoverageMap&&) = default; + + explicit GraphCoverageMap(const Graph& g) : g_(g) { + //FIXME heavy constructor edge_coverage_.reserve(EdgeCount()); } - GraphCoverageMap(const Graph& g, const PathContainer& paths, bool subscribe = false) : g_(g), edge_coverage_() { - empty_ = new MapDataT(); - edge_coverage_.reserve(EdgeCount()); + GraphCoverageMap(const Graph& g, const PathContainer& paths, bool subscribe = false) : + GraphCoverageMap(g) { AddPaths(paths, subscribe); } - virtual ~GraphCoverageMap() { - delete empty_; + ~GraphCoverageMap() { for (auto iter = edge_coverage_.begin(); iter != edge_coverage_.end(); ++iter) { delete iter->second; } } void AddPaths(const PathContainer& paths, bool subscribe = false) { - for (size_t i = 0; i < paths.size(); ++i) { - if (subscribe) - paths.Get(i)->Subscribe(this); - for (size_t j = 0; j < paths.Get(i)->Size(); ++j) { - EdgeAdded(paths.Get(i)->At(j), paths.Get(i), paths.Get(i)->GapAt(j)); - } - if (subscribe) - paths.GetConjugate(i)->Subscribe(this); - for (size_t j = 0; j < paths.GetConjugate(i)->Size(); ++j) { - EdgeAdded(paths.GetConjugate(i)->At(j), paths.GetConjugate(i), paths.GetConjugate(i)->GapAt(j)); - } + for (auto path_pair : paths) { + ProcessPath(path_pair.first, subscribe); + ProcessPath(path_pair.second, subscribe); } } void Subscribe(BidirectionalPath * path) { - path->Subscribe(this); - for (size_t i = 0; i < path->Size(); 
++i) { - BackEdgeAdded(path->At(i), path, path->GapAt(i)); - } + ProcessPath(path, true); } //Inherited from PathListener - void FrontEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) override { - EdgeAdded(e, path, gap); + void FrontEdgeAdded(EdgeId e, BidirectionalPath * path, const Gap&) override { + EdgeAdded(e, path); } //Inherited from PathListener - void BackEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) override { - EdgeAdded(e, path, gap); + void BackEdgeAdded(EdgeId e, BidirectionalPath * path, const Gap&) override { + EdgeAdded(e, path); } //Inherited from PathListener @@ -151,12 +137,12 @@ EdgeRemoved(e, path); } - MapDataT * GetEdgePaths(EdgeId e) const { + const MapDataT * GetEdgePaths(EdgeId e) const { auto iter = edge_coverage_.find(e); if (iter != edge_coverage_.end()) { return iter->second; } - return empty_; + return &empty_; } int GetCoverage(EdgeId e) const { @@ -189,33 +175,6 @@ return edge_coverage_.end(); } - // DEBUG output - void PrintUncovered() const { - DEBUG("Uncovered edges"); - int s = 0; - for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) { - if (!IsCovered(*iter)) { - DEBUG(g_.int_id(*iter) << " (" << g_.length(*iter) << ") ~ " << g_.int_id(g_.conjugate(*iter)) << " (" << g_.length(g_.conjugate(*iter)) << ")"); - s += 1; - } - } - DEBUG("Uncovered edges " << s / 2); - } - - void PrintMulticovered() const { - DEBUG("Multicovered edges"); - for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) { - auto paths = GetCoveringPaths(*iter); - if (paths.size() > 1 && g_.length(*iter) > 1000) { - DEBUG(g_.int_id(*iter) << " (" << g_.length(*iter) << "). " << " Covered: " << paths.size()); - for (auto path = paths.begin(); path != paths.end(); ++path) { - (*path)->Print(); - } - DEBUG("====="); - } - } - } - size_t size() const { return edge_coverage_.size(); } @@ -224,31 +183,59 @@ return g_; } -private: - GraphCoverageMap(const GraphCoverageMap& t) : g_(t.g_), empty_(t.empty_) {} }; -inline bool GetLoopAndExit(const Graph& g, EdgeId e, pair& result) { + +class PathContainerCoverageSwitcher { + const Graph& g_; + + const SSCoverageStorage& coverage_storage_; + + bool antisense_; + + double CalculateCoverage(const BidirectionalPath& p, bool reverse) const { + double res = 0.0; + double len = 0; + for(auto e : p) { + res += coverage_storage_.GetCoverage(e, reverse) * double(g_.length(e)); + len += (double) g_.length(e); + } + return res / len; + } + +public: + PathContainerCoverageSwitcher(const Graph& g, const SSCoverageStorage& coverage_storage, bool antisense): + g_(g), coverage_storage_(coverage_storage), antisense_(antisense) {} + + + void Apply(PathContainer& paths) const { + for (size_t i = 0; i < paths.size(); ++i) { + if (math::ls(CalculateCoverage(*paths.Get(i), antisense_), CalculateCoverage(*paths.GetConjugate(i), antisense_))) { + paths.Swap(i); + } + } + } +}; + +//result -- first edge is loop's back edge, second is loop exit edge +inline bool GetLoopAndExit(const Graph& g, EdgeId e, EdgeId& back_cycle_edge, EdgeId& loop_exit) { VertexId v = g.EdgeEnd(e); VertexId start = g.EdgeStart(e); if (g.OutgoingEdgeCount(v) != 2 || g.IncomingEdgeCount(v) != 1 || g.OutgoingEdgeCount(start) != 1 || g.IncomingEdgeCount(start) != 2) { return false; } - EdgeId loop; - EdgeId exit; bool loop_found = false; bool exit_found = false; auto edges = g.OutgoingEdges(v); for (auto edge = edges.begin(); edge != edges.end(); ++edge) { if (g.EdgeEnd(*edge) == g.EdgeStart(e) && *edge != e) { - loop = *edge; + back_cycle_edge = *edge; loop_found = 
true; } else if (*edge != e) { - exit = *edge; + loop_exit = *edge; exit_found = true; } } - result = make_pair(loop, exit); return exit_found && loop_found; } @@ -373,8 +360,9 @@ } inline bool LoopDetector::EdgeInShortLoop(EdgeId e) const { - pair temp; - return GetLoopAndExit(path_->graph(), e, temp); + EdgeId back_cycle_edge; + EdgeId loop_exit; + return GetLoopAndExit(path_->graph(), e, back_cycle_edge, loop_exit); } inline bool LoopDetector::PrevEdgeInShortLoop() const { diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/pipeline/extenders_logic.cpp spades-3.11.1+dfsg/src/common/modules/path_extend/pipeline/extenders_logic.cpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/pipeline/extenders_logic.cpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/pipeline/extenders_logic.cpp 2017-09-28 09:05:10.000000000 +0000 @@ -31,13 +31,14 @@ resolvable_repeat_length_bound = std::max(resolvable_repeat_length_bound, lib.data().read_length); } INFO("resolvable_repeat_length_bound set to " << resolvable_repeat_length_bound); - + bool investigate_short_loop = lib.is_contig_lib() || lib.is_long_read_lib() || support_.UseCoverageResolverForSingleReads(lib.type()); auto long_read_ec = MakeLongReadsExtensionChooser(lib_index, read_paths_cov_map); return make_shared(gp_, cover_map_, + used_unique_storage_, long_read_ec, resolvable_repeat_length_bound, - true, /* investigate short loops */ + investigate_short_loop, /* investigate short loops */ support_.UseCoverageResolverForSingleReads(lib.type())); } @@ -58,35 +59,36 @@ opts.priority_coeff); return make_shared(gp_, cover_map_, + used_unique_storage_, extension, paired_lib->GetISMax(), investigate_loops, - false /*use short loop coverage resolver*/); + false /*use short loop coverage resolver*/, + opts.weight_threshold); } -shared_ptr ExtendersGenerator::MakeGapJoiners(double is_variation) const { +shared_ptr ExtendersGenerator::MakeGapAnalyzer(double is_variation) const { const auto &pset = params_.pset; - vector> joiners; + vector> joiners; if (params_.pset.scaffolder_options.use_la_gap_joiner) - joiners.push_back(std::make_shared(gp_.g, pset.scaffolder_options.min_overlap_length, + joiners.push_back(std::make_shared(gp_.g, pset.scaffolder_options.min_overlap_length, pset.scaffolder_options.flank_multiplication_coefficient, pset.scaffolder_options.flank_addition_coefficient)); - joiners.push_back(std::make_shared(gp_.g, + joiners.push_back(std::make_shared(gp_.g, pset.scaffolder_options.min_gap_score, pset.scaffolder_options.short_overlap, (int) pset.scaffolder_options.basic_overlap_coeff * dataset_info_.RL())); - return std::make_shared(gp_.g, + //todo introduce explicit must_overlap_coeff and rename max_can_overlap -> can_overlap_coeff + return std::make_shared(gp_.g, joiners, - size_t(pset.scaffolder_options.max_can_overlap - * (double) gp_.g.k()), /* may overlap threshold */ - int(math::round(double(gp_.g.k()) - - pset.scaffolder_options.var_coeff - * is_variation)), /* must overlap threshold */ + size_t(math::round(pset.scaffolder_options.max_can_overlap + * is_variation)), /* may overlap threshold */ + int(math::round(-pset.scaffolder_options.var_coeff * is_variation)), /* must overlap threshold */ pset.scaffolder_options.artificial_gap); } @@ -103,8 +105,9 @@ pset.scaffolder_options.cl_threshold, pset.scaffolder_options.var_coeff); - return make_shared(gp_, cover_map_, scaff_chooser, - MakeGapJoiners(paired_lib->GetIsVar()), + return make_shared(gp_, cover_map_, + used_unique_storage_, 
scaff_chooser, + MakeGapAnalyzer(paired_lib->GetIsVar()), paired_lib->GetISMax(), false, /* investigate short loops */ params_.avoid_rc_connections); @@ -130,17 +133,17 @@ VERIFY(pset.scaffolder_options.min_overlap_for_rna_scaffolding.is_initialized()); return make_shared(gp_, cover_map_, + used_unique_storage_, scaff_chooser, scaff_chooser2, - MakeGapJoiners(paired_lib->GetIsVar()), + MakeGapAnalyzer(paired_lib->GetIsVar()), paired_lib->GetISMax(), false /* investigate short loops */, *pset.scaffolder_options.min_overlap_for_rna_scaffolding); } -shared_ptr ExtendersGenerator::MakeMatePairScaffoldingExtender( - size_t lib_index, - const ScaffoldingUniqueEdgeStorage &storage) const { +shared_ptr ExtendersGenerator::MakeMatePairScaffoldingExtender(size_t lib_index, + const ScaffoldingUniqueEdgeStorage &storage) const { const auto &lib = dataset_info_.reads[lib_index]; const auto &pset = params_.pset; @@ -173,8 +176,9 @@ <= params_.pset.scaffolding2015.graph_connectivity_max_edges); return make_shared(gp_, cover_map_, + used_unique_storage_, scaff_chooser, - MakeGapJoiners(paired_lib->GetIsVar()), + MakeGapAnalyzer(paired_lib->GetIsVar()), paired_lib->GetISMax(), false, /* investigate short loops */ params_.avoid_rc_connections, @@ -204,10 +208,11 @@ auto chooser = make_shared(gp_.g, permissive_pi_chooser, coord_cov_chooser); - return make_shared(gp_, cover_map_, chooser, + return make_shared(gp_, cover_map_, used_unique_storage_, chooser, -1ul /* insert size is needed only for loop detection, which is not needed in this case */, false, /* investigate short loops */ - false /*use short loop coverage resolver*/); + false /*use short loop coverage resolver*/, + params_.pset.extension_options.weight_threshold); } shared_ptr ExtendersGenerator::MakeRNAExtender(size_t lib_index, bool investigate_loops) const { @@ -229,12 +234,31 @@ opts.priority_coeff); return make_shared(gp_, cover_map_, + used_unique_storage_, extension, paired_lib->GetISMax(), investigate_loops, - false /*use short loop coverage resolver*/); + false /*use short loop coverage resolver*/, + opts.weight_threshold); } + +shared_ptr ExtendersGenerator::MakeSimpleCoverageExtender(size_t lib_index) const { + + auto extension = + make_shared(gp_.ss_coverage[lib_index], gp_.g, + params_.pset.simple_coverage_resolver.coverage_delta, + params_.pset.simple_coverage_resolver.min_upper_coverage); + + return make_shared(gp_, cover_map_, + used_unique_storage_, + extension, + 1000, /*insert size for cycle detection*/ + false /*investigate short loops*/, + false /*use short loop coverage resolver*/); +} + + shared_ptr ExtendersGenerator::MakePEExtender(size_t lib_index, bool investigate_loops) const { const auto &lib = dataset_info_.reads[lib_index]; shared_ptr paired_lib = MakeNewLib(gp_.g, lib, gp_.clustered_indices[lib_index]); @@ -261,12 +285,26 @@ opts.priority_coeff); return make_shared(gp_, cover_map_, + used_unique_storage_, extension_chooser, paired_lib->GetISMax(), investigate_loops, - false /*use short loop coverage resolver*/); + false /*use short loop coverage resolver*/, + opts.weight_threshold); } +//FIXME do we need ExtenderTriplets story here? 
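// [Editor's note] The FIXME above concerns the ExtenderTriplets machinery used in
// MakeMPExtenders further below: each extender is tagged with its library type and
// index, the triplets are stably sorted, and the extenders are extracted in that
// deterministic order. A minimal self-contained sketch of the idea (all names below
// are hypothetical stand-ins, not the SPAdes API):
//
//   #include <algorithm>
//   #include <cstddef>
//   #include <memory>
//   #include <vector>
//
//   struct ExtenderStub {};
//
//   struct Triplet {
//       int lib_type;                           // smaller value = higher priority
//       std::size_t lib_index;                  // tie-breaker between libraries
//       std::shared_ptr<ExtenderStub> extender;
//       bool operator<(const Triplet &other) const {
//           return lib_type != other.lib_type ? lib_type < other.lib_type
//                                             : lib_index < other.lib_index;
//       }
//   };
//
//   std::vector<std::shared_ptr<ExtenderStub>> Extract(std::vector<Triplet> triplets) {
//       std::stable_sort(triplets.begin(), triplets.end());
//       std::vector<std::shared_ptr<ExtenderStub>> result;
//       for (auto &t : triplets)
//           result.push_back(std::move(t.extender));
//       return result;
//   }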
+//FIXME integrate with MakeBasicExtenders +Extenders ExtendersGenerator::MakePEExtenders() const { + Extenders result; + for (size_t lib_index = 0; lib_index < dataset_info_.reads.lib_count(); ++lib_index) { + const auto &lib = dataset_info_.reads[lib_index]; + if (support_.IsForPEExtender(lib)) { + result.push_back(MakePEExtender(lib_index, false)); + } + } + return result; +} void ExtendersGenerator::PrintExtenders(const Extenders &extenders) const { DEBUG("Extenders in vector:"); @@ -274,12 +312,12 @@ //TODO: use polymorphism instead of RTTI auto ext_ptr = extender.get(); DEBUG("Extender #i" << typeid(*ext_ptr).name()); - if (instanceof(ext_ptr)) { + if (utils::instanceof(ext_ptr)) { auto ec = ((SimpleExtender *) ext_ptr)->GetExtensionChooser(); auto ec_ptr = ec.get(); DEBUG(" Extender #i" << typeid(*ec_ptr).name()); } - else if (instanceof(ext_ptr)) { + else if (utils::instanceof(ext_ptr)) { auto ec = ((ScaffoldingPathExtender *) ext_ptr)->GetExtensionChooser(); auto ec_ptr = ec.get(); DEBUG(" Extender #i" << typeid(*ec_ptr).name()); @@ -287,6 +325,16 @@ } } +Extenders ExtendersGenerator::MakeMPExtenders() const { + Extenders extenders = MakeMPExtenders(unique_data_.main_unique_storage_); + INFO("Using " << extenders.size() << " mate-pair " << support_.LibStr(extenders.size())); + + for (const auto& unique_storage : unique_data_.unique_storages_) { + utils::push_back_all(extenders, MakeMPExtenders(unique_storage)); + } + return extenders; +} + Extenders ExtendersGenerator::MakeMPExtenders(const ScaffoldingUniqueEdgeStorage &storage) const { ExtenderTriplets result; @@ -294,7 +342,8 @@ const auto &lib = dataset_info_.reads[lib_index]; if (lib.is_mate_pair()) { - result.emplace_back(lib.type(), lib_index, MakeMatePairScaffoldingExtender(lib_index, storage)); + result.emplace_back(lib.type(), lib_index, + MakeMatePairScaffoldingExtender(lib_index, storage)); } } std::stable_sort(result.begin(), result.end()); @@ -302,8 +351,7 @@ return ExtractExtenders(result); } -Extenders ExtendersGenerator::MakePBScaffoldingExtenders(const ScaffoldingUniqueEdgeStorage &unique_storage_pb, - const vector> &long_reads_cov_map) const { +Extenders ExtendersGenerator::MakePBScaffoldingExtenders() const { const auto &pset = params_.pset; ExtenderTriplets result; @@ -312,20 +360,22 @@ INFO("Creating scaffolding extender for lib " << lib_index); shared_ptr condition = make_shared(gp_.g, lib_index, 2, - *long_reads_cov_map[lib_index]); + unique_data_.long_reads_cov_map_[lib_index]); auto scaff_chooser = std::make_shared(gp_.g, nullptr, condition, - unique_storage_pb, + unique_data_.unique_pb_storage_, pset.scaffolder_options.cl_threshold, pset.scaffolder_options.var_coeff, pset.scaffolding2015.relative_weight_cutoff); result.emplace_back(dataset_info_.reads[lib_index].type(), lib_index, + //FIXME are utilized constants reasonable? make_shared(gp_, cover_map_, + used_unique_storage_, scaff_chooser, - MakeGapJoiners(1000), /* "IS vatiation" */ + MakeGapAnalyzer(1000), /* "IS variation" */ 10000, /* insert size */ false, /* investigate short loops */ params_.avoid_rc_connections, @@ -349,8 +399,7 @@ return result; } -Extenders ExtendersGenerator::MakeBasicExtenders(const ScaffoldingUniqueEdgeStorage &storage, - const vector> &long_reads_cov_map) const { +Extenders ExtendersGenerator::MakeBasicExtenders() const { ExtenderTriplets basic_extenders; ExtenderTriplets loop_resolving_extenders; ExtenderTriplets scaffolding_extenders; @@ -366,7 +415,9 @@ //TODO: scaff2015 does not need any single read libs? 
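// [Editor's note] PrintExtenders above checks extender types via utils::instanceof.
// Presumably this is a thin dynamic_cast wrapper along the following lines (an
// assumption for illustration, not the actual SPAdes utils implementation):
//
//   template<class Derived, class Base>
//   bool instanceof(const Base *ptr) {
//       return dynamic_cast<const Derived *>(ptr) != nullptr;
//   }
//
// With that definition, instanceof<SimpleExtender>(ext_ptr) is true exactly when
// ext_ptr points to a SimpleExtender or a class derived from it, matching the
// RTTI-based dispatch used above.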
if (support_.IsForSingleReadExtender(lib)) { - basic_extenders.emplace_back(lib.type(), lib_index, MakeLongReadsExtender(lib_index, *long_reads_cov_map[lib_index])); + basic_extenders.emplace_back(lib.type(), lib_index, + MakeLongReadsExtender(lib_index, + unique_data_.long_reads_cov_map_[lib_index])); ++single_read_libs; } if (support_.IsForPEExtender(lib)) { @@ -376,12 +427,16 @@ basic_extenders.emplace_back(lib.type(), lib_index, MakeLongEdgePEExtender(lib_index, false)); } else if (pset.multi_path_extend) { basic_extenders.emplace_back(lib.type(), lib_index, MakePEExtender(lib_index, false)); + if (pset.simple_coverage_resolver.enabled) + basic_extenders.emplace_back(lib.type(), lib_index, MakeSimpleCoverageExtender(lib_index)); basic_extenders.emplace_back(lib.type(), lib_index, MakeRNAExtender(lib_index, false)); } else { basic_extenders.emplace_back(lib.type(), lib_index, MakePEExtender(lib_index, false)); } } else if (pset.sm == sm_2015) { - basic_extenders.emplace_back(lib.type(), lib_index, MakeMatePairScaffoldingExtender(lib_index, storage)); + basic_extenders.emplace_back(lib.type(), lib_index, + MakeMatePairScaffoldingExtender(lib_index, + unique_data_.main_unique_storage_)); } } //TODO logic is very cryptic! @@ -397,7 +452,9 @@ } else { scaffolding_extenders.emplace_back(lib.type(), lib_index, MakeScaffoldingExtender(lib_index)); if (pset.sm == sm_combined) { - scaffolding_extenders.emplace_back(lib.type(), lib_index, MakeMatePairScaffoldingExtender(lib_index, storage)); + scaffolding_extenders.emplace_back(lib.type(), lib_index, + MakeMatePairScaffoldingExtender(lib_index, + unique_data_.main_unique_storage_)); } } } @@ -408,9 +465,9 @@ std::stable_sort(loop_resolving_extenders.begin(), loop_resolving_extenders.end()); Extenders result; - push_back_all(result, ExtractExtenders(basic_extenders)); - push_back_all(result, ExtractExtenders(scaffolding_extenders)); - push_back_all(result, ExtractExtenders(loop_resolving_extenders)); + utils::push_back_all(result, ExtractExtenders(basic_extenders)); + utils::push_back_all(result, ExtractExtenders(scaffolding_extenders)); + utils::push_back_all(result, ExtractExtenders(loop_resolving_extenders)); INFO("Using " << pe_libs << " paired-end " << support_.LibStr(pe_libs)); INFO("Using " << scf_pe_libs << " paired-end scaffolding " << support_.LibStr(scf_pe_libs)); diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/pipeline/extenders_logic.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/pipeline/extenders_logic.hpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/pipeline/extenders_logic.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/pipeline/extenders_logic.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -46,6 +46,8 @@ const conj_graph_pack &gp_; const GraphCoverageMap &cover_map_; + const UniqueData &unique_data_; + UsedUniqueStorage &used_unique_storage_; const PELaunchSupport &support_; @@ -54,25 +56,33 @@ const PathExtendParamsContainer ¶ms, const conj_graph_pack &gp, const GraphCoverageMap &cover_map, + const UniqueData &unique_data, + UsedUniqueStorage &used_unique_storage, const PELaunchSupport& support) : dataset_info_(dataset_info), params_(params), gp_(gp), cover_map_(cover_map), + unique_data_(unique_data), + used_unique_storage_(used_unique_storage), support_(support) { } - Extenders MakePBScaffoldingExtenders(const ScaffoldingUniqueEdgeStorage &unique_storage_pb, - const vector> &long_reads_cov_map) const; + Extenders MakePBScaffoldingExtenders() const; - 
Extenders MakeBasicExtenders(const ScaffoldingUniqueEdgeStorage &storage, - const vector> &long_reads_cov_map) const; + Extenders MakeBasicExtenders() const; - Extenders MakeMPExtenders(const ScaffoldingUniqueEdgeStorage &storage) const; + Extenders MakeMPExtenders() const; Extenders MakeCoverageExtenders() const; + Extenders MakePEExtenders() const; + private: + shared_ptr MakePEExtender(size_t lib_index, bool investigate_loops) const; + + Extenders MakeMPExtenders(const ScaffoldingUniqueEdgeStorage &storage) const; + shared_ptr MakeLongReadsExtensionChooser(size_t lib_index, const GraphCoverageMap& read_paths_cov_map) const; shared_ptr MakeLongReadsExtender(size_t lib_index, const GraphCoverageMap& read_paths_cov_map) const; @@ -80,36 +90,20 @@ shared_ptr MakeLongEdgePEExtender(size_t lib_index, bool investigate_loops) const; - shared_ptr MakeMetaWeightCounter(shared_ptr lib, - size_t read_length) const; - - shared_ptr MakeMetaExtensionChooser(shared_ptr lib, - size_t read_length) const; - - shared_ptr MakeMetaExtender(size_t lib_index, bool investigate_loops) const; - - - shared_ptr MakePEExtender(size_t lib_index, bool investigate_loops) const; - - - shared_ptr MakeGapJoiners(double is_variation) const; - + shared_ptr MakeGapAnalyzer(double is_variation) const; shared_ptr MakeScaffoldingExtender(size_t lib_index) const; - shared_ptr MakeRNAScaffoldingExtender(size_t lib_index) const; - shared_ptr MakeMatePairScaffoldingExtender (size_t lib_index, const ScaffoldingUniqueEdgeStorage &storage) const; - shared_ptr MakeCoordCoverageExtender(size_t lib_index) const; - shared_ptr MakeRNAExtender(size_t lib_index, bool investigate_loops) const; + shared_ptr MakeSimpleCoverageExtender(size_t lib_index) const; void PrintExtenders(const vector> &extenders) const; diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/pipeline/launcher.cpp spades-3.11.1+dfsg/src/common/modules/path_extend/pipeline/launcher.cpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/pipeline/launcher.cpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/pipeline/launcher.cpp 2017-09-28 09:05:10.000000000 +0000 @@ -6,6 +6,7 @@ #include "modules/path_extend/path_visualizer.hpp" #include "modules/path_extend/loop_traverser.hpp" +#include "modules/path_extend/path_extender.hpp" #include "modules/alignment/long_read_storage.hpp" #include "modules/path_extend/scaffolder2015/extension_chooser2015.hpp" #include "modules/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp" @@ -20,7 +21,6 @@ using namespace debruijn_graph; using namespace std; - vector> PathExtendLauncher::ConstructPairedConnectionConditions(const ScaffoldingUniqueEdgeStorage& edge_storage) const { @@ -39,7 +39,7 @@ INFO("Unusable for scaffold graph paired lib #" << lib_index); continue; } - conditions.push_back(make_shared(gp_.g, edge_storage.GetSet(), + conditions.push_back(make_shared(gp_.g, edge_storage.unique_edges(), paired_lib, lib_index, params.always_add, params.never_add, @@ -55,7 +55,7 @@ const pe_config::ParamSetT::ScaffoldGraphParamsT ¶ms = params_.pset.scaffold_graph_params; INFO("Constructing connections"); - LengthLowerBound edge_condition(gp_.g, edge_storage.GetMinLength()); + LengthLowerBound edge_condition(gp_.g, edge_storage.min_length()); vector> conditions = ConstructPairedConnectionConditions(edge_storage); @@ -68,9 +68,9 @@ INFO("Total conditions " << conditions.size()); - INFO("Constructing scaffold graph from set of size " << edge_storage.GetSet().size()); + INFO("Constructing scaffold 
graph from set of size " << edge_storage.unique_edges().size()); - DefaultScaffoldGraphConstructor constructor(gp_.g, edge_storage.GetSet(), conditions, edge_condition); + DefaultScaffoldGraphConstructor constructor(gp_.g, edge_storage.unique_edges(), conditions, edge_condition); auto scaffold_graph = constructor.Construct(); INFO("Scaffold graph contains " << scaffold_graph->VertexCount() << " vertices and " << scaffold_graph->EdgeCount() @@ -84,27 +84,12 @@ const string &filename) const { using namespace scaffold_graph; - INFO("Constructing reference labels"); - map edge_labels; - size_t count = 0; - for (const auto &edge_coord_pair: genome_checker.ConstructEdgeOrder()) { - if (edge_labels.find(edge_coord_pair.first) == edge_labels.end()) { - edge_labels[edge_coord_pair.first] = ""; - } - edge_labels[edge_coord_pair.first] += "order: " + ToString(count) + - "\n mapped range: " + ToString(edge_coord_pair.second.mapped_range.start_pos) + " : " - + ToString(edge_coord_pair.second.mapped_range.end_pos) + - "\n init range: " + ToString(edge_coord_pair.second.initial_range.start_pos) + " : " - + ToString(edge_coord_pair.second.initial_range.end_pos) + "\n"; - ++count; - } - auto vertex_colorer = make_shared(main_edge_set); auto edge_colorer = make_shared(); graph_colorer::CompositeGraphColorer colorer(vertex_colorer, edge_colorer); INFO("Visualizing scaffold graph"); - ScaffoldGraphVisualizer singleVisualizer(scaffold_graph, edge_labels); + ScaffoldGraphVisualizer singleVisualizer(scaffold_graph, genome_checker.EdgeLabels()); std::ofstream single_dot; single_dot.open((filename + "_single.dot").c_str()); singleVisualizer.Visualize(single_dot, colorer); @@ -122,13 +107,17 @@ //Scaffold graph shared_ptr scaffold_graph; if (params_.pset.scaffold_graph_params.construct) { - debruijn_graph::GenomeConsistenceChecker genome_checker(gp_, unique_data_.main_unique_storage_, + debruijn_graph::GenomeConsistenceChecker genome_checker(gp_, params_.pset.genome_consistency_checker.max_gap, - params_.pset.genome_consistency_checker.relative_max_gap); + params_.pset.genome_consistency_checker.relative_max_gap, + unique_data_.main_unique_storage_.min_length(), + unique_data_.main_unique_storage_, + unique_data_.long_reads_cov_map_, + dataset_info_.reads); scaffold_graph = ConstructScaffoldGraph(unique_data_.main_unique_storage_); if (params_.pset.scaffold_graph_params.output) { PrintScaffoldGraph(*scaffold_graph, - unique_data_.main_unique_storage_.GetSet(), + unique_data_.main_unique_storage_.unique_edges(), genome_checker, params_.etc_dir + "scaffold_graph"); } @@ -138,26 +127,38 @@ void PathExtendLauncher::CountMisassembliesWithReference(const PathContainer &paths) const { if (gp_.genome.size() == 0) return; - - debruijn_graph::GenomeConsistenceChecker genome_checker(gp_, unique_data_.main_unique_storage_, + bool use_main_storage = params_.pset.genome_consistency_checker.use_main_storage; + size_t unresolvable_gap = unique_data_.main_unique_storage_.min_length(); + ScaffoldingUniqueEdgeStorage tmp_storage; + if (!use_main_storage) { + unresolvable_gap = params_.pset.genome_consistency_checker.unresolvable_jump; + ScaffoldingUniqueEdgeAnalyzer tmp_analyzer(gp_, params_.pset.genome_consistency_checker.unique_length, unique_data_.unique_variation_); + tmp_analyzer.FillUniqueEdgeStorage(tmp_storage); + } + debruijn_graph::GenomeConsistenceChecker genome_checker(gp_, params_.pset.genome_consistency_checker.max_gap, - params_.pset.genome_consistency_checker.relative_max_gap); + 
                                                            params_.pset.genome_consistency_checker.relative_max_gap,
+                                                            unresolvable_gap,
+                                                            use_main_storage ? unique_data_.main_unique_storage_ : tmp_storage,
+                                                            unique_data_.long_reads_cov_map_,
+                                                            dataset_info_.reads);
     size_t total_mis = 0, gap_mis = 0;
-    genome_checker.SpellGenome();
     for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
         BidirectionalPath *path = iter.get();
         auto map_res = genome_checker.CountMisassemblies(*path);
         if (map_res.misassemblies > 0) {
             INFO ("there are " << map_res.misassemblies << " misassemblies in path: ");
-            path->PrintInfo();
+            path->PrintINFO();
             total_mis += map_res.misassemblies;
         }
         if (map_res.wrong_gap_size > 0) {
-            INFO ("there are " << map_res.wrong_gap_size << " wrong gaps in path: ");
-            path->PrintInfo();
+            INFO ("there are " << map_res.wrong_gap_size << " wrong gaps in path. ");
+            path->PrintDEBUG();
             gap_mis += map_res.wrong_gap_size;
         }
+        genome_checker.CheckPathEnd(*path);
+        genome_checker.CheckPathEnd(path->Conjugate());
     }
     INFO ("In total found " << total_mis << " misassemblies " << " and " << gap_mis << " gaps.");
 }
@@ -199,49 +200,87 @@
     if (!params_.pe_cfg.debug_output) {
         return;
     }
-    PathInfoWriter path_writer;
     PathVisualizer visualizer;
-    writer_.OutputPaths(paths, params_.etc_dir + name);
+    writer_.OutputPaths(paths, params_.etc_dir + name + ".fasta");
     if (params_.pe_cfg.output.write_paths) {
-        path_writer.WritePaths(paths, params_.etc_dir + name + ".dat");
+        std::ofstream oss(params_.etc_dir + name + ".dat");
+        for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
+            iter.get()->Print(oss);
+        }
+        oss.close();
     }
     if (params_.pe_cfg.viz.print_paths) {
         visualizer.writeGraphWithPathsSimple(gp_, params_.etc_dir + name + ".dot", name, paths);
     }
 }
-void PathExtendLauncher::FinalizePaths(PathContainer &paths,
-                                       GraphCoverageMap &cover_map,
-                                       const PathExtendResolver &resolver) const {
+void FilterInterstandBulges(PathContainer &paths) {
+    DEBUG("Trying to delete paths with interstrand bulges");
+    for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
+        if (EndsWithInterstrandBulge(*iter.get())) {
+            iter.get()->PopBack();
+        }
+        if (EndsWithInterstrandBulge(*iter.getConjugate())) {
+            iter.getConjugate()->PopBack();
+        }
+    }
+    DEBUG("Deleted paths with interstrand bulges");
+}
-    if (params_.pset.remove_overlaps) {
+void PathExtendLauncher::RemoveOverlapsAndArtifacts(PathContainer &paths,
+                                                    GraphCoverageMap &cover_map,
+                                                    const PathExtendResolver &resolver) const {
+    INFO("Finalizing paths");
+
+    INFO("Deduplicating paths");
+    Deduplicate(gp_.g, paths, cover_map, params_.min_edge_len,
+                params_.max_path_diff);
+
+    INFO("Paths deduplicated");
+
+    if (params_.pset.overlap_removal.enabled) {
         resolver.RemoveOverlaps(paths, cover_map, params_.min_edge_len, params_.max_path_diff,
-                                params_.pset.cut_all_overlaps,
-                                (params_.mode == config::pipeline_type::moleculo));
-    } else if (params_.mode == config::pipeline_type::rna) {
-        resolver.RemoveRNAOverlaps(paths, cover_map, params_.min_edge_len, params_.max_path_diff);
+                                params_.pset.overlap_removal.end_start_only,
+                                params_.pset.overlap_removal.cut_all);
     } else {
-        resolver.RemoveEqualPaths(paths, cover_map, params_.min_edge_len);
+        INFO("Overlaps will not be removed");
     }
+    //TODO do we still need it?
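// [Editor's note] GetLengthCutoff, defined below, maps both the absolute and the
// read-length-relative cutoffs into contig-length space by subtracting K, then
// takes the larger value, clamped at zero. A worked example with assumed values
// (RL = 100, K = 55, rel_cutoff = 2.0, abs_cutoff = 300 -- illustrative only):
//
//   rel_len = int(2.0 * 100) - 55 = 145
//   abs_len = 300 - 55            = 245
//   result  = max(0, max(145, 245)) = 245
//
// so here the absolute cutoff dominates, and any negative intermediate value
// would be clamped to 0.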
if (params_.avoid_rc_connections) { - paths.FilterInterstandBulges(); + FilterInterstandBulges(paths); } - paths.FilterEmptyPaths(); resolver.AddUncoveredEdges(paths, cover_map); - if (params_.pset.path_filtration.enabled) { - LengthPathFilter(gp_.g, params_.pset.path_filtration.min_length).filter(paths);; - IsolatedPathFilter(gp_.g, - params_.pset.path_filtration.min_length_for_low_covered, - params_.pset.path_filtration.min_coverage).filter(paths); - IsolatedPathFilter(gp_.g, params_.pset.path_filtration.isolated_min_length).filter(paths); - } paths.SortByLength(); - for (auto &path : paths) { - path.first->ResetOverlaps(); + INFO("Paths finalized"); +} + + +void PathExtendLauncher::CleanPaths(PathContainer &paths, const pe_config::ParamSetT::PathFiltrationT &path_filtration) const { + if (path_filtration.enabled) { + paths.FilterPaths(LengthPathCondition(GetLengthCutoff(path_filtration.min_length, path_filtration.rel_cutoff))); + paths.FilterPaths(func::And(CoveragePathCondition(gp_.g, path_filtration.min_coverage), + LengthPathCondition(GetLengthCutoff(path_filtration.min_length_for_low_covered, path_filtration.rel_low_covered_cutoff)))); + paths.FilterPaths(func::And(IsolatedPathCondition(gp_.g), + func::And(LengthPathCondition(GetLengthCutoff(path_filtration.isolated_min_length, path_filtration.rel_isolated_cutoff)), + CoveragePathCondition(gp_.g, path_filtration.isolated_min_cov)))); } + + paths.SortByLength(); +} + + +size_t PathExtendLauncher::GetLengthCutoff(size_t abs_cutoff, double rel_cutoff) const { + int rel_len = int(rel_cutoff * double(cfg::get().ds.RL())) - int(cfg::get().K); + int abs_len = int(abs_cutoff) - int(cfg::get().K); + size_t result = (size_t) max(0, max(rel_len, abs_len)); + + INFO("Read length relative cutoff " << rel_cutoff << " converted to " << rel_len); + INFO("Read length absolute cutoff " << abs_cutoff << " bp converted to " << result); + INFO("Length cutoff: " << result); + return result; } void PathExtendLauncher::TraverseLoops(PathContainer &paths, GraphCoverageMap &cover_map) const { @@ -258,34 +297,30 @@ INFO("Traversed " << res << " loops"); } -Extenders PathExtendLauncher::ConstructMPExtender(const ExtendersGenerator &generator, size_t uniqe_edge_len) { - ScaffoldingUniqueEdgeAnalyzer additional_edge_analyzer(gp_, (size_t) uniqe_edge_len, unique_data_.unique_variation_); - unique_data_.unique_storages_.push_back(make_shared()); - additional_edge_analyzer.FillUniqueEdgeStorage(*unique_data_.unique_storages_.back()); - - return generator.MakeMPExtenders(*unique_data_.unique_storages_.back()); +void PathExtendLauncher::AddScaffUniqueStorage(size_t uniqe_edge_len) { + ScaffoldingUniqueEdgeAnalyzer additional_edge_analyzer(gp_, (size_t) uniqe_edge_len, + unique_data_.unique_variation_); + unique_data_.unique_storages_.push_back(ScaffoldingUniqueEdgeStorage()); + additional_edge_analyzer.FillUniqueEdgeStorage(unique_data_.unique_storages_.back()); } Extenders PathExtendLauncher::ConstructMPExtenders(const ExtendersGenerator &generator) { const pe_config::ParamSetT &pset = params_.pset; - Extenders extenders = generator.MakeMPExtenders(unique_data_.main_unique_storage_); - INFO("Using " << extenders.size() << " mate-pair " << support_.LibStr(extenders.size())); - size_t cur_length = unique_data_.min_unique_length_ - pset.scaffolding2015.unique_length_step; size_t lower_bound = max(pset.scaffolding2015.unique_length_lower_bound, pset.scaffolding2015.unique_length_step); while (cur_length > lower_bound) { - INFO("Adding extender with length " << 
cur_length); - push_back_all(extenders, ConstructMPExtender(generator, cur_length)); + INFO("Will add extenders for length " << cur_length); + AddScaffUniqueStorage(cur_length); cur_length -= pset.scaffolding2015.unique_length_step; } if (unique_data_.min_unique_length_ > lower_bound) { - INFO("Adding final extender with length " << lower_bound); - push_back_all(extenders, ConstructMPExtender(generator, lower_bound)); + INFO("Will add final extenders for length " << lower_bound); + AddScaffUniqueStorage(lower_bound); } - return extenders; + return generator.MakeMPExtenders(); } void PathExtendLauncher::FillPathContainer(size_t lib_index, size_t size_threshold) { @@ -300,17 +335,17 @@ BidirectionalPath *conj_path = new BidirectionalPath(new_path->Conjugate()); new_path->SetWeight((float) path.getWeight()); conj_path->SetWeight((float) path.getWeight()); - unique_data_.long_reads_paths_[lib_index]->AddPair(new_path, conj_path); + unique_data_.long_reads_paths_[lib_index].AddPair(new_path, conj_path); } - DEBUG("Long reads paths " << unique_data_.long_reads_paths_[lib_index]->size()); - unique_data_.long_reads_cov_map_[lib_index]->AddPaths(*unique_data_.long_reads_paths_[lib_index]); + DEBUG("Long reads paths " << unique_data_.long_reads_paths_[lib_index].size()); + unique_data_.long_reads_cov_map_[lib_index].AddPaths(unique_data_.long_reads_paths_[lib_index]); } void PathExtendLauncher::FillLongReadsCoverageMaps() { for (size_t lib_index = 0; lib_index < dataset_info_.reads.lib_count(); lib_index++) { - unique_data_.long_reads_paths_.push_back(make_shared()); - unique_data_.long_reads_cov_map_.push_back(make_shared(gp_.g)); + unique_data_.long_reads_paths_.push_back(PathContainer()); + unique_data_.long_reads_cov_map_.push_back(GraphCoverageMap(gp_.g)); if (support_.IsForSingleReadExtender(dataset_info_.reads[lib_index])) { FillPathContainer(lib_index); } @@ -319,6 +354,7 @@ void PathExtendLauncher::FillPBUniqueEdgeStorages() { //FIXME magic constants + //FIXME need to change for correct usage of prelimnary contigs in loops ScaffoldingUniqueEdgeAnalyzer unique_edge_analyzer_pb(gp_, 500, 0.5); INFO("Filling backbone edges for long reads scaffolding..."); @@ -348,26 +384,26 @@ Extenders PathExtendLauncher::ConstructPBExtenders(const ExtendersGenerator &generator) { FillPBUniqueEdgeStorages(); - return generator.MakePBScaffoldingExtenders(unique_data_.unique_pb_storage_, - unique_data_.long_reads_cov_map_); + return generator.MakePBScaffoldingExtenders(); } -Extenders PathExtendLauncher::ConstructExtenders(const GraphCoverageMap& cover_map) { +Extenders PathExtendLauncher::ConstructExtenders(const GraphCoverageMap &cover_map, + UsedUniqueStorage &used_unique_storage) { INFO("Creating main extenders, unique edge length = " << unique_data_.min_unique_length_); if (support_.SingleReadsMapped() || support_.HasLongReads()) FillLongReadsCoverageMaps(); - ExtendersGenerator generator(dataset_info_, params_, gp_, cover_map, support_); - Extenders extenders = generator.MakeBasicExtenders(unique_data_.main_unique_storage_, - unique_data_.long_reads_cov_map_); + ExtendersGenerator generator(dataset_info_, params_, gp_, cover_map, + unique_data_, used_unique_storage, support_); + Extenders extenders = generator.MakeBasicExtenders(); //long reads scaffolding extenders. 
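// [Editor's note] FillPathContainer above stores each long-read path together with
// its reverse-complement conjugate, so that both strands are later indexed by the
// coverage map. A minimal self-contained model of such paired storage (types are
// hypothetical stand-ins, not the SPAdes PathContainer):
//
//   #include <cstddef>
//   #include <memory>
//   #include <utility>
//   #include <vector>
//
//   struct PathStub {
//       float weight = 0.0f;
//       PathStub Conjugate() const { return *this; }  // stand-in for reverse complement
//   };
//
//   class PairedPathStore {
//       std::vector<std::pair<std::unique_ptr<PathStub>,
//                             std::unique_ptr<PathStub>>> data_;
//   public:
//       void AddPair(PathStub *fwd, PathStub *rc) {  // takes ownership of both
//           data_.emplace_back(std::unique_ptr<PathStub>(fwd),
//                              std::unique_ptr<PathStub>(rc));
//       }
//       std::size_t size() const { return data_.size(); }
//   };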
if (support_.HasLongReads()) { if (params_.pset.sm == sm_old) { INFO("Will not use new long read scaffolding algorithm in this mode"); } else { - push_back_all(extenders, ConstructPBExtenders(generator)); + utils::push_back_all(extenders, ConstructPBExtenders(generator)); } } @@ -375,26 +411,69 @@ if (params_.pset.sm == sm_old) { INFO("Will not use mate-pairs is this mode"); } else { - push_back_all(extenders, ConstructMPExtenders(generator)); + utils::push_back_all(extenders, ConstructMPExtenders(generator)); } } if (params_.pset.use_coordinated_coverage) - push_back_all(extenders, generator.MakeCoverageExtenders()); + utils::push_back_all(extenders, generator.MakeCoverageExtenders()); INFO("Total number of extenders is " << extenders.size()); return extenders; } -void PathExtendLauncher::PolishPaths(const PathContainer &paths, PathContainer &result) const { +void PathExtendLauncher::PolishPaths(const PathContainer &paths, PathContainer &result, + const GraphCoverageMap& /* cover_map */) const { //Fixes distances for paths gaps and tries to fill them in INFO("Closing gaps in paths"); - PathPolisher polisher(gp_, dataset_info_, unique_data_.main_unique_storage_, params_.max_polisher_gap); - polisher.PolishPaths(paths, result); + + vector> gap_closers; + + gap_closers.push_back(make_shared(gp_.g, params_.max_polisher_gap)); + for (size_t i = 0; i < dataset_info_.reads.lib_count(); i++) { + auto lib = dataset_info_.reads[i]; + if (lib.type() == io::LibraryType::HQMatePairs || lib.type() == io::LibraryType::MatePairs) { + shared_ptr paired_lib = MakeNewLib(gp_.g, lib, gp_.paired_indices[i]); + gap_closers.push_back(make_shared (gp_.g, params_.max_polisher_gap, paired_lib, + unique_data_.main_unique_storage_)); + } + } + +////TODO:: is it really empty? 
+// UniqueData unique_data; +// UsedUniqueStorage used_unique_storage(unique_data.main_unique_storage_); +// ExtendersGenerator generator(dataset_info_, params_, gp_, cover_map, +// unique_data, used_unique_storage, support_); +// auto polisher_storage = ScaffoldingUniqueEdgeStorage(); +// for (const auto& extender: generator.MakePEExtenders()) { +// gap_closers.push_back(make_shared(gp_.g, params_.max_polisher_gap, extender)); +// } +//FIXME: uncomment cover_map + + PathPolisher polisher(gp_, gap_closers); + result = polisher.PolishPaths(paths); result.SortByLength(); INFO("Gap closing completed") } +void PathExtendLauncher::FilterPaths() { + PathContainer contig_paths_copy(gp_.contig_paths.begin(), gp_.contig_paths.end()); + for (const auto& it: params_.pset.path_filtration) { + if (it.first == "default" && it.second.enabled) { + INFO("Finalizing main paths"); + CleanPaths(gp_.contig_paths, it.second); + DebugOutputPaths(gp_.contig_paths, "final_paths"); + } + else if (it.second.enabled) { + INFO("Finalizing paths - " + it.first); + PathContainer to_clean(contig_paths_copy.begin(), contig_paths_copy.end()); + CleanPaths(to_clean, it.second); + DebugOutputPaths(to_clean, it.first + "_final_paths"); + writer_.OutputPaths(to_clean, params_.output_dir + it.first + "_filtered_final_paths" + ".fasta"); + } + } +} + void PathExtendLauncher::Launch() { INFO("ExSPAnder repeat resolving tool started"); make_dir(params_.output_dir); @@ -415,30 +494,37 @@ DebugOutputPaths(seeds, "init_paths"); GraphCoverageMap cover_map(gp_.g); - Extenders extenders = ConstructExtenders(cover_map); - shared_ptr composite_extender = make_shared(gp_.g, cover_map, extenders, - unique_data_.main_unique_storage_, - params_.max_path_diff, - params_.pset.extension_options.max_repeat_length, - params_.detect_repeats_online); + UsedUniqueStorage used_unique_storage(unique_data_.main_unique_storage_); + Extenders extenders = ConstructExtenders(cover_map, used_unique_storage); + CompositeExtender composite_extender(gp_.g, cover_map, + used_unique_storage, + extenders, + params_.max_path_diff); - auto paths = resolver.ExtendSeeds(seeds, *composite_extender); - paths.FilterEmptyPaths(); - paths.SortByLength(); + auto paths = resolver.ExtendSeeds(seeds, composite_extender); DebugOutputPaths(paths, "raw_paths"); - FinalizePaths(paths, cover_map, resolver); + RemoveOverlapsAndArtifacts(paths, cover_map, resolver); DebugOutputPaths(paths, "before_loop_traversal"); - + //TODO think about ordering of path polisher vs loop traversal TraverseLoops(paths, cover_map); DebugOutputPaths(paths, "loop_traveresed"); - PolishPaths(paths, gp_.contig_paths); + //TODO does path polishing correctly work with coverage map + PolishPaths(paths, gp_.contig_paths, cover_map); DebugOutputPaths(gp_.contig_paths, "polished_paths"); - + + //TODO use move assignment to original map here GraphCoverageMap polished_map(gp_.g, gp_.contig_paths, true); - FinalizePaths(gp_.contig_paths, polished_map, resolver); - DebugOutputPaths(gp_.contig_paths, "final_paths"); + RemoveOverlapsAndArtifacts(gp_.contig_paths, polished_map, resolver); + DebugOutputPaths(gp_.contig_paths, "overlap_removed"); + + if (params_.ss.ss_enabled) { + PathContainerCoverageSwitcher switcher(gp_.g, gp_.ss_coverage.front(), params_.ss.antisense); + switcher.Apply(gp_.contig_paths); + } + + FilterPaths(); CountMisassembliesWithReference(gp_.contig_paths); diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/pipeline/launcher.hpp 
spades-3.11.1+dfsg/src/common/modules/path_extend/pipeline/launcher.hpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/pipeline/launcher.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/pipeline/launcher.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -13,34 +13,22 @@ #include "modules/path_extend/scaffolder2015/scaffold_graph.hpp" #include "assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.hpp" +#include "modules/alignment/rna/ss_coverage.hpp" + namespace path_extend { using namespace debruijn_graph; class PathExtendLauncher { - -private: const config::dataset& dataset_info_; const PathExtendParamsContainer& params_; conj_graph_pack& gp_; PELaunchSupport support_; - DefaultContigCorrector corrector_; - DefaultContigConstructor constructor_; shared_ptr contig_name_generator_; ContigWriter writer_; - struct { - size_t min_unique_length_; - double unique_variation_; - - ScaffoldingUniqueEdgeStorage main_unique_storage_; - vector> unique_storages_; - - ScaffoldingUniqueEdgeStorage unique_pb_storage_; - vector> long_reads_paths_; - vector> long_reads_cov_map_; - } unique_data_; + UniqueData unique_data_; vector> ConstructPairedConnectionConditions(const ScaffoldingUniqueEdgeStorage& edge_storage) const; @@ -69,20 +57,25 @@ void DebugOutputPaths(const PathContainer& paths, const string& name) const; - void FinalizePaths(PathContainer& paths, GraphCoverageMap &cover_map, const PathExtendResolver&resolver) const; + void RemoveOverlapsAndArtifacts(PathContainer &paths, GraphCoverageMap &cover_map, const PathExtendResolver &resolver) const; + + void CleanPaths(PathContainer &paths, const pe_config::ParamSetT::PathFiltrationT &path_filtration) const; - void TraverseLoops(PathContainer& paths, GraphCoverageMap& cover_map) const; + size_t GetLengthCutoff(size_t abs_cutoff, double rel_cutoff) const; - void PolishPaths(const PathContainer &paths, PathContainer &result) const; + void TraverseLoops(PathContainer &paths, GraphCoverageMap &cover_map) const; - Extenders ConstructExtenders(const GraphCoverageMap& cover_map); + void PolishPaths(const PathContainer &paths, PathContainer &result, const GraphCoverageMap &cover_map) const; + + Extenders ConstructExtenders(const GraphCoverageMap &cover_map, UsedUniqueStorage &used_unique_storage); Extenders ConstructMPExtenders(const ExtendersGenerator &generator); - Extenders ConstructMPExtender(const ExtendersGenerator &generator, size_t uniqe_edge_len); + void AddScaffUniqueStorage(size_t uniqe_edge_len); Extenders ConstructPBExtenders(const ExtendersGenerator &generator); + void FilterPaths(); public: @@ -93,19 +86,14 @@ params_(params), gp_(gp), support_(dataset_info, params), - corrector_(gp.g), - constructor_(gp.g, corrector_), contig_name_generator_(MakeContigNameGenerator(params_.mode, gp)), - writer_(gp.g, constructor_, gp_.components, contig_name_generator_), + writer_(gp.g, contig_name_generator_), unique_data_() { unique_data_.min_unique_length_ = params.pset.scaffolding2015.unique_length_upper_bound; unique_data_.unique_variation_ = params.pset.uniqueness_analyser.unique_coverage_variation; } - ~PathExtendLauncher() { - } - void Launch(); }; diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/pipeline/launch_support.cpp spades-3.11.1+dfsg/src/common/modules/path_extend/pipeline/launch_support.cpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/pipeline/launch_support.cpp 2017-02-28 14:55:36.000000000 +0000 +++ 
spades-3.11.1+dfsg/src/common/modules/path_extend/pipeline/launch_support.cpp 2017-09-28 09:05:10.000000000 +0000 @@ -17,6 +17,15 @@ return true; } +bool PELaunchSupport::HasOnlySingleReads() const { + for (const auto &lib : dataset_info_.reads) { + if (lib.type() != io::LibraryType::SingleReads || !lib.data().single_reads_mapped) { + return false; + } + } + return true; +} + pe_config::ParamSetT::ExtensionOptionsT PELaunchSupport::GetExtensionOpts(shared_ptr lib, const pe_config::ParamSetT &pset) const { return lib->IsMp() ? pset.mate_pair_options : pset.extension_options; @@ -49,7 +58,7 @@ //TODO: review usage bool PELaunchSupport::UseCoverageResolverForSingleReads(const io::LibraryType &type) const { - return HasOnlyMPLibs() && (type == io::LibraryType::HQMatePairs); + return (HasOnlyMPLibs() && type == io::LibraryType::HQMatePairs) || HasOnlySingleReads(); } std::string PELaunchSupport::LibStr(size_t count) const { diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/pipeline/launch_support.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/pipeline/launch_support.hpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/pipeline/launch_support.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/pipeline/launch_support.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -38,6 +38,7 @@ PathExtendParamsContainer(const config::dataset& dataset_info, const pe_config::MainPEParamsT& pe_cfg_, + const config::debruijn_config::strand_specificity& strand_specificity, const std::string& output_dir_, config::pipeline_type mode_, bool uneven_depth_, @@ -45,14 +46,14 @@ bool use_scaffolder_): pe_cfg(pe_cfg_), pset(pe_cfg_.param_set), + ss(strand_specificity), output_dir(output_dir_), etc_dir(output_dir + pe_cfg_.etc_dir + "/"), mode(mode_), uneven_depth(uneven_depth_), avoid_rc_connections(avoid_rc_connections_), use_scaffolder(use_scaffolder_), - traverse_loops(true), - detect_repeats_online(mode_ != config::pipeline_type::meta && mode_ != config::pipeline_type::rna) + traverse_loops(true) { if (!(use_scaffolder && pset.scaffolder_options.enabled)) { traverse_loops = false; @@ -66,13 +67,18 @@ if (HasLongReads(dataset_info)) max_polisher_gap = max(max_polisher_gap, size_t(10000)); - min_edge_len = 100; - max_path_diff = mode == config::pipeline_type::rna ? 
1 : FindMaxISRightQuantile(dataset_info); + min_edge_len = 0; + + max_path_diff = FindMaxISRightQuantile(dataset_info); + if (mode == config::pipeline_type::rna || mode == config::pipeline_type::meta) + max_path_diff = 0; } const pe_config::MainPEParamsT& pe_cfg; const pe_config::ParamSetT& pset; + const config::debruijn_config::strand_specificity& ss; + std::string output_dir; std::string etc_dir; @@ -82,8 +88,8 @@ bool avoid_rc_connections; bool use_scaffolder; bool traverse_loops; - bool detect_repeats_online; + //todo move to config size_t min_edge_len; size_t max_path_diff; size_t max_polisher_gap; @@ -108,6 +114,8 @@ bool HasOnlyMPLibs() const; + bool HasOnlySingleReads() const; + bool IsForSingleReadExtender(const io::SequencingLibrary &lib) const; bool IsForSingleReadScaffolder(const io::SequencingLibrary &lib) const; diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/scaffolder2015/connection_condition2015.cpp spades-3.11.1+dfsg/src/common/modules/path_extend/scaffolder2015/connection_condition2015.cpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/scaffolder2015/connection_condition2015.cpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/scaffolder2015/connection_condition2015.cpp 2017-09-28 09:05:10.000000000 +0000 @@ -66,7 +66,7 @@ if (pos2.size() != 1) { if (pos2.size() >= 2) { DEBUG("Something went wrong:: Edge " << graph_.int_id(e2) << "is called unique but presents in path twice! first edge " << graph_.int_id(e1) << " path "); - path->Print(); + path->PrintDEBUG(); } return false; } @@ -216,7 +216,7 @@ AssemblyGraphConnectionCondition::AssemblyGraphConnectionCondition(const debruijn_graph::Graph &g, size_t max_connection_length, const ScaffoldingUniqueEdgeStorage & unique_edges) : - g_(g), max_connection_length_(max_connection_length), interesting_edge_set_(unique_edges.GetSet()), stored_distances_() { + g_(g), max_connection_length_(max_connection_length), interesting_edge_set_(unique_edges.unique_edges()), stored_distances_() { } map AssemblyGraphConnectionCondition::ConnectedWith(debruijn_graph::EdgeId e) const { diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/scaffolder2015/connection_condition2015.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/scaffolder2015/connection_condition2015.hpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/scaffolder2015/connection_condition2015.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/scaffolder2015/connection_condition2015.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -5,7 +5,7 @@ #include "assembly_graph/graph_support/scaff_supplementary.hpp" #include "modules/alignment/long_read_storage.hpp" #include "modules/path_extend/pe_utils.hpp" -#include "common/assembly_graph/graph_support/basic_edge_conditions.hpp" +#include "assembly_graph/graph_support/basic_edge_conditions.hpp" #include #include diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/scaffolder2015/extension_chooser2015.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/scaffolder2015/extension_chooser2015.hpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/scaffolder2015/extension_chooser2015.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/scaffolder2015/extension_chooser2015.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -43,7 +43,7 @@ lib_connection_condition_(condition), unique_edges_(unique_edges), relative_weight_threshold_(relative_threshold), - graph_connection_condition_(g, 2 * 
unique_edges_.GetMinLength(), unique_edges), + graph_connection_condition_(g, 2 * unique_edges_.min_length(), unique_edges), //TODO to config! absolute_weight_threshold_(2), graph_connection_bonus_(2), diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/scaffolder2015/path_polisher.cpp spades-3.11.1+dfsg/src/common/modules/path_extend/scaffolder2015/path_polisher.cpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/scaffolder2015/path_polisher.cpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/scaffolder2015/path_polisher.cpp 2017-09-28 09:05:10.000000000 +0000 @@ -6,169 +6,192 @@ void PathPolisher::InfoAboutGaps(const PathContainer & result){ for (const auto& p_iter: result) { for (size_t i = 1; i < p_iter.first->Size(); ++i) { - if (p_iter.first->GapAt(i) > 0) { - DEBUG("Gap "<< p_iter.first->GapAt(i) << " left between " << gp_.g.int_id(p_iter.first->At(i-1)) << " and " << gp_.g.int_id(p_iter.first->At(i))); + if (p_iter.first->GapAt(i).gap > 0) { + DEBUG("Gap "<< p_iter.first->GapAt(i).gap + << " left between " << gp_.g.int_id(p_iter.first->At(i-1)) + << " and " << gp_.g.int_id(p_iter.first->At(i))); } } } } -PathPolisher::PathPolisher(const conj_graph_pack& gp, const config::dataset& dataset_info, const ScaffoldingUniqueEdgeStorage& storage, size_t max_resolvable_len ): gp_(gp) { - gap_closers.push_back(make_shared(gp.g, max_resolvable_len)); - for (size_t i = 0; i < dataset_info.reads.lib_count(); i++) { - auto lib = dataset_info.reads[i]; - if (lib.type() == io::LibraryType::HQMatePairs || lib.type() == io::LibraryType::MatePairs) { - shared_ptr paired_lib = MakeNewLib(gp.g, lib, gp.paired_indices[i]); - gap_closers.push_back(make_shared (gp.g, max_resolvable_len, paired_lib, storage)); - } - } -} - -void PathPolisher::PolishPaths(const PathContainer &paths, PathContainer &result) { - result.clear(); - - for (auto iter = paths.begin(); iter != paths.end(); ++iter) { - - BidirectionalPath *path = new BidirectionalPath(Polish(*iter.get())); - BidirectionalPath *conjugatePath = new BidirectionalPath(Polish(path->Conjugate())); - BidirectionalPath *re_path = new BidirectionalPath(conjugatePath->Conjugate()); - result.AddPair(re_path, conjugatePath); +PathContainer PathPolisher::PolishPaths(const PathContainer &paths) { + PathContainer result; + for (const auto& path_pair : paths) { + BidirectionalPath path = Polish(*path_pair.first); + BidirectionalPath *conjugate_path = new BidirectionalPath(Polish(path.Conjugate())); + BidirectionalPath *re_path = new BidirectionalPath(conjugate_path->Conjugate()); + result.AddPair(re_path, conjugate_path); } InfoAboutGaps(result); + return result; } -size_t DijkstraGapCloser::MinPathLength(const omnigraph::PathStorageCallback& path_storage) const { - size_t shortest_len = omnigraph::CumulativeLength(g_, path_storage.paths().front()); - for (size_t j = 1; j < path_storage.paths().size(); ++j) { - size_t cur_len = omnigraph::CumulativeLength(g_, path_storage.paths()[j]); +size_t DijkstraGapCloser::MinPathLength(const PathsT& paths) const { + size_t shortest_len = omnigraph::CumulativeLength(g_, paths.front()); + for (size_t j = 1; j < paths.size(); ++j) { + size_t cur_len = omnigraph::CumulativeLength(g_, paths[j]); shortest_len = min(shortest_len, cur_len); } return shortest_len; } -BidirectionalPath PathPolisher::Polish(const BidirectionalPath &path) { - if (path.Empty()) - return path; - shared_ptr current; - shared_ptr prev_step = std::make_shared(path); +BidirectionalPath 
PathPolisher::Polish(const BidirectionalPath &init_path) { + if (init_path.Empty()) + return init_path; + + auto path = make_shared(init_path); + size_t prev_len = path->Size(); + bool changed = true; size_t count = 0; while (changed) { changed = false; - for (size_t i = 0; i < gap_closers.size(); i++) { - current = std::make_shared(gap_closers[i]->Polish(*prev_step)); - if (current->Size() != prev_step->Size()){ + for (const auto& gap_closer : gap_closers_) { + path = make_shared(gap_closer->CloseGaps(*path)); + if (path->Size() != prev_len){ changed = true; - std::swap(current, prev_step); + prev_len = path->Size(); break; } } count++; - if (count > 5) { + if (count > MAX_POLISH_ATTEMPTS) { INFO("Unexpected cycle while polishing path, stopping polishing " ); - path.Print(); + path->PrintDEBUG(); break; } } - return *prev_step; + return *path; } -BidirectionalPath DijkstraGapCloser::Polish(const BidirectionalPath &path) { +BidirectionalPath PathGapCloser::CloseGaps(const BidirectionalPath &path) const { BidirectionalPath result(g_); if (path.Empty()) return result; - result.PushBack(path[0], path.GapInfoAt(0)); + + VERIFY(path.GapAt(0) == Gap()); + result.PushBack(path[0]); for (size_t i = 1; i < path.Size(); ++i) { if (g_.EdgeEnd(path[i - 1]) == g_.EdgeStart(path[i])) { - result.PushBack(path[i], path.GapInfoAt(i)); + result.PushBack(path[i], path.GapAt(i)); } else { - //Connect edges using Dijkstra - omnigraph::PathStorageCallback path_storage(g_); - omnigraph::ProcessPaths(g_, 0, - max_path_len_, - g_.EdgeEnd(path[i - 1]), - g_.EdgeStart(path[i]), - path_storage); - - if (path_storage.size() == 0) { - //No paths found, keeping the gap - result.PushBack(path[i], path.GapInfoAt(i)); - } else if (path_storage.size() > 1) { - //More than one path, using shortest path for gap length estimation - //We cannot use both common paths and bridges in one attempt; - if (!FillWithMultiplePaths(path, i, path_storage, result)) - FillWithBridge(path, i, path_storage, result); - } else { - //Closing the gap with the unique shortest path - for (size_t j = 0; j < path_storage.paths().front().size(); ++j) { - result.PushBack(path_storage.paths().front()[j]); - } - result.PushBack(path[i]); - } + DEBUG("Gap between " << path[i - 1].int_id() << " and " << path[i].int_id() << " " << path.GapAt(i)); + auto new_gap = CloseGap(path, i, result); + DEBUG("gap after " << new_gap); + result.PushBack(path[i], new_gap); } } return result; } +Gap DijkstraGapCloser::CloseGap(EdgeId target_edge, const Gap &orig_gap, BidirectionalPath &result) const { + VertexId target_vertex = g_.EdgeStart(target_edge); +//TODO:: actually we do not need paths, only edges.. 
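// [Editor's note] CloseGap below delegates to omnigraph::ProcessPaths, which
// enumerates all paths between two vertices up to a length bound and returns a
// nonzero code when the search space is too large. A toy model of that contract on
// a plain adjacency list (not the SPAdes implementation; budget semantics assumed):
//
//   #include <cstddef>
//   #include <vector>
//
//   struct Edge { int to; std::size_t len; };
//   using AdjList = std::vector<std::vector<Edge>>;  // vertex -> outgoing edges
//
//   // Counts paths from v to target of total length <= max_len;
//   // returns nonzero once the exploration budget is exhausted.
//   int EnumeratePaths(const AdjList &g, int v, int target, std::size_t max_len,
//                      std::size_t &found, std::size_t &budget) {
//       if (budget == 0) return 1;
//       --budget;
//       if (v == target) ++found;
//       for (const Edge &e : g[v]) {
//           if (e.len > max_len) continue;
//           int res = EnumeratePaths(g, e.to, target, max_len - e.len, found, budget);
//           if (res != 0) return res;
//       }
//       return 0;
//   }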
+    omnigraph::PathStorageCallback path_storage(g_);
+    int process_res = omnigraph::ProcessPaths(g_, 0,
+                                              max_path_len_,
+                                              g_.EdgeEnd(result.Back()),
+                                              target_vertex,
+                                              path_storage);
+    if (path_storage.size() == 0 || process_res != 0) {
+//No paths found or path_processor error (in particular, too many vertices in Dijkstra), keeping the gap
+        DEBUG("PathProcessor nonzero exit code, gap left unchanged");
+        return orig_gap;
+    } else if (path_storage.size() > 1) {
+//More than one path found, using the shortest one for gap length estimation
+//We cannot use both common paths and bridges in one attempt;
+        Gap gap = FillWithMultiplePaths(path_storage.paths(), result);
+        if (gap == Gap::INVALID())
+            gap = FillWithBridge(orig_gap, path_storage.paths(), result);
+        return gap;
+    } else {
+//Closing the gap with the unique shortest path
+        DEBUG("Unique path gap closing:");
+        for (EdgeId e : path_storage.paths().front()) {
+            DEBUG(e.int_id());
+            result.PushBack(e);
+        }
+        return Gap(0);
+    }
+}
+
+Gap PathExtenderGapCloser::CloseGap(EdgeId target_edge, const Gap &orig_gap, BidirectionalPath &result) const {
+    size_t added = 0;
+    VertexId target_vertex = g_.EdgeStart(target_edge);
+    while (g_.EdgeEnd(result.Back()) != target_vertex) {
+        bool has_grown = extender_->MakeGrowStep(result);
+        if (!has_grown)
+            break;
+        added += g_.length(result.Back());
+    }
+    //FIXME think of checking for 0 in advance
+    return Gap(orig_gap.gap - (int) added, 0, orig_gap.trash_current);
+}
-bool DijkstraGapCloser::FillWithBridge(const BidirectionalPath& path, size_t index,
-                                       const omnigraph::PathStorageCallback& path_storage,
-                                       BidirectionalPath& result) const {
-//TODO:: constant;
-    auto counts = CountEdgesQuantity(path_storage, 300);
-    size_t path_quantity = path_storage.paths().size();
+Gap DijkstraGapCloser::FillWithBridge(const Gap &orig_gap,
+                                      const PathsT& paths,
+                                      BidirectionalPath& result) const {
+    //TODO:: constant;
+    auto counts = CountEdgesQuantity(paths, 300);
+    DEBUG("Filling gap with bridges");
+    size_t path_quantity = paths.size();
     vector bridges;
     for (const auto& pair: counts)
         if (pair.second == path_quantity)
             bridges.push_back(pair.first);
-    if (bridges.size() > 0) {
+
+    if (bridges.empty()) {
+        return orig_gap;
+    } else {
         std::sort(bridges.begin(), bridges.end(), [&] (EdgeId e1, EdgeId e2) {
-            return g_.length(e1) > g_.length(e2); });
+            return g_.length(e1) > g_.length(e2);});
         EdgeId bridge = bridges[0];
-        int min_gap_before = path.GapAt(index);
-        int min_gap_after = path.GapAt(index);
-        for (const auto& path:path_storage.paths()) {
-            int current_before = 0;
-            for(size_t i = 0; i< path.size(); i++) {
-                if (path[i] != bridge)
-                    current_before += (int)g_.length(path[i]);
-                else
+
+        VERIFY(orig_gap.gap >= 0 && orig_gap.NoTrash());
+        int min_gap_before = orig_gap.gap;
+        int min_gap_after = orig_gap.gap;
+        for (const auto& path : paths) {
+            size_t current_before = 0;
+            for (EdgeId e : path) {
+                if (e == bridge)
                     break;
+                current_before += g_.length(e);
             }
-            int current_after = (int)CumulativeLength(g_, path) - current_before - int(g_.length(bridge));
-            min_gap_after = std::min(current_after, min_gap_after);
-            min_gap_before = std::min(current_before, min_gap_before);
+            size_t current_after = CumulativeLength(g_, path) - current_before - g_.length(bridge);
+            min_gap_after = std::min(int(current_after), min_gap_after);
+            min_gap_before = std::min(int(current_before), min_gap_before);
         }
+
         min_gap_after = std::max(min_gap_after, min_gap_);
         min_gap_before = std::max(min_gap_before, min_gap_);
-        result.PushBack(bridge,
min_gap_before); - result.PushBack(path[index], min_gap_after); - return true; - } else { - result.PushBack(path[index], path.GapAt(index)); - return false; + DEBUG(bridge.int_id() << " " << min_gap_before); + result.PushBack(bridge, Gap(min_gap_before)); + return Gap(min_gap_after); } } -bool DijkstraGapCloser::FillWithMultiplePaths(const BidirectionalPath& path, size_t index, - const omnigraph::PathStorageCallback& path_storage, +Gap DijkstraGapCloser::FillWithMultiplePaths(const PathsT& paths, BidirectionalPath& result) const { bool changed = false; - auto left = LCP(path_storage); + auto left = LCP(paths); + DEBUG("Filling gap with prefix") for (auto e : left) { + DEBUG(e.int_id()); result.PushBack(e); changed = true; } - int middle_gap = (int) max(size_t(min_gap_), MinPathLength(path_storage) - - omnigraph::CumulativeLength(g_, left)); - if (changed) - result.PushBack(path[index], middle_gap); - return changed; + if (changed) { + int gap = max(min_gap_, + int(MinPathLength(paths) - omnigraph::CumulativeLength(g_, left))); + return Gap(gap); + } else + return Gap::INVALID(); } -std::map DijkstraGapCloser::CountEdgesQuantity(const omnigraph::PathStorageCallback& path_storage, size_t length_limit ) const{ +std::map DijkstraGapCloser::CountEdgesQuantity(const PathsT &paths, size_t length_limit) const { map res; - for (const auto& path: path_storage.paths()) { + for (const auto& path: paths) { set edge_set(path.begin(), path.end()); for (const auto& e: edge_set) { if (g_.length(e) >= length_limit) { @@ -179,23 +202,23 @@ return res; }; -size_t DijkstraGapCloser::MinPathSize(const omnigraph::PathStorageCallback& path_storage) const { - size_t size = path_storage.paths().front().size(); - for (size_t i = 1; i < path_storage.size(); ++i) { - size = min(size, path_storage.paths()[i].size()); +size_t DijkstraGapCloser::MinPathSize(const PathsT& paths) const { + size_t size = paths.front().size(); + for (size_t i = 1; i < paths.size(); ++i) { + size = min(size, paths[i].size()); } return size; } -vector DijkstraGapCloser::LCP(const omnigraph::PathStorageCallback& path_storage) const { +vector DijkstraGapCloser::LCP(const PathsT& paths) const { bool all_equal = true; size_t index = 0; - size_t min_size = MinPathSize(path_storage); + size_t min_size = MinPathSize(paths); while (index < min_size && all_equal) { - for (size_t i = 1; i < path_storage.size(); ++i) { - auto e = path_storage.paths().front()[index]; - if (e != path_storage.paths()[i][index]) { + for (size_t i = 1; i < paths.size(); ++i) { + auto e = paths.front()[index]; + if (e != paths[i][index]) { all_equal = false; break; } @@ -206,26 +229,28 @@ vector result; for (size_t i = 0; i < index; ++i) { - result.push_back(path_storage.paths().front()[i]); + result.push_back(paths.front()[i]); } return result; } - -EdgeId MatePairGapCloser::FindNext(const BidirectionalPath& path, size_t index, - const set& present_in_paths, VertexId v) const { - auto next_edges = g_.OutgoingEdges(v); +EdgeId MatePairGapCloser::FindNext(const BidirectionalPath& path, + const set& present_in_paths, + VertexId last_v, EdgeId target_edge) const { + auto next_edges = g_.OutgoingEdges(last_v); map candidates; + for (const auto edge: next_edges) if (present_in_paths.find(edge) != present_in_paths.end()) candidates.insert(make_pair(edge, 0)); - if (candidates.size() <= 1 ) { - if (candidates.size() == 0 || candidates.begin()->first == path[index]) + + if (candidates.size() <= 1) { + if (candidates.size() == 0 || candidates.begin()->first == target_edge) return 
EdgeId(0); else return (candidates.begin()->first); } else { - int i = (int) index - 1; + int i = (int) path.Size() - 1; for (; i >= 0; i--) { if (storage_.IsUnique(path[i])) break; @@ -248,79 +273,73 @@ sort(to_sort.begin(), to_sort.end(), [&] (std::pair a, std::pair b ) { return a.second > b.second; }); - if (to_sort[0].second > to_sort[1].second * weight_priority && to_sort[0].first != path[index]) + if (to_sort[0].second > to_sort[1].second * weight_priority && to_sort[0].first != target_edge) return to_sort[0].first; else return EdgeId(0); } } } - -//TODO: make shorter functions -BidirectionalPath MatePairGapCloser::Polish(const BidirectionalPath& path) { - BidirectionalPath result(g_); - DEBUG("Path " << path.GetId() << " len "<< path.Length() << " size " << path.Size()); - result.PushBack(path[0], path.GapInfoAt(0)); - for (size_t i = 1; i < path.Size(); ++i) { - if (g_.EdgeEnd(path[i - 1]) == g_.EdgeStart(path[i]) || path.GapAt(i) <= min_gap_) { - result.PushBack(path[i], path.GapInfoAt(i)); - } else { - DEBUG("position "<< i <<" gap between edges " << g_.int_id(path[i-1]) << " and " << g_.int_id(path[i]) << " was " << path.GapAt(i)); - - vector addition; - VertexId v = g_.EdgeEnd(path[i - 1]); - EdgeId last = path[i - 1]; - omnigraph::PathStorageCallback path_storage(g_); - omnigraph::ProcessPaths(g_, 0, - max_path_len_, - g_.EdgeEnd(path[i - 1]), - g_.EdgeStart(path[i]), - path_storage); - set present_in_paths; - for(const auto &p: path_storage.paths()) - for(size_t j = 0; j < p.size(); j ++) - present_in_paths.insert(p[j]); - size_t total = 0; - while (last != EdgeId(0)){ - last = FindNext(path, i, present_in_paths, v); - if (last != EdgeId(0)){ - v = g_.EdgeEnd(last); - addition.push_back(last); - total += g_.length(last); - } - if (total > max_path_len_){ - DEBUG("gap between edges " << g_.int_id(path[i-1]) << " and " << g_.int_id(path[i]) << " was: " << path.GapAt(i) << ", closing path length too long: " << total); - break; - } +//FIXME review logic +Gap MatePairGapCloser::CloseGap(EdgeId target_edge, const Gap &orig_gap, BidirectionalPath &path) const { + VertexId target_vertex = g_.EdgeStart(target_edge); +//TODO:: condition about trash_previous - do we need it globally? 
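/*
 * Sketch of the winner-takes-all rule in MatePairGapCloser::FindNext above:
 * the top-weighted candidate is accepted only when it beats the runner-up by
 * the weight_priority factor; otherwise the choice is treated as ambiguous
 * (EdgeId(0) in the real code, -1 here). Toy (id, weight) pairs.
 */
#include <algorithm>
#include <utility>
#include <vector>

int ChooseNext(std::vector<std::pair<int, double>> cands,   // (edge id, weight)
               double weight_priority = 5.0) {
    if (cands.empty()) return -1;
    std::sort(cands.begin(), cands.end(),
              [](const auto &a, const auto &b) { return a.second > b.second; });
    if (cands.size() == 1) return cands.front().first;
    // accept only a clearly dominating candidate
    return cands[0].second > cands[1].second * weight_priority
               ? cands[0].first : -1;
}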
+ if (orig_gap.gap <= min_gap_ || orig_gap.trash_previous > 0) { + return orig_gap; + } else { + vector addition; + EdgeId last_e = path.Back(); + VertexId last_v = g_.EdgeEnd(last_e); + DEBUG("Closing gap with mate pairs between edge " << g_.int_id(last_e) + << " and edge " << g_.int_id(target_edge) << " was " << orig_gap); + omnigraph::PathStorageCallback path_storage(g_); + int process_res = omnigraph::ProcessPaths(g_, 0, + max_path_len_, + last_v, + target_vertex, + path_storage); + if (process_res != 0) { + DEBUG("PathProcessor nonzero exit code, gap left unchanged"); + return orig_gap; + } + set present_in_paths; + for (const auto &p: path_storage.paths()) + for (EdgeId e : p) + present_in_paths.insert(e); + + size_t total = 0; + while (last_e != EdgeId(0)) { + last_e = FindNext(path, present_in_paths, last_v, target_edge); + if (last_e != EdgeId(0)) { + last_v = g_.EdgeEnd(last_e); + addition.push_back(last_e); + total += g_.length(last_e); } if (total > max_path_len_) { - result.PushBack(path[i], path.GapInfoAt(i)); - continue; + DEBUG("Closing result length too long: " << total); + return orig_gap; } - int len = int(CumulativeLength(g_, addition)); - int new_gap = path.GapAt(i) - len; - if (new_gap < min_gap_ && addition.size() > 0) { - if (path.GapAt(i) * 3 < len * 2 ) { -//inserted path significantly longer than estimated gap - DEBUG("Gap size estimation problem: gap between edges " << g_.int_id(path[i - 1]) << " and " << g_.int_id(path[i]) << " was " << - path.GapAt(i) << "filled len" << len); - } - if (g_.EdgeEnd(addition.back()) != g_.EdgeStart(path[i])) - new_gap = min_gap_; - else - new_gap = 0; - } - DEBUG("filling"); - for (size_t j = 0; j < addition.size(); j++) { - DEBUG(g_.int_id(addition[j])); - result.PushBack(addition[j], 0); + } + + int len = int(CumulativeLength(g_, addition)); + Gap gap(orig_gap.gap - len); + if (gap.gap < min_gap_ && addition.size() > 0) { + //todo constant + if (orig_gap.gap * 2 < len) { +//inserted result significantly longer than estimated gap + DEBUG("Filled len" << len); } - result.PushBack(path[i], new_gap); - DEBUG("filled"); + if (g_.EdgeEnd(addition.back()) != target_vertex) + gap = Gap(min_gap_); + else + gap = Gap(); } + for (EdgeId e : addition) { + DEBUG(g_.int_id(e)); + path.PushBack(e); + } + return gap; } - DEBUG("result " << result.GetId() << " len "<< result.Length() << " size " << result.Size()); - return result; } } diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/scaffolder2015/path_polisher.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/scaffolder2015/path_polisher.hpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/scaffolder2015/path_polisher.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/scaffolder2015/path_polisher.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -5,80 +5,130 @@ #include "assembly_graph/paths/bidirectional_path.hpp" #include "assembly_graph/core/basic_graph_stats.hpp" #include "modules/path_extend/paired_library.hpp" +#include "modules/path_extend/path_extender.hpp" #include "assembly_graph/graph_support/scaff_supplementary.hpp" -#include "common/pipeline/graph_pack.hpp" +#include "pipeline/graph_pack.hpp" namespace path_extend { class PathGapCloser { protected: const Graph& g_; - size_t max_path_len_; - int min_gap_; + const size_t max_path_len_; + const int min_gap_; + + virtual Gap CloseGap(const BidirectionalPath &original_path, size_t position, + BidirectionalPath &path) const = 0; + DECL_LOGGER("PathGapCloser") +public: + 
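/*
 * Sketch of the gap bookkeeping at the end of MatePairGapCloser::CloseGap
 * above: the estimated gap shrinks by the total inserted length; if it drops
 * below min_gap_, it is clamped to min_gap_ when the inserted edges stop
 * short of the target vertex and to zero when they reach it. Free-standing
 * toy arithmetic, not the SPAdes Gap type.
 */
int UpdatedGap(int orig_gap, int inserted_len, bool reached_target, int min_gap) {
    int gap = orig_gap - inserted_len;
    if (gap < min_gap && inserted_len > 0)      // insertion over-fills the estimate
        gap = reached_target ? 0 : min_gap;
    return gap;
}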
BidirectionalPath CloseGaps(const BidirectionalPath &path) const; + + PathGapCloser(const Graph& g, size_t max_path_len): + g_(g), + max_path_len_(max_path_len), + //TODO:: config + min_gap_(int(g.k() + 10)) {} + +}; + +//Intermediate abstract class - majority of GapClosers needs only one next edge after gap, not all original path. +class TargetEdgeGapCloser : public PathGapCloser { +protected: + //returns updated gap to target edge + virtual Gap CloseGap(EdgeId target_edge, const Gap &gap, BidirectionalPath &path) const = 0; + + Gap CloseGap(const BidirectionalPath &original_path, + size_t position, BidirectionalPath &path) const final override { + return CloseGap(original_path.At(position), original_path.GapAt(position), path); + } + public: - virtual BidirectionalPath Polish(const BidirectionalPath& path) = 0; -//TODO:: config - PathGapCloser(const Graph& g, size_t max_path_len): g_(g), max_path_len_(max_path_len), min_gap_(int(g.k() + 10)) {} + TargetEdgeGapCloser(const Graph& g, size_t max_path_len): + PathGapCloser(g, max_path_len) {} + +}; + +class PathExtenderGapCloser: public TargetEdgeGapCloser { + shared_ptr extender_; +protected: + Gap CloseGap(EdgeId target_edge, const Gap &gap, BidirectionalPath &path) const override; + +public: + PathExtenderGapCloser(const Graph& g, size_t max_path_len, shared_ptr extender): + TargetEdgeGapCloser(g, max_path_len), extender_(extender) { + DEBUG("ext added"); + } }; -class MatePairGapCloser: public PathGapCloser { +class MatePairGapCloser: public TargetEdgeGapCloser { const shared_ptr lib_; const ScaffoldingUniqueEdgeStorage& storage_; - //TODO: config? somewhere else? static constexpr double weight_priority = 5; + + EdgeId FindNext(const BidirectionalPath& path, + const set& present_in_paths, + VertexId last_v, EdgeId target_edge) const; +protected: + Gap CloseGap(EdgeId target_edge, const Gap &gap, BidirectionalPath &path) const override; + + DECL_LOGGER("MatePairGapCloser") + public: - EdgeId FindNext(const BidirectionalPath& path, size_t index, - const set& present_in_paths, VertexId v) const; - MatePairGapCloser(const Graph& g, size_t max_path_len, const shared_ptr lib, const ScaffoldingUniqueEdgeStorage& storage): - PathGapCloser(g, max_path_len), lib_(lib), storage_(storage) {} - BidirectionalPath Polish(const BidirectionalPath& path) override; + MatePairGapCloser(const Graph& g, size_t max_path_len, + const shared_ptr lib, + const ScaffoldingUniqueEdgeStorage& storage): + TargetEdgeGapCloser(g, max_path_len), lib_(lib), storage_(storage) {} }; -class DijkstraGapCloser: public PathGapCloser { +//TODO switch to a different Callback, no need to store all paths +class DijkstraGapCloser: public TargetEdgeGapCloser { + typedef vector> PathsT; -protected: + Gap FillWithMultiplePaths(const PathsT& paths, + BidirectionalPath& result) const; - BidirectionalPath Polish(const BidirectionalPath& path) override; + Gap FillWithBridge(const Gap &orig_gap, + const PathsT& paths, BidirectionalPath& result) const; - size_t MinPathLength(const omnigraph::PathStorageCallback& path_storage) const; + size_t MinPathLength(const PathsT& paths) const; - bool FillWithMultiplePaths(const BidirectionalPath& path, size_t index, - const omnigraph::PathStorageCallback& path_storage, - BidirectionalPath& result) const; + size_t MinPathSize(const PathsT& paths) const; - bool FillWithBridge(const BidirectionalPath& path, size_t index, - const omnigraph::PathStorageCallback& path_storage, - BidirectionalPath& result) const; + vector LCP(const PathsT& paths) const; - 
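/*
 * Sketch of the class layout introduced above: PathGapCloser fixes the
 * "close the gap at a given position" contract, and TargetEdgeGapCloser
 * narrows the hook so that subclasses only see the next edge and the current
 * gap. Stub toy types below; only the shape of the hierarchy matters.
 */
#include <cstddef>
#include <vector>

struct ToyGap { int gap = 0; };
struct ToyBidiPath {
    std::vector<int> edges;
    std::vector<ToyGap> gaps;                   // gaps[i] precedes edges[i]
    int At(std::size_t i) const { return edges[i]; }
    const ToyGap &GapAt(std::size_t i) const { return gaps[i]; }
};

class ToyGapCloser {
protected:
    virtual ToyGap Close(const ToyBidiPath &orig, std::size_t pos,
                         ToyBidiPath &out) const = 0;
public:
    virtual ~ToyGapCloser() = default;
};

class ToyTargetEdgeCloser : public ToyGapCloser {
protected:
    // subclasses implement only this narrowed hook
    virtual ToyGap Close(int target_edge, const ToyGap &g, ToyBidiPath &out) const = 0;

    ToyGap Close(const ToyBidiPath &orig, std::size_t pos,
                 ToyBidiPath &out) const final override {
        return Close(orig.At(pos), orig.GapAt(pos), out);
    }
};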
size_t MinPathSize(const omnigraph::PathStorageCallback& path_storage) const; + std::map CountEdgesQuantity(const PathsT& paths, size_t length_limit) const; - vector LCP(const omnigraph::PathStorageCallback& path_storage) const; +protected: + Gap CloseGap(EdgeId target_edge, const Gap &gap, BidirectionalPath &path) const override; - std::map CountEdgesQuantity(const omnigraph::PathStorageCallback& path_storage, size_t length_limit) const; + DECL_LOGGER("DijkstraGapCloser") public: DijkstraGapCloser(const Graph& g, size_t max_path_len): - PathGapCloser(g, max_path_len) {} - + TargetEdgeGapCloser(g, max_path_len) {} }; class PathPolisher { + static const size_t MAX_POLISH_ATTEMPTS = 5; + + const conj_graph_pack &gp_; + vector> gap_closers_; -private: - const conj_graph_pack& gp_; - vector> gap_closers; + void InfoAboutGaps(const PathContainer& result); -private: - void InfoAboutGaps(const PathContainer & result); BidirectionalPath Polish(const BidirectionalPath& path); + DECL_LOGGER("PathPolisher") public: - PathPolisher(const conj_graph_pack& gp, const config::dataset& dataset_info, const ScaffoldingUniqueEdgeStorage& storage, size_t max_resolvable_len); + PathPolisher(const conj_graph_pack &gp, + const vector> &gap_closers): + gp_(gp), gap_closers_(gap_closers) { + } - void PolishPaths(const PathContainer& paths, PathContainer& result); + PathContainer PolishPaths(const PathContainer &paths); }; diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/scaffolder2015/scaffold_graph.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/scaffolder2015/scaffold_graph.hpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/scaffolder2015/scaffold_graph.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/scaffolder2015/scaffold_graph.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -9,7 +9,7 @@ #include "connection_condition2015.hpp" #include "utils/standard_base.hpp" -#include "common/adt/iterator_range.hpp" +#include "adt/iterator_range.hpp" namespace path_extend { namespace scaffold_graph { diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp spades-3.11.1+dfsg/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp 2017-09-28 09:05:10.000000000 +0000 @@ -18,17 +18,17 @@ const string ScaffoldEdgeColorer::default_color = "black"; string ScaffoldGraphLabeler::label(EdgeId e) const { - return "ID: " + ToString(e.getId()) + - "\\n Weight: " + ToString(e.getWeight()) + - "\\n Lib#: " + ToString(e.getColor()); + return "ID: " + std::to_string(e.getId()) + + "\\n Weight: " + std::to_string(e.getWeight()) + + "\\n Lib#: " + std::to_string(e.getColor()); } string ScaffoldGraphLabeler::label(VertexId v) const { auto it = additional_vertex_labels_.find(v); string additional_label = it == additional_vertex_labels_.end() ? 
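/*
 * Sketch of the capped fixed-point loop implied by MAX_POLISH_ATTEMPTS above:
 * gap closers are reapplied while they still change the path, but at most a
 * few times. apply_once is a hypothetical callable standing in for one pass
 * of PathGapCloser::CloseGaps, and size growth is used as a crude change
 * test; the real change detection may differ.
 */
#include <cstddef>
#include <utility>

template <class Path, class Fn>
Path PolishUntilStable(Path path, Fn apply_once, std::size_t max_attempts = 5) {
    for (std::size_t i = 0; i < max_attempts; ++i) {
        Path next = apply_once(path);           // one pass over all gaps
        bool changed = next.Size() != path.Size();
        path = std::move(next);
        if (!changed) break;                    // nothing left to close
    }
    return path;
}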
"" : it->second + "\n"; - return "ID: " + ToString(graph_.int_id(v)) + - "\\n Len: " + ToString(graph_.AssemblyGraph().length(v)) + - "\\n Cov: " + ToString(graph_.AssemblyGraph().coverage(v)) + "\n" + + return "ID: " + std::to_string(graph_.int_id(v)) + + "\\n Len: " + std::to_string(graph_.AssemblyGraph().length(v)) + + "\\n Cov: " + std::to_string(graph_.AssemblyGraph().coverage(v)) + "\n" + additional_label; } diff -Nru spades-3.10.1+dfsg/src/common/modules/path_extend/weight_counter.hpp spades-3.11.1+dfsg/src/common/modules/path_extend/weight_counter.hpp --- spades-3.10.1+dfsg/src/common/modules/path_extend/weight_counter.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/path_extend/weight_counter.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -80,7 +80,9 @@ public: virtual ~IdealInfoProvider() {} - virtual std::vector FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate) const = 0; + virtual std::vector FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate, int gap = 0) const = 0; +protected: + DECL_LOGGER("IdealInfoProvider"); }; class BasicIdealInfoProvider : public IdealInfoProvider { @@ -89,11 +91,11 @@ BasicIdealInfoProvider(const shared_ptr& lib) : lib_(lib) { } - std::vector FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate) const override { + std::vector FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate, int gap) const override { std::vector covered; for (int i = (int) path.Size() - 1; i >= 0; --i) { double w = lib_->IdealPairedInfo(path[i], candidate, - (int) path.LengthAt(i)); + (int) path.LengthAt(i) + gap); //FIXME think if we need extremely low ideal weights if (math::gr(w, 0.)) { covered.push_back(EdgeWithPairedInfo(i, w)); @@ -107,13 +109,13 @@ protected: const Graph& g_; - const shared_ptr lib_; + shared_ptr lib_; bool normalize_weight_; shared_ptr ideal_provider_; public: - WeightCounter(const Graph& g, const shared_ptr& lib, + WeightCounter(const Graph& g, shared_ptr lib, bool normalize_weight = true, shared_ptr ideal_provider = nullptr) : g_(g), lib_(lib), normalize_weight_(normalize_weight), ideal_provider_(ideal_provider) { @@ -128,15 +130,11 @@ virtual double CountWeight(const BidirectionalPath& path, EdgeId e, const std::set& excluded_edges = std::set(), int gapLength = 0) const = 0; - const PairedInfoLibrary& lib() const { + const PairedInfoLibrary& PairedLibrary() const { return *lib_; } - const shared_ptr get_libptr() const { - return lib_; - }; - -private: +protected: DECL_LOGGER("WeightCounter"); }; @@ -146,7 +144,7 @@ int add_gap = 0) const { std::vector answer; - for (const EdgeWithPairedInfo& e_w_pi : ideal_provider_->FindCoveredEdges(path, e)) { + for (const EdgeWithPairedInfo& e_w_pi : ideal_provider_->FindCoveredEdges(path, e, add_gap)) { double w = lib_->CountPairedInfo(path[e_w_pi.e_], e, (int) path.LengthAt(e_w_pi.e_) + add_gap); @@ -213,17 +211,27 @@ const std::vector& ideally_covered_edges, int add_gap = 0) const { std::vector answer; - for (const EdgeWithPairedInfo& e_w_pi : ideally_covered_edges) { + for (const auto& e_w_pi : ideally_covered_edges) { double ideal_weight = e_w_pi.pi_; + TRACE("Supposedly covered edge " << e_w_pi.e_ << " " + << g_.str(path.At(e_w_pi.e_)) + << " ideal weight " << ideal_weight); + + TRACE("Querying paired library for edges " << g_.str(path[e_w_pi.e_]) + << " " << g_.str(e) << " at dist " + << (path.LengthAt(e_w_pi.e_) + add_gap)); - double weight = lib_->CountPairedInfo( - path[e_w_pi.e_], e, + double weight = 
lib_->CountPairedInfo(path[e_w_pi.e_], e, (int) path.LengthAt(e_w_pi.e_) + add_gap); + TRACE("Actual weight " << weight); + if (normalize_weight_) { weight /= ideal_weight; } + TRACE("After normalization " << weight << " threshold " << single_threshold_); + if (math::ge(weight, single_threshold_)) { answer.push_back(EdgeWithPairedInfo(e_w_pi.e_, ideal_weight)); } @@ -245,8 +253,9 @@ double CountWeight(const BidirectionalPath& path, EdgeId e, const std::set& excluded_edges, int gap) const override { + TRACE("Counting weight for edge " << g_.str(e)); double lib_weight = 0.; - const auto ideal_coverage = ideal_provider_->FindCoveredEdges(path, e); + const auto ideal_coverage = ideal_provider_->FindCoveredEdges(path, e, gap); for (const auto& e_w_pi : CountLib(path, e, ideal_coverage, gap)) { if (!excluded_edges.count(e_w_pi.e_)) { @@ -255,13 +264,18 @@ } double total_ideal_coverage = TotalIdealNonExcluded(ideal_coverage, excluded_edges); + + TRACE("Excluded edges " << utils::join(excluded_edges, ", ", + [&] (const size_t &i) { return g_.str(path.At(i)); })); + TRACE("Total ideal coverage " << total_ideal_coverage); + TRACE("Lib weight " << lib_weight); return math::eq(total_ideal_coverage, 0.) ? 0. : lib_weight / total_ideal_coverage; } std::set PairInfoExist(const BidirectionalPath& path, EdgeId e, int gap = 0) const override { std::set answer; - for (const auto& e_w_pi : CountLib(path, e, ideal_provider_->FindCoveredEdges(path, e), gap)) { + for (const auto& e_w_pi : CountLib(path, e, ideal_provider_->FindCoveredEdges(path, e, gap), gap)) { if (math::gr(e_w_pi.pi_, 0.)) { answer.insert(e_w_pi.e_); } @@ -292,13 +306,16 @@ VERIFY(read_length_ > g_.k()); } - std::vector FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate) const override { + //TODO optimize number of calls of EstimatePathCoverage(path) + std::vector FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate, int gap) const override { VERIFY(read_length_ != -1ul); //bypassing problems with ultra-low coverage estimates double estimated_coverage = max(EstimatePathCoverage(path), 1.0); double correction_coeff = estimated_coverage / ((double(read_length_) - double(g_.k())) * MAGIC_COEFF); + TRACE("Estimated coverage " << estimated_coverage); + TRACE("Correction coefficient " << correction_coeff); - std::vector answer = BasicIdealInfoProvider::FindCoveredEdges(path, candidate); + std::vector answer = BasicIdealInfoProvider::FindCoveredEdges(path, candidate, gap); for (auto& e_w_pi : answer) { e_w_pi.pi_ *= correction_coeff; } @@ -324,34 +341,6 @@ } }; -//TODO optimize number of calls of EstimatePathCoverage(path) -//class MetagenomicWeightCounter: public WeightCounter { -// shared_ptr cov_info_provider_; -// shared_ptr normalizing_wc_; -// -//public: -// -// //negative raw_threshold leads to the halt if no sufficiently long edges are in the path -// MetagenomicWeightCounter(const Graph& g, const shared_ptr& lib, -// size_t read_length, double weight_threshold) : -// WeightCounter(g, lib) { -// cov_info_provider_ = make_shared(g, lib, read_length); -// normalizing_wc_ = make_shared(g, lib, -// /*normalize weight*/true, weight_threshold, cov_info_provider_); -// } -// -// double CountWeight(const BidirectionalPath& path, EdgeId e, -// const std::set& excluded_edges, int gap = 0) const override { -// VERIFY(path.Length() > 0); -// return normalizing_wc_->CountWeight(path, e, excluded_edges, gap); -// } -// -// std::set PairInfoExist(const BidirectionalPath& path, EdgeId e, -// int gap = 0) const override { -// return 
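/*
 * Sketch combining the two changes visible above: paired-info distances are
 * corrected by the gap to the candidate edge, and each observed weight is
 * normalized by its ideal expectation and kept only if it passes
 * single_threshold; the final support is kept ideal mass over total ideal
 * mass. Both callbacks are stand-ins, not the SPAdes paired library API.
 */
#include <functional>
#include <vector>

double CandidateSupport(const std::vector<int> &length_at,  // LengthAt(i) per position
                        int gap, double single_threshold,
                        const std::function<double(int)> &ideal_weight,
                        const std::function<double(int)> &observed_weight) {
    double kept_ideal = 0., total_ideal = 0.;
    for (int dist : length_at) {
        double ideal = ideal_weight(dist + gap);            // gap-corrected distance
        if (ideal <= 0.) continue;                          // position not "covered"
        total_ideal += ideal;
        if (observed_weight(dist + gap) / ideal >= single_threshold)
            kept_ideal += ideal;                            // counted as supporting
    }
    return total_ideal > 0. ? kept_ideal / total_ideal : 0.;
}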
normalizing_wc_->PairInfoExist(path, e, gap); -// } -//}; - }; #endif /* WEIGHT_COUNTER_HPP_ */ diff -Nru spades-3.10.1+dfsg/src/common/modules/simplification/bulge_remover.hpp spades-3.11.1+dfsg/src/common/modules/simplification/bulge_remover.hpp --- spades-3.10.1+dfsg/src/common/modules/simplification/bulge_remover.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/simplification/bulge_remover.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -123,12 +123,12 @@ EnsureEndsPositionAligner aligner(CumulativeLength(g_, path), g_.length(edge)); - double prefix_length = 0.; + size_t prefix_length = 0.; vector bulge_prefix_lengths; for (EdgeId e : path) { - prefix_length += (double) g_.length(e); - bulge_prefix_lengths.push_back(aligner.GetPosition((size_t) prefix_length)); + prefix_length += g_.length(e); + bulge_prefix_lengths.push_back(aligner.GetPosition(prefix_length)); } EdgeId edge_to_split = edge; @@ -210,6 +210,7 @@ size_t max_delta_; double max_relative_delta_; size_t max_edge_cnt_; + size_t dijkstra_vertex_limit_; static vector EmptyPath() { static vector vec = {}; @@ -229,14 +230,16 @@ public: AlternativesAnalyzer(const Graph& g, double max_coverage, size_t max_length, double max_relative_coverage, size_t max_delta, - double max_relative_delta, size_t max_edge_cnt) : + double max_relative_delta, size_t max_edge_cnt, + size_t dijkstra_vertex_limit) : g_(g), max_coverage_(max_coverage), max_length_(max_length), max_relative_coverage_(max_relative_coverage), max_delta_(max_delta), max_relative_delta_(max_relative_delta), - max_edge_cnt_(max_edge_cnt) { + max_edge_cnt_(max_edge_cnt), + dijkstra_vertex_limit_(dijkstra_vertex_limit) { DEBUG("Created alternatives analyzer max_length=" << max_length << " max_coverage=" << max_coverage << " max_relative_coverage=" << max_relative_coverage @@ -261,8 +264,10 @@ VertexId end = g_.EdgeEnd(e); TRACE("End " << g_.str(end)); - ProcessPaths(g_, (g_.length(e) > delta) ? g_.length(e) - delta : 0, - g_.length(e) + delta, start, end, path_chooser, max_edge_cnt_); + size_t max_path_len = g_.length(e) + delta; + PathProcessor processor(g_, start, max_path_len, dijkstra_vertex_limit_); + processor.Process(end, (g_.length(e) > delta) ? 
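/*
 * Sketch of the length window used when searching for bulge alternatives
 * above: an alternative path must differ from the bulge edge length by at
 * most delta, so paths are probed in [len - delta, len + delta], clamped at
 * zero (the search itself is additionally capped by the new Dijkstra vertex
 * limit). Pure arithmetic, no graph code.
 */
#include <cstddef>
#include <utility>

std::pair<std::size_t, std::size_t>
BulgeSearchWindow(std::size_t edge_len, std::size_t delta) {
    std::size_t lo = edge_len > delta ? edge_len - delta : 0;
    return {lo, edge_len + delta};              // min/max admissible path length
}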
g_.length(e) - delta : 0, + max_path_len, path_chooser, max_edge_cnt_); const vector& path = path_chooser.most_covered_path(); if (!path.empty()) { @@ -307,6 +312,21 @@ CoverageUpperBound(g, max_coverage))); } +template +func::TypedPredicate +NecessaryBulgeCondition(const Graph& g, const AlternativesAnalyzer& analyzer) { + return NecessaryBulgeCondition(g, analyzer.max_length(), analyzer.max_coverage()); +} + +template +InterestingFinderPtr +BulgeCandidateFinder(const Graph &g, + const AlternativesAnalyzer &analyzer, + size_t chunk_cnt) { + return std::make_shared>( + omnigraph::NecessaryBulgeCondition(g, analyzer), chunk_cnt); +}; + /** * This class removes simple bulges from given graph with the following algorithm: it iterates through all edges of * the graph and for each edge checks if this edge is likely to be a simple bulge @@ -345,13 +365,13 @@ typedef std::function& path)> BulgeCallbackF; - BulgeRemover(Graph& g, const std::shared_ptr>& interesting_finder, + BulgeRemover(Graph& g, size_t chunk_cnt, const AlternativesAnalyzer& alternatives_analyzer, BulgeCallbackF opt_callback = 0, std::function removal_handler = 0, bool track_changes = true) : base(g, - interesting_finder, + BulgeCandidateFinder(g, alternatives_analyzer, chunk_cnt), /*canonical_only*/true, CoverageComparator(g), track_changes), @@ -369,9 +389,10 @@ template class ParallelBulgeRemover : public PersistentAlgorithmBase { private: + static const size_t SMALL_BUFFER_THR = 1000; typedef typename Graph::EdgeId EdgeId; typedef typename Graph::VertexId VertexId; - typedef std::shared_ptr> CandidateFinderPtr; + typedef InterestingFinderPtr CandidateFinderPtr; typedef SmartSetIterator> SmartEdgeSet; size_t buff_size_; @@ -438,6 +459,12 @@ }; + SmartEdgeSet AsSmartSet(const std::vector &edges) { + SmartEdgeSet smart_set(this->g(), false, CoverageComparator(this->g())); + smart_set.insert(edges.begin(), edges.end()); + return smart_set; + } + bool CheckInteracting(const BulgeInfo& info, const std::unordered_set& involved_edges) const { if (involved_edges.count(info.e)) return true; @@ -462,55 +489,55 @@ } } - //false if time to stop + //returns false if time to stop bool FillEdgeBuffer(vector& buffer, func::TypedPredicate proceed_condition) { VERIFY(buffer.empty()); DEBUG("Filling edge buffer of size " << buff_size_); - perf_counter perf; - double low_cov = 0.; - double cov_diff = 0.; + utils::perf_counter perf; + double max_cov = std::numeric_limits::min(); + bool exhausted = false; + while (!it_.IsEnd() && buffer.size() < buff_size_) { EdgeId e = *it_; TRACE("Current edge " << this->g().str(e)); - if (!proceed_condition(e)) { - TRACE("Stop condition was reached."); - //need to release last element of the iterator to make it replaceable by new elements - it_.ReleaseCurrent(); - return false; - } double cov = this->g().coverage(e); if (buffer.empty()) { - low_cov = cov; - cov_diff = max(buff_cov_diff_, buff_cov_rel_diff_ * low_cov); - } else { - if (math::gr(cov, low_cov + cov_diff)) { - //need to release last element of the iterator to make it replaceable by new elements - it_.ReleaseCurrent(); - return true; - } + max_cov = cov + max(buff_cov_diff_, buff_cov_rel_diff_ * cov); + DEBUG("Coverage interval [" << cov << ", " << max_cov << "]"); + } + + if (!proceed_condition(e)) { + DEBUG("Stop condition was reached."); + exhausted = true; + break; + } + + if (math::gr(cov, max_cov)) { + DEBUG("Coverage exceeded " << cov << " > " << max_cov); + break; } + TRACE("Potential bulge edge"); buffer.push_back(e); ++it_; } + exhausted |= 
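/*
 * Sketch of the coverage window in FillEdgeBuffer above: candidate edges
 * arrive in coverage order, the first one fixes the admissible interval, and
 * filling stops once the buffer is full or an edge's coverage leaves the
 * interval, so edges processed in one parallel round stay comparable.
 */
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

std::vector<int> FillBuffer(const std::vector<std::pair<int, double>> &sorted, // (edge, coverage)
                            std::size_t buff_size,
                            double cov_diff, double cov_rel_diff) {
    std::vector<int> buffer;
    double max_cov = 0.;
    for (const auto &[e, cov] : sorted) {
        if (buffer.empty())
            max_cov = cov + std::max(cov_diff, cov_rel_diff * cov);
        else if (cov > max_cov)
            break;                              // coverage interval exceeded
        buffer.push_back(e);
        if (buffer.size() == buff_size)
            break;                              // buffer full
    }
    return buffer;
}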
it_.IsEnd(); + it_.ReleaseCurrent(); + DEBUG("Filled in " << perf.time() << " seconds"); - if (buffer.size() == buff_size_) { - TRACE("Buffer filled"); - return true; - } else { - TRACE("No more edges in iterator"); - return false; - } + DEBUG("Candidate queue exhausted " << exhausted); + return !exhausted; } std::vector> FindBulges(const std::vector& edge_buffer) const { - DEBUG("Looking for bulges (in parallel). Edge buffer size " << edge_buffer.size()); - perf_counter perf; + DEBUG("Looking for bulges in parallel"); + utils::perf_counter perf; std::vector> bulge_buffers(omp_get_max_threads()); - size_t n = edge_buffer.size(); + const size_t n = edge_buffer.size(); //order is in agreement with coverage + DEBUG("Edge buffer size " << n); #pragma omp parallel for schedule(guided) for (size_t i = 0; i < n; ++i) { EdgeId e = edge_buffer[i]; @@ -519,13 +546,13 @@ bulge_buffers[omp_get_thread_num()].push_back(BulgeInfo(i, e, std::move(alternative))); } } - DEBUG("Bulges found in " << perf.time() << " seconds"); + DEBUG("Bulges found (in parallel) in " << perf.time() << " seconds"); return bulge_buffers; } std::vector MergeBuffers(std::vector>&& buffers) const { DEBUG("Merging bulge buffers"); - perf_counter perf; + utils::perf_counter perf; std::vector merged_bulges; for (auto& bulge_buffer : buffers) { @@ -545,7 +572,7 @@ SmartEdgeSet RetainIndependentBulges(std::vector& bulges) const { DEBUG("Looking for independent bulges"); size_t total_cnt = bulges.size(); - perf_counter perf; + utils::perf_counter perf; std::vector filtered; filtered.reserve(bulges.size()); @@ -574,9 +601,24 @@ return interacting_edges; } - size_t ProcessBulges(const std::vector& independent_bulges, SmartEdgeSet&& interacting_edges) { + size_t BasicProcessBulges(SmartEdgeSet& edges) { + size_t triggered = 0; + //usual br strategy + for (; !edges.IsEnd(); ++edges) { + EdgeId e = *edges; + TRACE("Processing edge " << this->g().str(e)); + std::vector alternative = alternatives_analyzer_(e); + if (!alternative.empty()) { + gluer_(e, alternative); + triggered++; + } + } + return triggered; + } + + size_t ProcessBulges(const std::vector& independent_bulges, SmartEdgeSet& interacting_edges) { DEBUG("Processing bulges"); - perf_counter perf; + utils::perf_counter perf; size_t triggered = 0; @@ -590,16 +632,7 @@ perf.reset(); DEBUG("Processing remaining interacting bulges " << interacting_edges.size()); - //usual br strategy - for (; !interacting_edges.IsEnd(); ++interacting_edges) { - EdgeId e = *interacting_edges; - TRACE("Processing edge " << this->g().str(e)); - std::vector alternative = alternatives_analyzer_(e); - if (!alternative.empty()) { - gluer_(e, alternative); - triggered++; - } - } + triggered += BasicProcessBulges(interacting_edges); DEBUG("Interacting edges processed in " << perf.time() << " seconds"); return triggered; } @@ -608,22 +641,29 @@ typedef std::function& path)> BulgeCallbackF; - ParallelBulgeRemover(Graph& g, const CandidateFinderPtr& interesting_edge_finder, - size_t buff_size, double buff_cov_diff, - double buff_cov_rel_diff, const AlternativesAnalyzer& alternatives_analyzer, + ParallelBulgeRemover(Graph& g, + size_t chunk_cnt, + size_t buff_size, + double buff_cov_diff, + double buff_cov_rel_diff, + const AlternativesAnalyzer& alternatives_analyzer, BulgeCallbackF opt_callback = 0, std::function removal_handler = 0, bool track_changes = true) : + PersistentAlgorithmBase(g), buff_size_(buff_size), buff_cov_diff_(buff_cov_diff), buff_cov_rel_diff_(buff_cov_rel_diff), 
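/*
 * Sketch of the per-thread buffering behind FindBulges/MergeBuffers above:
 * candidates are analyzed in parallel, hits land in one vector per OpenMP
 * thread, and a sequential merge restores a single list. analyze is a
 * hypothetical predicate marking real bulges.
 */
#include <omp.h>
#include <vector>

template <class Analyze>
std::vector<int> FindCandidates(const std::vector<int> &edges, Analyze analyze) {
    std::vector<std::vector<int>> per_thread(omp_get_max_threads());
#pragma omp parallel for schedule(guided)
    for (long i = 0; i < (long) edges.size(); ++i)  // signed index for OpenMP
        if (analyze(edges[i]))
            per_thread[omp_get_thread_num()].push_back(edges[i]);

    std::vector<int> merged;
    for (auto &buf : per_thread)
        merged.insert(merged.end(), buf.begin(), buf.end());
    return merged;                              // thread interleaving may vary
}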
alternatives_analyzer_(alternatives_analyzer), gluer_(g, opt_callback, removal_handler), - interesting_edge_finder_(interesting_edge_finder), + interesting_edge_finder_(BulgeCandidateFinder(g, alternatives_analyzer, chunk_cnt)), tracking_(track_changes), curr_iteration_(0), - it_(g, true, CoverageComparator(g), true) { + it_(g, /*add new*/true, + CoverageComparator(g), + /*canonical only*/true, + NecessaryBulgeCondition(g, alternatives_analyzer)) { VERIFY(buff_size_ > 0); it_.Detach(); } @@ -639,32 +679,46 @@ } if (primary_launch) { it_.clear(); - TRACE("Primary launch."); - TRACE("Start search for interesting edges"); + DEBUG("Primary launch."); + DEBUG("Start search for interesting edges"); interesting_edge_finder_->Run(this->g(), [&](EdgeId e) {it_.push(e);}); - TRACE(it_.size() << " interesting edges to process"); + DEBUG(it_.size() << " interesting edges to process"); } else { VERIFY(tracking_); - TRACE(it_.size() << " edges to process"); + DEBUG(it_.size() << " edges to process"); } size_t triggered = 0; bool proceed = true; while (proceed) { std::vector edge_buffer; + DEBUG("Filling edge buffer"); edge_buffer.reserve(buff_size_); proceed = FillEdgeBuffer(edge_buffer, proceed_condition); + DEBUG("Edge buffer filled"); - std::vector bulges = MergeBuffers(FindBulges(edge_buffer)); - - auto interacting_edges = RetainIndependentBulges(bulges); + DEBUG("Edge buffer size " << edge_buffer.size()); + size_t inner_triggered = 0; + //FIXME magic constant + if (edge_buffer.size() < SMALL_BUFFER_THR) { + DEBUG("Processing small buffer"); + utils::perf_counter perf; + //TODO implement via moves? + auto edges = AsSmartSet(edge_buffer); + inner_triggered = BasicProcessBulges(edges); + DEBUG("Small buffer processed in " << perf.time() << " seconds"); + } else { + std::vector bulges = MergeBuffers(FindBulges(edge_buffer)); + auto interacting_edges = RetainIndependentBulges(bulges); + inner_triggered = ProcessBulges(bulges, interacting_edges); + } - size_t inner_triggered = ProcessBulges(bulges, std::move(interacting_edges)); proceed |= (inner_triggered > 0); triggered += inner_triggered; + DEBUG("Buffer processed"); } - TRACE("Finished processing. Triggered = " << triggered); + DEBUG("Finished processing. 
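/*
 * Sketch of the SMALL_BUFFER_THR dispatch in Run() above: the parallel
 * find/merge/retain pipeline only pays off for large batches, so small
 * buffers take the plain sequential route. Both callables are hypothetical
 * stand-ins returning the number of bulges removed.
 */
#include <cstddef>
#include <vector>

template <class Sequential, class Parallel>
std::size_t ProcessBatch(const std::vector<int> &buffer,
                         Sequential sequential, Parallel parallel,
                         std::size_t small_thr = 1000) {
    return buffer.size() < small_thr ? sequential(buffer) : parallel(buffer);
}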
Triggered = " << triggered); if (!tracking_) it_.Detach(); diff -Nru spades-3.10.1+dfsg/src/common/modules/simplification/cleaner.hpp spades-3.11.1+dfsg/src/common/modules/simplification/cleaner.hpp --- spades-3.10.1+dfsg/src/common/modules/simplification/cleaner.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/simplification/cleaner.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -36,4 +36,10 @@ } }; +template +size_t CleanIsolatedVertices(Graph &g, size_t chunk_cnt = 1) { + Cleaner cleaner(g, chunk_cnt); + return cleaner.Run(); +} + } diff -Nru spades-3.10.1+dfsg/src/common/modules/simplification/complex_bulge_remover.hpp spades-3.11.1+dfsg/src/common/modules/simplification/complex_bulge_remover.hpp --- spades-3.10.1+dfsg/src/common/modules/simplification/complex_bulge_remover.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/simplification/complex_bulge_remover.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -10,7 +10,7 @@ #include #include #include -#include "common/adt/concurrent_dsu.hpp" +#include "adt/concurrent_dsu.hpp" #include "utils/standard_base.hpp" #include "assembly_graph/components/graph_component.hpp" #include "math/xmath.h" @@ -114,7 +114,7 @@ // } bool CheckCompleteness() const { - for (VertexId v : key_set(vertex_depth_)) { + for (VertexId v : utils::key_set(vertex_depth_)) { if (v == start_vertex_) continue; if (!AllEdgeIn(v) && !AllEdgeOut(v)) @@ -125,7 +125,7 @@ bool NeedsProjection() const { DEBUG("Checking if component needs projection"); - for (VertexId v : key_set(vertex_depth_)) { + for (VertexId v : utils::key_set(vertex_depth_)) { if (v == start_vertex_) continue; vector filtered_incoming; @@ -161,7 +161,7 @@ set avg_distances() const { set distances; - for (VertexId v : key_set(vertex_depth_)) { + for (VertexId v : utils::key_set(vertex_depth_)) { distances.insert(avg_distance(v)); } return distances; @@ -194,12 +194,12 @@ } GraphComponent AsGraphComponent() const { - return GraphComponent::FromVertices(g_, key_set(vertex_depth_)); + return GraphComponent::FromVertices(g_, utils::key_set(vertex_depth_)); } bool ContainsConjugateVertices() const { set conjugate_vertices; - for (VertexId v : key_set(vertex_depth_)) { + for (VertexId v : utils::key_set(vertex_depth_)) { if (conjugate_vertices.count(v) == 0) { conjugate_vertices.insert(g_.conjugate(v)); } else { @@ -256,7 +256,7 @@ "Inserting vertex " << g_.str(new_vertex) << " to component during split"); vertex_depth_.insert(make_pair(new_vertex, new_vertex_depth)); height_2_vertices_.insert( - make_pair(Average(new_vertex_depth), new_vertex)); + std::make_pair(Average(new_vertex_depth), new_vertex)); } } @@ -308,25 +308,25 @@ return vertices_.count(v) > 0; } - virtual void HandleDelete(VertexId v) { + void HandleDelete(VertexId v) override { //verify v not in the tree VERIFY(!Contains(v)); } - virtual void HandleDelete(EdgeId e) { + void HandleDelete(EdgeId e) override { //verify e not in the tree DEBUG("Trying to delete " << br_comp_.g().str(e)); VERIFY(!Contains(e)); } - virtual void HandleMerge(const vector& old_edges, EdgeId /*new_edge*/) { + void HandleMerge(const vector& old_edges, EdgeId /*new_edge*/) override { //verify false for (EdgeId e : old_edges) { VERIFY(!Contains(e)); } } - virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) { + void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) override { // verify edge2 in tree // put new_edge instead of edge2 DEBUG("Glueing " << br_comp_.g().str(new_edge) << " " << 
br_comp_.g().str(edge1) << " " << br_comp_.g().str(edge2)); @@ -338,8 +338,8 @@ } } - virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1, - EdgeId new_edge_2) { + void HandleSplit(EdgeId old_edge, EdgeId new_edge_1, + EdgeId new_edge_2) override { VERIFY(old_edge != br_comp_.g().conjugate(old_edge)); if (Contains(old_edge)) { edges_.erase(old_edge); @@ -499,7 +499,7 @@ typedef typename Graph::EdgeId EdgeId; typedef typename Graph::VertexId VertexId; - typedef ConcurrentDSU color_partition_ds_t; + typedef dsu::ConcurrentDSU color_partition_ds_t; const LocalizedComponent& component_; const ComponentColoring& coloring_; @@ -554,7 +554,7 @@ vector answer; for (VertexId v : vertices) { if (component_.end_vertices().count(v) == 0) { - push_back_all(answer, GoodOutgoingEdges(v)); + utils::push_back_all(answer, GoodOutgoingEdges(v)); } } return answer; @@ -627,10 +627,10 @@ } } size_t coverage = 0; - for (size_t cov : value_set(best_subtrees_coverage)) { + for (size_t cov : utils::value_set(best_subtrees_coverage)) { coverage += cov; } - next_edges_[v] = SetAsVector(value_set(best_alternatives)); + next_edges_[v] = SetAsVector(utils::value_set(best_alternatives)); subtree_coverage_[v] = coverage; } @@ -677,7 +677,7 @@ VERIFY(!level_vertices.empty()); //looking for good edges - insert_all(good_edges_, + utils::insert_all(good_edges_, GoodOutgoingEdges( vector(level_vertices.begin(), level_vertices.end()))); @@ -751,7 +751,7 @@ bool SplitComponent() { DEBUG("Splitting component"); set level_heights(component_.avg_distances()); - DEBUG("Level heights " << ToString(level_heights)); + DEBUG("Level heights " << utils::ContainerToString(level_heights)); GraphComponent gc = component_.AsGraphComponent(); @@ -763,7 +763,7 @@ DEBUG("Processing edge " << g_.str(*it) << " avg_start " << start_dist << " avg_end " << end_dist); set dist_to_split(level_heights.lower_bound(start_dist), level_heights.upper_bound(end_dist)); - DEBUG("Distances to split " << ToString(dist_to_split)); + DEBUG("Distances to split " << utils::ContainerToString(dist_to_split)); size_t offset = start_dist; EdgeId e = *it; @@ -1108,8 +1108,8 @@ if (!pics_folder_.empty()) { PrintComponent(component, tree, pics_folder_ + "success/" - + ToString(this->g().int_id(component.start_vertex())) - + "_" + ToString(candidate_cnt) + ".dot"); + + std::to_string(this->g().int_id(component.start_vertex())) + + "_" + std::to_string(candidate_cnt) + ".dot"); } ComponentProjector projector(this->g(), component, coloring, tree); @@ -1126,7 +1126,7 @@ //todo check if we rewrite all of the previous pics! 
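/*
 * Sketch of what the utils::key_set / utils::value_set calls above presumably
 * do: project an associative container onto its keys or values (the old
 * unqualified helpers moved into namespace utils). Minimal generic versions;
 * the real signatures may differ.
 */
#include <map>
#include <set>

template <class K, class V>
std::set<K> toy_key_set(const std::map<K, V> &m) {
    std::set<K> keys;
    for (const auto &kv : m) keys.insert(kv.first);
    return keys;
}

template <class K, class V>
std::set<V> toy_value_set(const std::map<K, V> &m) {
    std::set<V> values;
    for (const auto &kv : m) values.insert(kv.second);
    return values;
}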
PrintComponent(component, pics_folder_ + "fail/" - + ToString(this->g().int_id(component.start_vertex())) //+ "_" + ToString(candidate_cnt) + + std::to_string(this->g().int_id(component.start_vertex())) //+ "_" + std::to_string(candidate_cnt) + ".dot"); } return false; diff -Nru spades-3.10.1+dfsg/src/common/modules/simplification/complex_tip_clipper.hpp spades-3.11.1+dfsg/src/common/modules/simplification/complex_tip_clipper.hpp --- spades-3.10.1+dfsg/src/common/modules/simplification/complex_tip_clipper.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/simplification/complex_tip_clipper.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -160,7 +160,7 @@ if (!pics_folder_.empty()) { visualization::visualization_utils::WriteComponentSinksSources(component, pics_folder_ - + ToString(this->g().int_id(v)) //+ "_" + ToString(candidate_cnt) + + std::to_string(this->g().int_id(v)) //+ "_" + std::to_string(candidate_cnt) + ".dot"); } diff -Nru spades-3.10.1+dfsg/src/common/modules/simplification/compressor.hpp spades-3.11.1+dfsg/src/common/modules/simplification/compressor.hpp --- spades-3.10.1+dfsg/src/common/modules/simplification/compressor.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/simplification/compressor.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -118,7 +118,7 @@ * Method compresses all vertices which can be compressed. */ template -bool CompressAllVertices(Graph &g, bool safe_merging = true, size_t chunk_cnt = 1) { +size_t CompressAllVertices(Graph &g, size_t chunk_cnt = 1, bool safe_merging = true) { CompressingProcessor compressor(g, chunk_cnt, safe_merging); return compressor.Run(); } diff -Nru spades-3.10.1+dfsg/src/common/modules/simplification/dominated_set_finder.hpp spades-3.11.1+dfsg/src/common/modules/simplification/dominated_set_finder.hpp --- spades-3.10.1+dfsg/src/common/modules/simplification/dominated_set_finder.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/simplification/dominated_set_finder.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -15,10 +15,10 @@ std::map dominated_; bool CheckCanBeProcessed(VertexId v) const { - DEBUG( "Check if vertex " << g_.str(v) << " is dominated close neighbour"); + DEBUG("Check if vertex " << g_.str(v) << " is dominated close neighbour"); for (EdgeId e : g_.IncomingEdges(v)) { if (dominated_.count(g_.EdgeStart(e)) == 0) { - DEBUG( "Blocked by external vertex " << g_.int_id(g_.EdgeStart(e)) << " that starts edge " << g_.int_id(e)); + DEBUG("Blocked by external vertex " << g_.int_id(g_.EdgeStart(e)) << " that starts edge " << g_.int_id(e)); DEBUG("Check fail"); return false; } @@ -31,7 +31,7 @@ std::queue& can_be_processed) const { DEBUG("Updating can be processed"); for (EdgeId e : g_.OutgoingEdges(v)) { - DEBUG("Considering edge " << ToString(e)); + DEBUG("Considering edge " << g_.str(e)); VertexId neighbour_v = g_.EdgeEnd(e); if (CheckCanBeProcessed(neighbour_v)) { can_be_processed.push(neighbour_v); @@ -115,13 +115,13 @@ } GraphComponent AsGraphComponent() const { - return GraphComponent::FromVertices(g_, key_set(dominated_)); + return GraphComponent::FromVertices(g_, utils::key_set(dominated_)); } //little meaning if FillDominated returned false const map CountBorder() const { map border; - for (VertexId v : key_set(border)) { + for (VertexId v : utils::key_set(border)) { for (EdgeId e : g_.OutgoingEdges(v)) { VertexId e_end = g_.EdgeEnd(e); if (dominated_.count(e_end) == 0) { diff -Nru 
spades-3.10.1+dfsg/src/common/modules/simplification/ec_threshold_finder.hpp spades-3.11.1+dfsg/src/common/modules/simplification/ec_threshold_finder.hpp --- spades-3.10.1+dfsg/src/common/modules/simplification/ec_threshold_finder.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/simplification/ec_threshold_finder.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -8,9 +8,9 @@ #ifndef OMNI_TOOLS_HPP_ #define OMNI_TOOLS_HPP_ -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" -#include "utils/path_helper.hpp" +#include "utils/filesystem/path_helper.hpp" #include "assembly_graph/graph_support/basic_edge_conditions.hpp" #include "assembly_graph/graph_support/parallel_processing.hpp" #include "assembly_graph/graph_support/basic_vertex_conditions.hpp" @@ -39,9 +39,9 @@ return false; std::vector v1; - push_back_all(v1, graph_.OutgoingEdges(graph_.EdgeStart(e))); + utils::push_back_all(v1, graph_.OutgoingEdges(graph_.EdgeStart(e))); std::vector v2; - push_back_all(v2, graph_.IncomingEdges(graph_.EdgeEnd(e))); + utils::push_back_all(v2, graph_.IncomingEdges(graph_.EdgeEnd(e))); bool eq = (v1.size() == 2 && v2.size() == 2) && ((v1[0] == v2[0] && v1[1] == v2[1]) || (v1[0] == v2[1] && v1[0] == v2[1])); return !eq; } diff -Nru spades-3.10.1+dfsg/src/common/modules/simplification/erroneous_connection_remover.hpp spades-3.11.1+dfsg/src/common/modules/simplification/erroneous_connection_remover.hpp --- spades-3.10.1+dfsg/src/common/modules/simplification/erroneous_connection_remover.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/simplification/erroneous_connection_remover.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -130,7 +130,7 @@ VertexId end = g.EdgeEnd(e); TRACE("End " << g.str(end)); - ProcessPaths(g, 0, std::numeric_limits::max(), start, end, path_chooser, std::numeric_limits::max()); + ProcessPaths(g, 0, std::numeric_limits::max(), start, end, path_chooser); const vector& path = path_chooser.most_covered_path(); double path_coverage = path_chooser.max_coverage(); @@ -156,7 +156,7 @@ VertexId end = g.EdgeEnd(forbidden_edge); TRACE("End " << g.str(end)); - ProcessPaths(g, 0, std::numeric_limits::max(), start, end, path_chooser, std::numeric_limits::max()); + ProcessPaths(g, 0, std::numeric_limits::max(), start, end, path_chooser); const vector& path = path_chooser.most_covered_path(); double path_coverage = path_chooser.max_coverage(); @@ -182,8 +182,8 @@ else{ VertexId start = g.EdgeStart(e), end = g.EdgeEnd(e); vector incident; - push_back_all(incident, g.IncomingEdges(end)); - push_back_all(incident, g.OutgoingEdges(start)); + utils::push_back_all(incident, g.IncomingEdges(end)); + utils::push_back_all(incident, g.OutgoingEdges(start)); for (auto it = incident.begin(); it != incident.end(); ++it){ res = IsAlternativeInclusivePathExist(g, *it, e); if(res){ @@ -260,7 +260,7 @@ TopologicalThornCondition(Graph& g, size_t max_jump_dist, - size_t max_edge_cnt = -1ul) + size_t max_edge_cnt = std::numeric_limits::max()) : base(g), max_jump_distance_(max_jump_dist), max_edge_cnt_(max_edge_cnt) { @@ -490,7 +490,7 @@ return false; } vector edges; - push_back_all(edges, this->g().OutgoingEdges(v)); + utils::push_back_all(edges, this->g().OutgoingEdges(v)); VERIFY(edges.size() == 2); if (this->g().conjugate(edges[0]) != edges[1]) { return false; @@ -554,7 +554,7 @@ omnigraph::MultiplicityCounter mult_counter(this->g(), uniqueness_length_, 8); vector edges; - push_back_all(edges, this->g().OutgoingEdges(this->g().EdgeEnd(e))); + 
utils::push_back_all(edges, this->g().OutgoingEdges(this->g().EdgeEnd(e))); VERIFY(edges.size() == 2); return (this->g().conjugate(edges[0]) == edges[1] && mult_counter.count(e, this->g().EdgeStart(e)) <= 1) || this->g().length(e) >= uniqueness_length_; diff -Nru spades-3.10.1+dfsg/src/common/modules/simplification/parallel_simplification_algorithms.hpp spades-3.11.1+dfsg/src/common/modules/simplification/parallel_simplification_algorithms.hpp --- spades-3.10.1+dfsg/src/common/modules/simplification/parallel_simplification_algorithms.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/simplification/parallel_simplification_algorithms.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -340,22 +340,7 @@ bool ShouldFilterConjugate() const { return true; } -// bool operator()(EdgeId e) { -// if (ec_condition_->Check(e)) { -// edges_to_remove_.push_back(e); -// } -// return false; -// } -// -// void RemoveCollectedEdges() { -// omnigraph::SmartSetIterator to_delete(g_, edges_to_remove_.begin(), edges_to_remove_.end()); -// while (!to_delete.IsEnd()) { -// EdgeId e = *to_delete; -// handler_f_(e); -// g_.DeleteEdge(e); -// ++to_delete; -// } -// } + private: DECL_LOGGER("ParallelLowCoverageFunctor"); }; @@ -727,70 +712,6 @@ ; }; -template -class SemiParallelAlgorithmRunner { - typedef typename Graph::VertexId VertexId; - typedef typename Graph::EdgeId EdgeId; - - const Graph& g_; - -public: - - const Graph& g() const { - return g_; - } - - SemiParallelAlgorithmRunner(Graph& g) - : g_(g) { - - } - - template> - bool RunFromChunkIterators(Algo& algo, const ItVec& chunk_iterators, - const Comparator& comp = Comparator()) { - VERIFY(chunk_iterators.size() > 1); - omnigraph::SmartSetIterator it(g_, false, comp); - - omnigraph::FindInterestingFromChunkIterators(chunk_iterators, - [&](ElementType el) {return algo.IsOfInterest(el);}, - [&](ElementType el) {it.push(el);}); - - bool changed = false; - for (; !it.IsEnd(); ++it) { - changed |= algo.Process(*it); - } - return changed; - } - -private: - DECL_LOGGER("SemiParallelAlgorithmRunner"); -}; - -template -class SemiParallelEdgeRemovingAlgorithm { - typedef typename Graph::EdgeId EdgeId; - typedef typename Graph::VertexId VertexId; - Graph& g_; - func::TypedPredicate condition_; - omnigraph::EdgeRemover edge_remover_; - -public: - SemiParallelEdgeRemovingAlgorithm(Graph& g, - func::TypedPredicate condition, - std::function removal_handler = 0) : - g_(g), condition_(condition), edge_remover_(g, removal_handler) { - } - - bool IsOfInterest(EdgeId e) const { - return condition_(e); - } - - bool Process(EdgeId e) { - edge_remover_.DeleteEdge(e); - return true; - } -}; - template bool RunVertexAlgorithm(Graph& g, AlgoRunner& runner, Algo& algo, size_t chunk_cnt) { return runner.RunFromChunkIterators(algo, omnigraph::IterationHelper(g).Chunks(chunk_cnt)); @@ -801,6 +722,7 @@ return runner.RunFromChunkIterators(algo, omnigraph::IterationHelper(g).Chunks(chunk_cnt)); } +//Deprecated template void ParallelCompress(Graph &g, size_t chunk_cnt, bool loop_post_compression = true) { INFO("Parallel compression"); @@ -817,6 +739,7 @@ } } +//Deprecated template bool ParallelClipTips(Graph &g, size_t max_length, @@ -839,6 +762,7 @@ return true; } +//TODO review if can be useful... 
AFAIK never actually worked //template //bool ParallelRemoveBulges(Graph &g, // const config::debruijn_config::simplification::bulge_remover &br_config, @@ -867,6 +791,8 @@ // return true; //} +//TODO looks obsolete +//Deprecated template bool ParallelEC(Graph &g, size_t max_length, diff -Nru spades-3.10.1+dfsg/src/common/modules/simplification/relative_coverage_remover.hpp spades-3.11.1+dfsg/src/common/modules/simplification/relative_coverage_remover.hpp --- spades-3.10.1+dfsg/src/common/modules/simplification/relative_coverage_remover.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/simplification/relative_coverage_remover.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -253,7 +253,7 @@ DEBUG("Max local coverage incoming - " << rel_helper_.MaxLocalCoverage(this->g().IncomingEdges(v), v)); DEBUG("Max local coverage outgoing - " << rel_helper_.MaxLocalCoverage(this->g().OutgoingEdges(v), v)); return rel_helper_.AnyHighlyCoveredOnBothSides(v, coverage_edge_around_v) && - HighCoverageComponentFinder(this->g(), this->g().coverage(e) * diff_mult_) + HighCoverageComponentFinder(this->g(), this->g().coverage(e) * diff_mult_, min_neighbourhood_size_) .EdgeSummaryLength(v) >= min_neighbourhood_size_; } @@ -332,7 +332,7 @@ if (cycle_detected_) return -1u; VERIFY(max_distance_.count(v) > 0); - answer = std::max(answer, get(max_distance_, v)); + answer = std::max(answer, utils::get(max_distance_, v)); } VERIFY(answer >= 0); if (answer == 0) @@ -546,7 +546,7 @@ vertices.insert(g_.EdgeEnd(e)); } - auto filename = success ? vis_dir_ + "/success/" + ToString(succ_cnt_++) : vis_dir_ + "/fail/" + ToString(fail_cnt_++); + auto filename = success ? vis_dir_ + "/success/" + std::to_string(succ_cnt_++) : vis_dir_ + "/fail/" + std::to_string(fail_cnt_++); visualization::visualization_utils::WriteComponent( ComponentCloser(g_, 0).CloseComponent( GraphComponent::FromVertices(g_, vertices)), @@ -578,9 +578,9 @@ VERIFY(tip_allowing_length_bound >= length_bound); TRACE("Coverage gap " << min_coverage_gap); if (!vis_dir_.empty()) { - path::make_dirs(vis_dir_); - path::make_dirs(vis_dir_ + "/success/"); - path::make_dirs(vis_dir_ + "/fail/"); + fs::make_dirs(vis_dir_); + fs::make_dirs(vis_dir_ + "/success/"); + fs::make_dirs(vis_dir_ + "/fail/"); } } @@ -662,7 +662,7 @@ max_coverage, vertex_count_limit, vis_dir), component_remover_(g, handler_function) { this->interest_el_finder_ = std::make_shared>( - [&](EdgeId e) { return finder_(e); }, chunk_cnt); + [&](EdgeId e) { return static_cast(finder_(e)); }, chunk_cnt); } protected: diff -Nru spades-3.10.1+dfsg/src/common/modules/simplification/tip_clipper.hpp spades-3.11.1+dfsg/src/common/modules/simplification/tip_clipper.hpp --- spades-3.10.1+dfsg/src/common/modules/simplification/tip_clipper.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/modules/simplification/tip_clipper.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -147,22 +147,19 @@ typedef typename Graph::VertexId VertexId; typedef EdgeCondition base; const double max_AT_percentage_; - const size_t max_tip_length_; const bool check_tip_ ; public: - ATCondition(const Graph& g, double max_AT_percentage, size_t max_tip_length, bool check_tip) : - base(g), max_AT_percentage_(max_AT_percentage), max_tip_length_(max_tip_length), check_tip_(check_tip) { + ATCondition(const Graph& g, double max_AT_percentage, bool check_tip) : + base(g), max_AT_percentage_(max_AT_percentage), check_tip_(check_tip) { DEBUG("check_tip: " << check_tip_); } bool Check(EdgeId e) const { //+1 
is a trick to deal with edges of 0 coverage from iterative run + //FIXME where is the trick? size_t start = 0; - //TODO: Do we need this check? - if(this->g().length(e) > max_tip_length_) - return false; size_t end = this->g().length(e) + this->g().k(); if (check_tip_) { if (this->g().OutgoingEdgeCount(this->g().EdgeEnd(e)) == 0) @@ -175,13 +172,12 @@ const Sequence &s_edge = this->g().EdgeNucls(e); for (size_t position = start; position < end; position ++) { - counts[s_edge[position]] ++; + counts[s_edge[position]]++; } size_t curm = *std::max_element(counts.begin(), counts.end()); - if (curm > max_AT_percentage_ * double(end - start)) { + if (math::gr(double(curm), max_AT_percentage_ * double(end - start))) { DEBUG("deleting edge" << s_edge.str());; DEBUG("curm: " << curm); - DEBUG("start end cutoff" << start << " " << end << " " << max_AT_percentage_ * double(this->g().length(e))); return true; diff -Nru spades-3.10.1+dfsg/src/common/paired_info/CMakeLists.txt spades-3.11.1+dfsg/src/common/paired_info/CMakeLists.txt --- spades-3.10.1+dfsg/src/common/paired_info/CMakeLists.txt 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/paired_info/CMakeLists.txt 2017-09-28 09:05:10.000000000 +0000 @@ -0,0 +1,11 @@ +############################################################################ +# Copyright (c) 2015 Saint Petersburg State University +# Copyright (c) 2011-2014 Saint Petersburg Academic University +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +project(paired_info CXX) + +add_library(paired_info STATIC + distance_estimation.cpp weighted_distance_estimation.cpp smoothing_distance_estimation.cpp) diff -Nru spades-3.10.1+dfsg/src/common/paired_info/distance_estimation.cpp spades-3.11.1+dfsg/src/common/paired_info/distance_estimation.cpp --- spades-3.10.1+dfsg/src/common/paired_info/distance_estimation.cpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/paired_info/distance_estimation.cpp 2017-09-28 09:05:10.000000000 +0000 @@ -0,0 +1,176 @@ +#include "distance_estimation.hpp" + +namespace omnigraph { +namespace de { + +using namespace debruijn_graph; + +std::vector GraphDistanceFinder::GetGraphDistancesLengths(EdgeId e1, EdgeId e2) const { + LengthMap m; + m.insert({e2, {}}); + + FillGraphDistancesLengths(e1, m); + + return m[e2]; +} + +void GraphDistanceFinder::FillGraphDistancesLengths(EdgeId e1, LengthMap &second_edges) const { + vector path_lower_bounds; + size_t path_upper_bound = PairInfoPathLengthUpperBound(graph_.k(), insert_size_, delta_); + PathProcessor paths_proc(graph_, graph_.EdgeEnd(e1), path_upper_bound); + + for (auto &entry : second_edges) { + EdgeId e2 = entry.first; + size_t path_lower_bound = PairInfoPathLengthLowerBound(graph_.k(), graph_.length(e1), + graph_.length(e2), gap_, delta_); + + TRACE("Bounds for paths are " << path_lower_bound << " " << path_upper_bound); + + DistancesLengthsCallback callback(graph_); + paths_proc.Process(graph_.EdgeStart(e2), path_lower_bound, path_upper_bound, callback); + GraphLengths lengths = callback.distances(); + for (size_t j = 0; j < lengths.size(); ++j) { + lengths[j] += graph_.length(e1); + TRACE("Resulting distance set for " << + " edge " << graph_.int_id(e2) << + " #" << j << " length " << lengths[j]); + } + + if (e1 == e2) + lengths.push_back(0); + + std::sort(lengths.begin(), lengths.end()); + entry.second = lengths; + } +} + +void AbstractDistanceEstimator::FillGraphDistancesLengths(EdgeId e1, 
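/*
 * Sketch of the low-complexity test in ATCondition::Check above: nucleotide
 * occurrences are counted over the edge sequence and the edge is flagged when
 * a single letter exceeds the allowed fraction (the math::gr call makes this
 * a strict floating-point comparison). Plain std::string instead of Sequence,
 * with the tip/flank handling omitted.
 */
#include <algorithm>
#include <array>
#include <cstddef>
#include <string>

bool MostlyOneNucleotide(const std::string &seq, double max_fraction) {
    std::array<std::size_t, 4> counts{};        // A, C, G, T
    for (char c : seq) {
        switch (c) {
            case 'A': ++counts[0]; break;
            case 'C': ++counts[1]; break;
            case 'G': ++counts[2]; break;
            case 'T': ++counts[3]; break;
        }
    }
    std::size_t top = *std::max_element(counts.begin(), counts.end());
    return double(top) > max_fraction * double(seq.size());
}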
LengthMap &second_edges) const { + distance_finder_.FillGraphDistancesLengths(e1, second_edges); +} + +AbstractDistanceEstimator::OutHistogram AbstractDistanceEstimator::ClusterResult(EdgePair, + const EstimHist &estimated) const { + OutHistogram result; + for (size_t i = 0; i < estimated.size(); ++i) { + size_t left = i; + DEWeight weight = DEWeight(estimated[i].second); + while (i + 1 < estimated.size() && + (estimated[i + 1].first - estimated[i].first) <= (int) linkage_distance_) { + ++i; + weight += estimated[i].second; + } + DEDistance center = DEDistance((estimated[left].first + estimated[i].first) * 0.5); + DEVariance var = DEVariance((estimated[i].first - estimated[left].first) * 0.5); + result.insert(Point(center, weight, var)); + } + return result; +} + +void AbstractDistanceEstimator::AddToResult(const OutHistogram &clustered, EdgePair ep, + PairedInfoBuffer &result) const { + result.AddMany(ep.first, ep.second, clustered); +} + +void DistanceEstimator::Estimate(PairedInfoIndexT &result, size_t nthreads) const { + this->Init(); + const auto &index = this->index(); + + DEBUG("Collecting edge infos"); + std::vector edges; + for (auto it = this->graph().ConstEdgeBegin(); !it.IsEnd(); ++it) + edges.push_back(*it); + + DEBUG("Processing"); + PairedInfoBuffersT buffer(this->graph(), nthreads); +# pragma omp parallel for num_threads(nthreads) schedule(guided, 10) + for (size_t i = 0; i < edges.size(); ++i) { + EdgeId edge = edges[i]; + ProcessEdge(edge, index, buffer[omp_get_thread_num()]); + } + + for (size_t i = 0; i < nthreads; ++i) { + result.Merge(buffer[i]); + buffer[i].clear(); + } +} + +DistanceEstimator::EstimHist DistanceEstimator::EstimateEdgePairDistances(EdgePair ep, const InHistogram &histogram, + const GraphLengths &raw_forward) const { + using std::abs; + using namespace math; + EdgeId e1 = ep.first, e2 = ep.second; + size_t first_len = this->graph().length(e1), second_len = this->graph().length(e2); + int minD = rounded_d(histogram.min()), maxD = rounded_d(histogram.max()); + + TRACE("Bounds are " << minD << " " << maxD); + EstimHist result; + vector forward; + forward.reserve(raw_forward.size()); + for (auto raw_length : raw_forward) { + int length = int(raw_length); + if (minD - int(max_distance_) <= length && length <= maxD + int(max_distance_)) + forward.push_back(DEDistance(length)); + } + if (forward.size() == 0) + return result; + + size_t cur_dist = 0; + vector weights(forward.size(), 0); + for (auto point : histogram) { + if (ls(2 * point.d + DEDistance(second_len), DEDistance(first_len))) + continue; + while (cur_dist + 1 < forward.size() && forward[cur_dist + 1] < point.d) + ++cur_dist; + + if (cur_dist + 1 < forward.size() && + ls(forward[cur_dist + 1] - point.d, point.d - forward[cur_dist])) { + ++cur_dist; + + if (le(abs(forward[cur_dist] - point.d), max_distance_)) + weights[cur_dist] += point.weight; + } else if (cur_dist + 1 < forward.size() && + eq(forward[cur_dist + 1] - point.d, point.d - forward[cur_dist])) { + if (le(abs(forward[cur_dist] - point.d), max_distance_)) + weights[cur_dist] += point.weight * 0.5; + ++cur_dist; + if (le(abs(forward[cur_dist] - point.d), max_distance_)) + weights[cur_dist] += point.weight * 0.5; + } else { + if (le(abs(forward[cur_dist] - point.d), max_distance_)) + weights[cur_dist] += point.weight; + } + } + + for (size_t i = 0; i < forward.size(); ++i) + if (ge(weights[i], DEWeight(0))) + result.push_back(make_pair(forward[i], weights[i])); + + VERIFY(result.size() == forward.size()); + return result; +} + +void 
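/*
 * Sketch of the clustering in AbstractDistanceEstimator::ClusterResult above:
 * consecutive distance estimates closer than the linkage distance are merged
 * into a single point whose center is the interval midpoint, whose weight is
 * the summed weight, and whose variance is half the interval width. Toy
 * Point; input pairs are (distance, weight), sorted by distance.
 */
#include <cstddef>
#include <utility>
#include <vector>

struct ToyPoint { double d, weight, var; };

std::vector<ToyPoint> Cluster(const std::vector<std::pair<int, double>> &est,
                              int linkage_distance) {
    std::vector<ToyPoint> out;
    for (std::size_t i = 0; i < est.size(); ++i) {
        std::size_t left = i;
        double weight = est[i].second;
        while (i + 1 < est.size() &&
               est[i + 1].first - est[i].first <= linkage_distance) {
            ++i;
            weight += est[i].second;            // merge into the same cluster
        }
        out.push_back({(est[left].first + est[i].first) * 0.5, weight,
                       (est[i].first - est[left].first) * 0.5});
    }
    return out;
}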
DistanceEstimator::ProcessEdge(EdgeId e1, const InPairedIndex &pi, PairedInfoBuffer &result) const { + typename base::LengthMap second_edges; + auto inner_map = pi.GetHalf(e1); + for (auto i : inner_map) + second_edges[i.first]; + + this->FillGraphDistancesLengths(e1, second_edges); + + for (const auto &entry: second_edges) { + EdgeId e2 = entry.first; + EdgePair ep(e1, e2); + + VERIFY(ep <= pi.ConjugatePair(ep)); + + const GraphLengths &forward = entry.second; + TRACE("Edge pair is " << this->graph().int_id(ep.first) + << " " << this->graph().int_id(ep.second)); + auto hist = pi.Get(e1, e2); + const EstimHist &estimated = this->EstimateEdgePairDistances(ep, hist, forward); + OutHistogram res = this->ClusterResult(ep, estimated); + this->AddToResult(res, ep, result); + } +} +} +} diff -Nru spades-3.10.1+dfsg/src/common/paired_info/distance_estimation.hpp spades-3.11.1+dfsg/src/common/paired_info/distance_estimation.hpp --- spades-3.10.1+dfsg/src/common/paired_info/distance_estimation.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/paired_info/distance_estimation.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -8,154 +8,94 @@ #ifndef DISTANCE_ESTIMATION_HPP_ #define DISTANCE_ESTIMATION_HPP_ -#include "math/xmath.h" -#include "utils/openmp_wrapper.h" - -#include "paired_info.hpp" +#include "utils/parallel/openmp_wrapper.h" +#include "assembly_graph/core/basic_graph_stats.hpp" +#include "assembly_graph/core/graph.hpp" #include "assembly_graph/paths/path_processor.hpp" + #include "paired_info/pair_info_bounds.hpp" +#include "paired_info.hpp" +#include "math/xmath.h" namespace omnigraph { namespace de { //todo move to some more common place -template class GraphDistanceFinder { - typedef typename Graph::EdgeId EdgeId; - typedef typename Graph::VertexId VertexId; - typedef std::vector Path; + typedef std::vector Path; typedef std::vector GraphLengths; - typedef std::map LengthMap; + typedef std::map LengthMap; public: - GraphDistanceFinder(const Graph &graph, size_t insert_size, size_t read_length, size_t delta) : + GraphDistanceFinder(const debruijn_graph::Graph &graph, size_t insert_size, size_t read_length, size_t delta) : graph_(graph), insert_size_(insert_size), gap_((int) (insert_size - 2 * read_length)), delta_((double) delta) { } - std::vector GetGraphDistancesLengths(EdgeId e1, EdgeId e2) const { - LengthMap m; - m.insert({e2, {}}); - - FillGraphDistancesLengths(e1, m); - - return m[e2]; - } + std::vector GetGraphDistancesLengths(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const; // finds all distances from a current edge to a set of edges - void FillGraphDistancesLengths(EdgeId e1, LengthMap &second_edges) const { - vector path_lower_bounds; - - size_t path_upper_bound = PairInfoPathLengthUpperBound(graph_.k(), insert_size_, delta_); - - PathProcessor paths_proc(graph_, graph_.EdgeEnd(e1), path_upper_bound); - - for (auto &entry : second_edges) { - EdgeId e2 = entry.first; - size_t path_lower_bound = PairInfoPathLengthLowerBound(graph_.k(), graph_.length(e1), - graph_.length(e2), gap_, delta_); - - TRACE("Bounds for paths are " << path_lower_bound << " " << path_upper_bound); - - DistancesLengthsCallback callback(graph_); - paths_proc.Process(graph_.EdgeStart(e2), path_lower_bound, path_upper_bound, callback); - GraphLengths lengths = callback.distances(); - for (size_t j = 0; j < lengths.size(); ++j) { - lengths[j] += graph_.length(e1); - TRACE("Resulting distance set for " << - " edge " << graph_.int_id(e2) << - " #" << j << " length " << lengths[j]); - 
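Estimate above follows a common OpenMP pattern: one private PairedInfoBuffer per thread, filled inside the parallel loop, then a serial merge into the shared result. A stand-alone sketch of the same pattern, with the buffers reduced to plain vectors and a dummy per-edge computation:

    #include <omp.h>
    #include <vector>

    void ParallelProcess(const std::vector<int> &edges, std::vector<int> &result, int nthreads) {
        std::vector<std::vector<int>> buffer(nthreads);   // one buffer per thread
    #pragma omp parallel for num_threads(nthreads) schedule(guided, 10)
        for (size_t i = 0; i < edges.size(); ++i)
            buffer[omp_get_thread_num()].push_back(edges[i] * 2); // per-edge work
        for (int t = 0; t < nthreads; ++t) {              // serial merge, no locking needed
            result.insert(result.end(), buffer[t].begin(), buffer[t].end());
            buffer[t].clear();
        }
    }
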
} - - if (e1 == e2) - lengths.push_back(0); - - std::sort(lengths.begin(), lengths.end()); - entry.second = lengths; - } - } + void FillGraphDistancesLengths(debruijn_graph::EdgeId e1, LengthMap &second_edges) const; private: DECL_LOGGER("GraphDistanceFinder"); - - const Graph &graph_; + const debruijn_graph::Graph &graph_; const size_t insert_size_; const int gap_; const double delta_; }; -template class AbstractDistanceEstimator { protected: - typedef UnclusteredPairedInfoIndexT InPairedIndex; - typedef PairedInfoIndexT OutPairedIndex; + typedef UnclusteredPairedInfoIndexT InPairedIndex; + typedef PairedInfoIndexT OutPairedIndex; typedef typename InPairedIndex::HistProxy InHistogram; typedef typename OutPairedIndex::Histogram OutHistogram; public: - AbstractDistanceEstimator(const Graph &graph, + AbstractDistanceEstimator(const debruijn_graph::Graph &graph, const InPairedIndex &index, - const GraphDistanceFinder &distance_finder, + const GraphDistanceFinder &distance_finder, size_t linkage_distance = 0) : graph_(graph), index_(index), distance_finder_(distance_finder), linkage_distance_(linkage_distance) { } - virtual void Estimate(PairedInfoIndexT &result, size_t nthreads) const = 0; + virtual void Estimate(PairedInfoIndexT &result, size_t nthreads) const = 0; virtual ~AbstractDistanceEstimator() { } protected: - typedef typename Graph::EdgeId EdgeId; - typedef pair EdgePair; + typedef pair EdgePair; typedef vector > EstimHist; typedef vector GraphLengths; - typedef std::map LengthMap; + typedef std::map LengthMap; - const Graph &graph() const { return graph_; } + const debruijn_graph::Graph &graph() const { return graph_; } const InPairedIndex &index() const { return index_; } - void FillGraphDistancesLengths(EdgeId e1, LengthMap &second_edges) const { - distance_finder_.FillGraphDistancesLengths(e1, second_edges); - } + void FillGraphDistancesLengths(debruijn_graph::EdgeId e1, LengthMap &second_edges) const; - OutHistogram ClusterResult(EdgePair /*ep*/, const EstimHist &estimated) const { - OutHistogram result; - for (size_t i = 0; i < estimated.size(); ++i) { - size_t left = i; - DEWeight weight = DEWeight(estimated[i].second); - while (i + 1 < estimated.size() && - (estimated[i + 1].first - estimated[i].first) <= (int) linkage_distance_) { - ++i; - weight += estimated[i].second; - } - DEDistance center = DEDistance((estimated[left].first + estimated[i].first) * 0.5); - DEVariance var = DEVariance((estimated[i].first - estimated[left].first) * 0.5); - result.insert(Point(center, weight, var)); - } - return result; - } + OutHistogram ClusterResult(EdgePair /*ep*/, const EstimHist &estimated) const; - void AddToResult(const OutHistogram &clustered, EdgePair ep, PairedInfoBuffer &result) const { - result.AddMany(ep.first, ep.second, clustered); - } + void AddToResult(const OutHistogram &clustered, EdgePair ep, PairedInfoBuffer &result) const; private: - const Graph &graph_; + const debruijn_graph::Graph &graph_; const InPairedIndex &index_; - const GraphDistanceFinder &distance_finder_; + const GraphDistanceFinder &distance_finder_; const size_t linkage_distance_; virtual const string Name() const = 0; + + DECL_LOGGER("AbstractDistanceEstimator"); }; -template -class DistanceEstimator : public AbstractDistanceEstimator { - typedef AbstractDistanceEstimator base; - typedef typename Graph::EdgeId EdgeId; +class DistanceEstimator : public AbstractDistanceEstimator { + typedef AbstractDistanceEstimator base; typedef vector GraphLengths; typedef vector > EstimHist; - typedef pair EdgePair; + 
typedef pair EdgePair; protected: typedef typename base::InPairedIndex InPairedIndex; @@ -164,9 +104,9 @@ typedef typename base::OutHistogram OutHistogram; public: - DistanceEstimator(const Graph &graph, + DistanceEstimator(const debruijn_graph::Graph &graph, const InPairedIndex &index, - const GraphDistanceFinder &distance_finder, + const GraphDistanceFinder &distance_finder, size_t linkage_distance, size_t max_distance) : base(graph, index, distance_finder, linkage_distance), max_distance_(max_distance) { } @@ -176,114 +116,19 @@ INFO("Using " << this->Name() << " distance estimator"); } - virtual void Estimate(OutPairedIndex &result, size_t nthreads) const { - this->Init(); - const auto &index = this->index(); - - DEBUG("Collecting edge infos"); - std::vector edges; - for (auto it = this->graph().ConstEdgeBegin(); !it.IsEnd(); ++it) - edges.push_back(*it); - - DEBUG("Processing"); - PairedInfoBuffersT buffer(this->graph(), nthreads); -# pragma omp parallel for num_threads(nthreads) schedule(guided, 10) - for (size_t i = 0; i < edges.size(); ++i) { - EdgeId edge = edges[i]; - ProcessEdge(edge, index, buffer[omp_get_thread_num()]); - } - - for (size_t i = 0; i < nthreads; ++i) { - result.Merge(buffer[i]); - buffer[i].clear(); - } - } + virtual void Estimate(OutPairedIndex &result, size_t nthreads) const; protected: const DEDistance max_distance_; virtual EstimHist EstimateEdgePairDistances(EdgePair ep, const InHistogram &histogram, - const GraphLengths &raw_forward) const { - using std::abs; - using namespace math; - EdgeId e1 = ep.first, e2 = ep.second; - size_t first_len = this->graph().length(e1), second_len = this->graph().length(e2); - int minD = rounded_d(histogram.min()), maxD = rounded_d(histogram.max()); - - TRACE("Bounds are " << minD << " " << maxD); - EstimHist result; - vector forward; - forward.reserve(raw_forward.size()); - for (auto raw_length : raw_forward) { - int length = int(raw_length); - if (minD - int(max_distance_) <= length && length <= maxD + int(max_distance_)) - forward.push_back(DEDistance(length)); - } - if (forward.size() == 0) - return result; - - size_t cur_dist = 0; - vector weights(forward.size(), 0); - for (auto point : histogram) { - if (ls(2 * point.d + DEDistance(second_len), DEDistance(first_len))) - continue; - while (cur_dist + 1 < forward.size() && forward[cur_dist + 1] < point.d) - ++cur_dist; - - if (cur_dist + 1 < forward.size() && - ls(forward[cur_dist + 1] - point.d, point.d - forward[cur_dist])) { - ++cur_dist; - - if (le(abs(forward[cur_dist] - point.d), max_distance_)) - weights[cur_dist] += point.weight; - } else if (cur_dist + 1 < forward.size() && - eq(forward[cur_dist + 1] - point.d, point.d - forward[cur_dist])) { - if (le(abs(forward[cur_dist] - point.d), max_distance_)) - weights[cur_dist] += point.weight * 0.5; - ++cur_dist; - if (le(abs(forward[cur_dist] - point.d), max_distance_)) - weights[cur_dist] += point.weight * 0.5; - } else { - if (le(abs(forward[cur_dist] - point.d), max_distance_)) - weights[cur_dist] += point.weight; - } - } - - for (size_t i = 0; i < forward.size(); ++i) - if (ge(weights[i], DEWeight(0))) - result.push_back(make_pair(forward[i], weights[i])); - - VERIFY(result.size() == forward.size()); - return result; - } + const GraphLengths &raw_forward) const; private: - virtual void ProcessEdge(EdgeId e1, + virtual void ProcessEdge(debruijn_graph::EdgeId e1, const InPairedIndex &pi, - PairedInfoBuffer &result) const { - typename base::LengthMap second_edges; - auto inner_map = pi.GetHalf(e1); - for (auto i : 
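In the header hunk above, GraphDistanceFinder, AbstractDistanceEstimator and DistanceEstimator lose their template<class Graph> parameter and are bound to the concrete debruijn_graph::Graph. That is what lets the long method bodies migrate into distance_estimation.cpp: a template's definitions must stay visible to every translation unit, a concrete class's do not. A minimal illustration with an invented class and method name:

    // sketch.hpp: once the class is non-template, a declaration suffices
    class FinderSketch {
    public:
        explicit FinderSketch(size_t k) : k_(k) {}
        size_t Bound(size_t len) const;   // body moved out of the header
    private:
        size_t k_;
    };

    // sketch.cpp
    // size_t FinderSketch::Bound(size_t len) const { return len + k_; }
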
inner_map) - second_edges[i.first]; - - this->FillGraphDistancesLengths(e1, second_edges); - - for (const auto &entry: second_edges) { - EdgeId e2 = entry.first; - EdgePair ep(e1, e2); - - VERIFY(ep <= pi.ConjugatePair(ep)); - - const GraphLengths &forward = entry.second; - TRACE("Edge pair is " << this->graph().int_id(ep.first) - << " " << this->graph().int_id(ep.second)); - auto hist = pi.Get(e1, e2); - const EstimHist &estimated = this->EstimateEdgePairDistances(ep, hist, forward); - OutHistogram res = this->ClusterResult(ep, estimated); - this->AddToResult(res, ep, result); - } - } + PairedInfoBuffer &result) const; virtual const string Name() const { static const string my_name = "SIMPLE"; diff -Nru spades-3.10.1+dfsg/src/common/paired_info/histogram.hpp spades-3.11.1+dfsg/src/common/paired_info/histogram.hpp --- spades-3.10.1+dfsg/src/common/paired_info/histogram.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/paired_info/histogram.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -8,8 +8,8 @@ #pragma once #include -#include "common/adt/flat_set.hpp" -#include "common/adt/small_pod_vector.hpp" +#include "adt/flat_set.hpp" +#include "adt/small_pod_vector.hpp" #include "index_point.hpp" namespace omnigraph { diff -Nru spades-3.10.1+dfsg/src/common/paired_info/paired_info.hpp spades-3.11.1+dfsg/src/common/paired_info/paired_info.hpp --- spades-3.10.1+dfsg/src/common/paired_info/paired_info.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/paired_info/paired_info.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -7,7 +7,7 @@ #pragma once -#include "common/adt/iterator_range.hpp" +#include "adt/iterator_range.hpp" #include #include diff -Nru spades-3.10.1+dfsg/src/common/paired_info/smoothing_distance_estimation.cpp spades-3.11.1+dfsg/src/common/paired_info/smoothing_distance_estimation.cpp --- spades-3.10.1+dfsg/src/common/paired_info/smoothing_distance_estimation.cpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/paired_info/smoothing_distance_estimation.cpp 2017-09-28 09:05:10.000000000 +0000 @@ -0,0 +1,185 @@ +#include "smoothing_distance_estimation.hpp" + +namespace omnigraph { +namespace de { + +using namespace debruijn_graph; + +SmoothingDistanceEstimator::EstimHist SmoothingDistanceEstimator::FindEdgePairDistances(EdgePair ep, + const TempHistogram &raw_hist) const { + size_t first_len = this->graph().length(ep.first); + size_t second_len = this->graph().length(ep.second); + TRACE("Lengths are " << first_len << " " << second_len); + TempHistogram data; + for (auto I = raw_hist.begin(), E = raw_hist.end(); I != E; ++I) { + Point p = *I; + if (math::ge(2 * (long) rounded_d(p) + (long) second_len, (long) first_len)) if ( + (long) rounded_d(p) + (long) OVERLAP_TOLERANCE >= (long) first_len) + data.insert(p); + } + EstimHist result; + double picture_weight = 0.; + for (auto I = data.begin(), E = data.end(); I != E; ++I) + picture_weight += I->weight; + if (math::ls(picture_weight, 3.)) + return result; + + DataDivider data_divider(threshold_, + vector(data.begin(), data.end())); + + PairInfos infos; + infos.reserve(data.size()); + const vector &clusters = + data_divider.DivideAndSmoothData(ep, infos, this->weight_f_); + DEBUG("Seeking for distances"); + TRACE("size " << infos.size()); + + for (size_t i = 0; i < clusters.size(); ++i) { + size_t begin = clusters[i].first; + size_t end = clusters[i].second; + TRACE("begin " << begin << " at " << rounded_d(infos[begin]) + << ", " << " end " << end << " at " << rounded_d(infos[end - 
1])); + size_t data_length = rounded_d(infos[end - 1]) - rounded_d(infos[begin]) + 1; + TRACE("data length " << data_length); + if (end - begin > min_peak_points_) { + size_t range = (size_t) math::round((double) data_length * range_coeff_); + size_t delta = (size_t) math::round((double) data_length * delta_coeff_); + PeakFinder peakfinder(infos, begin, end, range, delta, percentage_, deriv_thr); + DEBUG("Processing window : " << rounded_d(infos[begin]) + << " " << rounded_d(infos[end - 1])); + peakfinder.FFTSmoothing(cutoff_); + TRACE("Listing peaks"); + const EstimHist &peaks = peakfinder.ListPeaks(); + //for (auto iter = peaks.begin(); iter != peaks.end(); ++iter) { + //TRACE("PEAKS " << iter->first << " " << iter->second); + //} + if (peaks.size() == 0) + continue; + size_t index_of_max_weight = 0; + for (size_t i = 0; i < peaks.size(); ++i) + if (math::ls(peaks[index_of_max_weight].second, peaks[i].second)) + index_of_max_weight = i; + result.push_back(peaks[index_of_max_weight]); + } + } + + if (result.size() == 0) + return result; + size_t index_of_max_weight = 0; + for (size_t i = 0; i < result.size(); ++i) + if (math::ls(result[index_of_max_weight].second, result[i].second)) + index_of_max_weight = i; + + EstimHist new_result; + for (size_t i = 0; i < result.size(); ++i) + if (result[i].second > .5 * result[index_of_max_weight].second) + new_result.push_back(result[i]); + return new_result; +} + +void SmoothingDistanceEstimator::ProcessEdge(EdgeId e1, const InPairedIndex &pi, + PairedInfoBuffer &result) const { + typename base::LengthMap second_edges; + auto inner_map = pi.GetHalf(e1); + for (auto I : inner_map) + second_edges[I.first]; + + this->FillGraphDistancesLengths(e1, second_edges); + + for (const auto &entry: second_edges) { + EdgeId e2 = entry.first; + EdgePair ep(e1, e2); + + VERIFY(ep <= pi.ConjugatePair(ep)); + + TRACE("Processing edge pair " << this->graph().int_id(e1) + << " " << this->graph().int_id(e2)); + const GraphLengths &forward = entry.second; + + auto hist = pi.Get(e1, e2).Unwrap(); + EstimHist estimated; + //DEBUG("Extending paired information"); + //DEBUG("Extend left"); + //this->base::ExtendInfoLeft(e1, e2, hist, 1000); + DEBUG("Extend right"); + this->ExtendInfoRight(e1, e2, hist, 1000); + if (forward.size() == 0) { + estimated = FindEdgePairDistances(ep, hist); + ++gap_distances; + } else if (forward.size() > 0 && (!only_scaffolding_)) { + //TODO: remove THIS + InPairedIndex temp_index(this->graph()); + temp_index.AddMany(e1, e2, hist); + auto hist = temp_index.Get(e1, e2); + estimated = this->base::EstimateEdgePairDistances(ep, hist, forward); + } + DEBUG(gap_distances << " distances between gap edge pairs have been found"); + OutHistogram res = this->ClusterResult(ep, estimated); + this->AddToResult(res, ep, result); + } +} + +bool SmoothingDistanceEstimator::IsTipTip(EdgeId e1, EdgeId e2) const { + return (this->graph().OutgoingEdgeCount(this->graph().EdgeEnd(e1)) == 0 && + this->graph().IncomingEdgeCount(this->graph().EdgeEnd(e1)) == 1 && + this->graph().IncomingEdgeCount(this->graph().EdgeStart(e2)) == 0 && + this->graph().OutgoingEdgeCount(this->graph().EdgeStart(e2)) == 1); +} + +void SmoothingDistanceEstimator::MergeInto(const InHistogram &what, TempHistogram &where, int shift) const { + // assuming they are sorted already + if (what.size() == 0) + return; + + if (where.size() == 0) { + for (auto to_be_added : what) { + to_be_added.d += shift; + where.insert(to_be_added); + } + + return; + } + + // Check, whether two histograms intersect. 
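The tail of FindEdgePairDistances above keeps only those peaks whose weight exceeds half of the strongest peak's weight. The same filter in isolation:

    #include <utility>
    #include <vector>

    std::vector<std::pair<int, double>> FilterHalfMax(
            const std::vector<std::pair<int, double>> &peaks) {
        std::vector<std::pair<int, double>> kept;
        if (peaks.empty())
            return kept;
        size_t best = 0;
        for (size_t i = 0; i < peaks.size(); ++i)      // locate the heaviest peak
            if (peaks[best].second < peaks[i].second)
                best = i;
        for (const auto &p : peaks)
            if (p.second > 0.5 * peaks[best].second)   // strictly above half-max survives
                kept.push_back(p);
        return kept;
    }
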
If not, we can just merge them + // straightforwardly. + if (math::ls(where.rbegin()->d, what.min().d + float(shift)) || + math::gr(where.begin()->d, what.max().d + float(shift))) { + for (auto to_be_added : what) { + to_be_added.d += shift; + where.insert(to_be_added); + } + } else { + for (auto to_be_added : what) { + to_be_added.d += shift; + auto low_bound = std::lower_bound(where.begin(), where.end(), to_be_added); + if (low_bound != where.end() && to_be_added == *low_bound) { + to_be_added.weight += low_bound->weight; + where.erase(to_be_added); + where.insert(to_be_added); + } else + where.insert(low_bound, to_be_added); + } + } +} + +void SmoothingDistanceEstimator::ExtendRightDFS(const EdgeId &first, EdgeId current, TempHistogram &data, int shift, + size_t max_shift) const { + auto end = this->graph().EdgeEnd(current); + if (current == first) + return; + if (this->graph().IncomingEdgeCount(end) > 1) + return; + + for (EdgeId next : this->graph().OutgoingEdges(end)) { + auto hist = this->index().Get(first, next); + if (-shift < (int) max_shift) + ExtendRightDFS(first, next, data, shift - (int) this->graph().length(current), max_shift); + + //auto filtered_infos = FilterPositive(hist, this->graph().length(first), this->graph().length(next)); + //if (filtered_infos.size() > 0) + // MergeInto(filtered_infos, data, shift - (int) this->graph().length(current)); + MergeInto(hist, data, shift - (int) this->graph().length(current)); + } +} +} +} diff -Nru spades-3.10.1+dfsg/src/common/paired_info/smoothing_distance_estimation.hpp spades-3.11.1+dfsg/src/common/paired_info/smoothing_distance_estimation.hpp --- spades-3.10.1+dfsg/src/common/paired_info/smoothing_distance_estimation.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/paired_info/smoothing_distance_estimation.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -8,21 +8,19 @@ #ifndef SMOOTHING_DISTANCE_ESTIMATION_HPP_ #define SMOOTHING_DISTANCE_ESTIMATION_HPP_ -#include "paired_info.hpp" +#include "weighted_distance_estimation.hpp" #include "data_divider.hpp" #include "peak_finder.hpp" -#include "weighted_distance_estimation.hpp" namespace omnigraph { namespace de { -template -class SmoothingDistanceEstimator : public WeightedDistanceEstimator { +class SmoothingDistanceEstimator : public WeightedDistanceEstimator { //FIXME configure static const size_t OVERLAP_TOLERANCE = 1000; protected: - typedef WeightedDistanceEstimator base; + typedef WeightedDistanceEstimator base; typedef typename base::InPairedIndex InPairedIndex; typedef typename base::OutPairedIndex OutPairedIndex; typedef typename base::InHistogram InHistogram; @@ -30,9 +28,9 @@ typedef typename InPairedIndex::Histogram TempHistogram; public: - SmoothingDistanceEstimator(const Graph &graph, + SmoothingDistanceEstimator(const debruijn_graph::Graph &graph, const InPairedIndex &histogram, - const GraphDistanceFinder &dist_finder, + const GraphDistanceFinder &dist_finder, std::function weight_f, size_t linkage_distance, size_t max_distance, size_t threshold, double range_coeff, double delta_coeff, @@ -57,10 +55,9 @@ virtual ~SmoothingDistanceEstimator() { } protected: - typedef typename Graph::EdgeId EdgeId; - typedef pair EdgePair; + typedef pair EdgePair; typedef vector > EstimHist; - typedef vector > PairInfos; + typedef vector > PairInfos; typedef vector GraphLengths; EstimHist EstimateEdgePairDistances(EdgePair /*ep*/, @@ -87,186 +84,23 @@ mutable size_t gap_distances; EstimHist FindEdgePairDistances(EdgePair ep, - const TempHistogram &raw_hist) const { 
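MergeInto above shifts every point of one sorted histogram and folds it into another: bulk insertion when the distance ranges do not overlap, weight accumulation on equal distances otherwise. With a std::map keyed by distance the whole routine collapses to one accumulating loop; the real code works on flat sorted containers precisely to avoid that overhead. Sketch:

    #include <map>

    // Points are (distance -> weight); operator[] inserts a new point or
    // adds to an existing one, covering both branches of MergeInto at once.
    void MergeIntoSketch(const std::map<int, double> &what,
                         std::map<int, double> &where, int shift) {
        for (const auto &p : what)
            where[p.first + shift] += p.second;
    }
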
- size_t first_len = this->graph().length(ep.first); - size_t second_len = this->graph().length(ep.second); - TRACE("Lengths are " << first_len << " " << second_len); - TempHistogram data; - for (auto I = raw_hist.begin(), E = raw_hist.end(); I != E; ++I) { - Point p = *I; - if (math::ge(2 * (long) rounded_d(p) + (long) second_len, (long) first_len)) if ( - (long) rounded_d(p) + (long) OVERLAP_TOLERANCE >= (long) first_len) - data.insert(p); - } - EstimHist result; - double picture_weight = 0.; - for (auto I = data.begin(), E = data.end(); I != E; ++I) - picture_weight += I->weight; - if (math::ls(picture_weight, 3.)) - return result; - - DataDivider data_divider(threshold_, - vector(data.begin(), data.end())); - - PairInfos infos; - infos.reserve(data.size()); - const vector &clusters = - data_divider.DivideAndSmoothData(ep, infos, this->weight_f_); - DEBUG("Seeking for distances"); - TRACE("size " << infos.size()); - - for (size_t i = 0; i < clusters.size(); ++i) { - size_t begin = clusters[i].first; - size_t end = clusters[i].second; - TRACE("begin " << begin << " at " << rounded_d(infos[begin]) - << ", " << " end " << end << " at " << rounded_d(infos[end - 1])); - size_t data_length = rounded_d(infos[end - 1]) - rounded_d(infos[begin]) + 1; - TRACE("data length " << data_length); - if (end - begin > min_peak_points_) { - size_t range = (size_t) math::round((double) data_length * range_coeff_); - size_t delta = (size_t) math::round((double) data_length * delta_coeff_); - PeakFinder peakfinder(infos, begin, end, range, delta, percentage_, deriv_thr); - DEBUG("Processing window : " << rounded_d(infos[begin]) - << " " << rounded_d(infos[end - 1])); - peakfinder.FFTSmoothing(cutoff_); - TRACE("Listing peaks"); - const EstimHist &peaks = peakfinder.ListPeaks(); - //for (auto iter = peaks.begin(); iter != peaks.end(); ++iter) { - //TRACE("PEAKS " << iter->first << " " << iter->second); - //} - if (peaks.size() == 0) - continue; - size_t index_of_max_weight = 0; - for (size_t i = 0; i < peaks.size(); ++i) - if (math::ls(peaks[index_of_max_weight].second, peaks[i].second)) - index_of_max_weight = i; - result.push_back(peaks[index_of_max_weight]); - } - } - - if (result.size() == 0) - return result; - size_t index_of_max_weight = 0; - for (size_t i = 0; i < result.size(); ++i) - if (math::ls(result[index_of_max_weight].second, result[i].second)) - index_of_max_weight = i; - - EstimHist new_result; - for (size_t i = 0; i < result.size(); ++i) - if (result[i].second > .5 * result[index_of_max_weight].second) - new_result.push_back(result[i]); - return new_result; - } + const TempHistogram &raw_hist) const; - void ProcessEdge(EdgeId e1, + void ProcessEdge(debruijn_graph::EdgeId e1, const InPairedIndex &pi, - PairedInfoBuffer &result) const override { - typename base::LengthMap second_edges; - auto inner_map = pi.GetHalf(e1); - for (auto I : inner_map) - second_edges[I.first]; - - this->FillGraphDistancesLengths(e1, second_edges); - - for (const auto &entry: second_edges) { - EdgeId e2 = entry.first; - EdgePair ep(e1, e2); - - VERIFY(ep <= pi.ConjugatePair(ep)); - - TRACE("Processing edge pair " << this->graph().int_id(e1) - << " " << this->graph().int_id(e2)); - const GraphLengths &forward = entry.second; - - auto hist = pi.Get(e1, e2).Unwrap(); - EstimHist estimated; - //DEBUG("Extending paired information"); - //DEBUG("Extend left"); - //this->base::ExtendInfoLeft(e1, e2, hist, 1000); - DEBUG("Extend right"); - this->ExtendInfoRight(e1, e2, hist, 1000); - if (forward.size() == 0) { - estimated = 
FindEdgePairDistances(ep, hist); - ++gap_distances; - } else if (forward.size() > 0 && (!only_scaffolding_)) { - //TODO: remove THIS - InPairedIndex temp_index(this->graph()); - temp_index.AddMany(e1, e2, hist); - auto hist = temp_index.Get(e1, e2); - estimated = this->base::EstimateEdgePairDistances(ep, hist, forward); - } - DEBUG(gap_distances << " distances between gap edge pairs have been found"); - OutHistogram res = this->ClusterResult(ep, estimated); - this->AddToResult(res, ep, result); - } - } + PairedInfoBuffer &result) const override; - bool IsTipTip(EdgeId e1, EdgeId e2) const { - return (this->graph().OutgoingEdgeCount(this->graph().EdgeEnd(e1)) == 0 && - this->graph().IncomingEdgeCount(this->graph().EdgeEnd(e1)) == 1 && - this->graph().IncomingEdgeCount(this->graph().EdgeStart(e2)) == 0 && - this->graph().OutgoingEdgeCount(this->graph().EdgeStart(e2)) == 1); - } + bool IsTipTip(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const; - void ExtendInfoRight(EdgeId e1, EdgeId e2, TempHistogram &data, size_t max_shift) const { + void ExtendInfoRight(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2, TempHistogram &data, + size_t max_shift) const { ExtendRightDFS(e1, e2, data, 0, max_shift); } - void MergeInto(const InHistogram &what, TempHistogram &where, int shift) const { - // assuming they are sorted already - if (what.size() == 0) - return; - - if (where.size() == 0) { - for (auto to_be_added : what) { - to_be_added.d += shift; - where.insert(to_be_added); - } - - return; - } - - // Check, whether two histograms intersect. If not, we can just merge them - // straightforwardly. - if (math::ls(where.rbegin()->d, what.min().d + float(shift)) || - math::gr(where.begin()->d, what.max().d + float(shift))) { - for (auto to_be_added : what) { - to_be_added.d += shift; - where.insert(to_be_added); - } - } else { - for (auto to_be_added : what) { - to_be_added.d += shift; - auto low_bound = std::lower_bound(where.begin(), where.end(), to_be_added); - if (low_bound != where.end() && to_be_added == *low_bound) { - to_be_added.weight += low_bound->weight; - where.erase(to_be_added); - where.insert(to_be_added); - } else - where.insert(low_bound, to_be_added); - } - } - } + void MergeInto(const InHistogram &what, TempHistogram &where, int shift) const; - void ExtendRightDFS(const EdgeId &first, EdgeId current, TempHistogram &data, int shift, - size_t max_shift) const { - auto end = this->graph().EdgeEnd(current); - if (current == first) - return; - if (this->graph().IncomingEdgeCount(end) > 1) - return; - - for (EdgeId next : this->graph().OutgoingEdges(end)) { - auto hist = this->index().Get(first, next); - if (-shift < (int) max_shift) - ExtendRightDFS(first, next, data, shift - (int) this->graph().length(current), max_shift); - - //auto filtered_infos = FilterPositive(hist, this->graph().length(first), this->graph().length(next)); - //if (filtered_infos.size() > 0) - // MergeInto(filtered_infos, data, shift - (int) this->graph().length(current)); - MergeInto(hist, data, shift - (int) this->graph().length(current)); - } - } + void ExtendRightDFS(const debruijn_graph::EdgeId &first, debruijn_graph::EdgeId current, TempHistogram &data, + int shift, size_t max_shift) const; const string Name() const override { static const string my_name = "SMOOTHING"; diff -Nru spades-3.10.1+dfsg/src/common/paired_info/split_path_constructor.hpp spades-3.11.1+dfsg/src/common/paired_info/split_path_constructor.hpp --- spades-3.10.1+dfsg/src/common/paired_info/split_path_constructor.hpp 2017-02-28 
14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/paired_info/split_path_constructor.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -111,7 +111,7 @@ for (size_t j = 0; j < common_part.size(); ++j) total_length += graph_.length(common_part[j]); - DEBUG("Common part " << ToString(common_part)); + DEBUG("Common part " << std::to_string(total_length)); for (size_t j = 0; j < common_part.size(); ++j) { PairInfo cur_pi(cur_edge, common_part[j], cur_info.d() - (double) total_length, diff -Nru spades-3.10.1+dfsg/src/common/paired_info/weighted_distance_estimation.cpp spades-3.11.1+dfsg/src/common/paired_info/weighted_distance_estimation.cpp --- spades-3.10.1+dfsg/src/common/paired_info/weighted_distance_estimation.cpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/paired_info/weighted_distance_estimation.cpp 2017-09-28 09:05:10.000000000 +0000 @@ -0,0 +1,63 @@ +#include "weighted_distance_estimation.hpp" + +namespace omnigraph { +namespace de { + +using namespace debruijn_graph; + +WeightedDistanceEstimator::EstimHist WeightedDistanceEstimator::EstimateEdgePairDistances(EdgePair ep, + const InHistogram &histogram, + const GraphLengths &raw_forward) const { + using std::abs; + using namespace math; + TRACE("Estimating with weight function"); + size_t first_len = this->graph().length(ep.first); + size_t second_len = this->graph().length(ep.second); + + EstimHist result; + int maxD = rounded_d(histogram.max()), minD = rounded_d(histogram.min()); + vector forward; + for (auto len : raw_forward) { + int length = (int) len; + if (minD - (int) this->max_distance_ <= length && length <= maxD + (int) this->max_distance_) { + forward.push_back(length); + } + } + if (forward.size() == 0) + return result; + + DEDistance max_dist = this->max_distance_; + size_t i = 0; + vector weights(forward.size()); + for (auto point : histogram) { + DEDistance cur_dist(forward[i]), next_dist(forward[i + 1]); + if (le(2 * point.d + DEDistance(second_len), DEDistance(first_len))) + continue; + while (i + 1 < forward.size() && next_dist < point.d) { + ++i; + } + if (i + 1 < forward.size() && ls(DEDistance(next_dist) - point.d, point.d - DEDistance(cur_dist))) { + ++i; + if (le(abs(cur_dist - point.d), max_dist)) + weights[i] += point.weight * weight_f_(forward[i] - rounded_d(point)); + } + else if (i + 1 < forward.size() && eq(next_dist - point.d, point.d - cur_dist)) { + if (le(abs(cur_dist - point.d), max_dist)) + weights[i] += point.weight * 0.5 * weight_f_(forward[i] - rounded_d(point)); + + ++i; + + if (le(abs(cur_dist - point.d), max_dist)) + weights[i] += point.weight * 0.5 * weight_f_(forward[i] - rounded_d(point)); + } else if (le(abs(cur_dist - point.d), max_dist)) + weights[i] += point.weight * weight_f_(forward[i] - rounded_d(point)); + } + + for (size_t i = 0; i < forward.size(); ++i) + if (gr(weights[i], 0.)) + result.push_back(make_pair(forward[i], weights[i])); + + return result; +} +} +} diff -Nru spades-3.10.1+dfsg/src/common/paired_info/weighted_distance_estimation.hpp spades-3.11.1+dfsg/src/common/paired_info/weighted_distance_estimation.hpp --- spades-3.10.1+dfsg/src/common/paired_info/weighted_distance_estimation.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/paired_info/weighted_distance_estimation.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -8,27 +8,24 @@ #ifndef WEIGHTED_DISTANCE_ESTIMATION_HPP_ #define WEIGHTED_DISTANCE_ESTIMATION_HPP_ -#include "math/xmath.h" -#include "paired_info.hpp" #include "distance_estimation.hpp" namespace omnigraph { 
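In the weighted estimator above, each histogram point contributes point.weight * weight_f_(forward[i] - rounded_d(point)), so the caller-supplied weight function decides how quickly support decays with distance from a candidate. The function itself is not defined in this diff; a triangular kernel is shown below only as an assumed example shape, and kHalfWidth is an invented parameter:

    #include <cstdlib>

    const int kHalfWidth = 15;  // hypothetical kernel half-width

    double TriangularWeight(int offset) {
        int d = std::abs(offset);
        return d >= kHalfWidth ? 0. : 1. - double(d) / kHalfWidth;
    }

    // usage: weights[i] += point_weight * TriangularWeight(forward_dist - point_dist);
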
namespace de { -template -class WeightedDistanceEstimator : public DistanceEstimator { +class WeightedDistanceEstimator : public DistanceEstimator { protected: - typedef DistanceEstimator base; + typedef DistanceEstimator base; typedef typename base::InPairedIndex InPairedIndex; typedef typename base::OutPairedIndex OutPairedIndex; typedef typename base::InHistogram InHistogram; typedef typename base::OutHistogram OutHistogram; public: - WeightedDistanceEstimator(const Graph &graph, + WeightedDistanceEstimator(const debruijn_graph::Graph &graph, const InPairedIndex &histogram, - const GraphDistanceFinder &distance_finder, + const GraphDistanceFinder &distance_finder, std::function weight_f, size_t linkage_distance, size_t max_distance) : base(graph, histogram, distance_finder, linkage_distance, max_distance), weight_f_(weight_f) { } @@ -36,74 +33,24 @@ virtual ~WeightedDistanceEstimator() { } protected: - typedef typename Graph::EdgeId EdgeId; typedef vector > EstimHist; - typedef pair EdgePair; + typedef pair EdgePair; typedef vector GraphLengths; std::function weight_f_; virtual EstimHist EstimateEdgePairDistances(EdgePair ep, const InHistogram &histogram, - const GraphLengths &raw_forward) const override { - using std::abs; - using namespace math; - TRACE("Estimating with weight function"); - size_t first_len = this->graph().length(ep.first); - size_t second_len = this->graph().length(ep.second); - - EstimHist result; - int maxD = rounded_d(histogram.max()), minD = rounded_d(histogram.min()); - vector forward; - for (auto len : raw_forward) { - int length = (int) len; - if (minD - (int) this->max_distance_ <= length && length <= maxD + (int) this->max_distance_) { - forward.push_back(length); - } - } - if (forward.size() == 0) - return result; - - DEDistance max_dist = this->max_distance_; - size_t i = 0; - vector weights(forward.size()); - for (auto point : histogram) { - DEDistance cur_dist(forward[i]), next_dist(forward[i + 1]); - if (le(2 * point.d + DEDistance(second_len), DEDistance(first_len))) - continue; - while (i + 1 < forward.size() && next_dist < point.d) { - ++i; - } - if (i + 1 < forward.size() && ls(DEDistance(next_dist) - point.d, point.d - DEDistance(cur_dist))) { - ++i; - if (le(abs(cur_dist - point.d), max_dist)) - weights[i] += point.weight * weight_f_(forward[i] - rounded_d(point)); - } - else if (i + 1 < forward.size() && eq(next_dist - point.d, point.d - cur_dist)) { - if (le(abs(cur_dist - point.d), max_dist)) - weights[i] += point.weight * 0.5 * weight_f_(forward[i] - rounded_d(point)); - - ++i; - - if (le(abs(cur_dist - point.d), max_dist)) - weights[i] += point.weight * 0.5 * weight_f_(forward[i] - rounded_d(point)); - } else if (le(abs(cur_dist - point.d), max_dist)) - weights[i] += point.weight * weight_f_(forward[i] - rounded_d(point)); - } - - for (size_t i = 0; i < forward.size(); ++i) - if (gr(weights[i], 0.)) - result.push_back(make_pair(forward[i], weights[i])); - - return result; - } + const GraphLengths &raw_forward) const override; const string Name() const override { static const string my_name = "WEIGHTED"; return my_name; } +private: + DECL_LOGGER("WeightedDistanceEstimator"); }; } diff -Nru spades-3.10.1+dfsg/src/common/paired_info/weights.hpp spades-3.11.1+dfsg/src/common/paired_info/weights.hpp --- spades-3.10.1+dfsg/src/common/paired_info/weights.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/paired_info/weights.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -1,11 +1,10 @@ #pragma once +namespace debruijn_graph { using 
omnigraph::Path; using omnigraph::MappingPath; -using omnigraph::Range; using omnigraph::MappingRange; -namespace debruijn_graph { inline double PairedReadCountWeight(const std::pair&, const MappingRange&, const MappingRange&) { return 1.; diff -Nru spades-3.10.1+dfsg/src/common/pipeline/config_common.hpp spades-3.11.1+dfsg/src/common/pipeline/config_common.hpp --- spades-3.10.1+dfsg/src/common/pipeline/config_common.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/pipeline/config_common.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -7,8 +7,8 @@ #pragma once -#include "utils/simple_tools.hpp" -#include "utils/path_helper.hpp" +#include "utils/stl_utils.hpp" +#include "utils/filesystem/path_helper.hpp" #include "utils/verify.hpp" // todo: undo dirty fix @@ -82,12 +82,12 @@ return; } for (size_t i = 1; ; i++) { - value = pt.get_optional(key + "#" + ToString(i)); + value = pt.get_optional(key + "#" + std::to_string(i)); if (value) { vec.push_back(*value); continue; } - value = pt.get_optional(key + "." + ToString(i)); + value = pt.get_optional(key + "." + std::to_string(i)); if (value) { vec.push_back(*value); continue; diff -Nru spades-3.10.1+dfsg/src/common/pipeline/config_struct.cpp spades-3.11.1+dfsg/src/common/pipeline/config_struct.cpp --- spades-3.10.1+dfsg/src/common/pipeline/config_struct.cpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/pipeline/config_struct.cpp 2017-09-28 09:05:10.000000000 +0000 @@ -7,7 +7,7 @@ #include "pipeline/config_struct.hpp" #include "pipeline/config_common.hpp" -#include "utils/openmp_wrapper.h" +#include "utils/parallel/openmp_wrapper.h" #include "utils/logger/logger.hpp" #include "utils/verify.hpp" @@ -249,6 +249,7 @@ load(br.max_delta, pt, "max_delta", complete); load(br.max_relative_delta, pt, "max_relative_delta", complete); load(br.max_number_edges, pt, "max_number_edges", complete); + load(br.dijkstra_vertex_limit, pt, "dijkstra_vertex_limit", complete); load(br.parallel, pt, "parallel", complete); load(br.buff_size, pt, "buff_size", complete); load(br.buff_cov_diff, pt, "buff_cov_diff", complete); @@ -292,13 +293,22 @@ load(rcc.max_coverage_coeff, pt, "max_coverage_coeff", complete); } -void load(debruijn_config::simplification::isolated_edges_remover& ier, +void load(debruijn_config::simplification::isolated_edge_remover& ier, boost::property_tree::ptree const& pt, bool complete) { using config_common::load; load(ier.enabled, pt, "enabled", complete); load(ier.max_length, pt, "max_length", complete); + load(ier.use_rl_for_max_length, pt, "use_rl_for_max_length", complete); load(ier.max_coverage, pt, "max_coverage", complete); load(ier.max_length_any_cov, pt, "max_length_any_cov", complete); + load(ier.use_rl_for_max_length_any_cov, pt, "use_rl_for_max_length_any_cov", complete); +} + +void load(debruijn_config::simplification::low_covered_edge_remover& lcer, + boost::property_tree::ptree const& pt, bool complete) { + using config_common::load; + load(lcer.enabled, pt, "lcer_enabled", complete); + load(lcer.coverage_threshold, pt, "lcer_coverage_threshold", complete); } void load(debruijn_config::simplification::init_cleaning& init_clean, @@ -517,6 +527,13 @@ load(graph_read_corr.binary, pt, "binary"); } +void load(debruijn_config::strand_specificity& ss, + boost::property_tree::ptree const& pt, bool /*complete*/) { + using config_common::load; + load(ss.ss_enabled, pt, "ss_enabled"); + load(ss.antisense, pt, "antisense"); +} + void load(debruijn_config::kmer_coverage_model& kcm, 
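config_common.hpp above probes indexed keys of the form key#1/key.1, key#2/key.2, ... until the first missing index. The same loop as a self-contained function over a boost property tree:

    #include <boost/property_tree/ptree.hpp>
    #include <string>
    #include <vector>

    std::vector<std::string> LoadIndexed(const boost::property_tree::ptree &pt,
                                         const std::string &key) {
        std::vector<std::string> vec;
        for (size_t i = 1; ; ++i) {
            auto value = pt.get_optional<std::string>(key + "#" + std::to_string(i));
            if (!value)
                value = pt.get_optional<std::string>(key + "." + std::to_string(i));
            if (!value)
                break;                 // first missing index terminates the list
            vec.push_back(*value);
        }
        return vec;
    }
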
boost::property_tree::ptree const& pt, bool /*complete*/) { using config_common::load; @@ -544,23 +561,27 @@ std::string input_dir) { if (ds.reads_filename[0] != '/') ds.reads_filename = input_dir + ds.reads_filename; - path::CheckFileExistenceFATAL(ds.reads_filename); + fs::CheckFileExistenceFATAL(ds.reads_filename); ds.reads.load(ds.reads_filename); } void load_reference_genome(dataset &ds, std::string input_dir) { if (ds.reference_genome_filename == "") { - ds.reference_genome = ""; + ds.reference_genome = vector(0); return; } if (ds.reference_genome_filename[0] != '/') ds.reference_genome_filename = input_dir + ds.reference_genome_filename; - path::CheckFileExistenceFATAL(ds.reference_genome_filename); + fs::CheckFileExistenceFATAL(ds.reference_genome_filename); io::FileReadStream genome_stream(ds.reference_genome_filename); - io::SingleRead genome; - genome_stream >> genome; - ds.reference_genome = genome.GetSequenceString(); + while (!genome_stream.eof()) { + io::SingleRead genome; + genome_stream >> genome; + ds.reference_genome.push_back(genome.GetSequenceString()); + } + + } void load(debruijn_config::simplification& simp, @@ -591,8 +612,6 @@ load(simp.init_clean, pt, "init_clean", complete); // presimplification load(simp.final_tc, pt, "final_tc", complete); load(simp.final_br, pt, "final_br", complete); - simp.second_final_br = simp.final_br; - load(simp.second_final_br, pt, "second_final_br", false); } void load(debruijn_config::info_printer& printer, @@ -650,7 +669,7 @@ // input options: load(cfg.dataset_file, pt, "dataset"); // input dir is based on dataset file location (all paths in datasets are relative to its location) - cfg.input_dir = path::parent_path(cfg.dataset_file); + cfg.input_dir = fs::parent_path(cfg.dataset_file); if (cfg.input_dir[cfg.input_dir.length() - 1] != '/') cfg.input_dir += '/'; @@ -702,7 +721,7 @@ load(cfg.max_memory, pt, "max_memory"); - path::CheckFileExistenceFATAL(cfg.dataset_file); + fs::CheckFileExistenceFATAL(cfg.dataset_file); boost::property_tree::ptree ds_pt; boost::property_tree::read_info(cfg.dataset_file, ds_pt); load(cfg.ds, ds_pt, true); @@ -725,8 +744,6 @@ load(cfg.co, pt, "contig_output", complete); - load(cfg.use_unipaths, pt, "use_unipaths", complete); - load(cfg.pb, pt, "pacbio_processor", complete); load(cfg.two_step_rr, pt, "two_step_rr", complete); @@ -753,6 +770,8 @@ load(cfg.flanking_range, pt, "flanking_range", complete); load(cfg.graph_read_corr, pt, "graph_read_corr", complete); load(cfg.kcm, pt, "kmer_coverage_model", complete); + //TODO come up with a fix to this hack + load(cfg.simp.lcer, pt, "lcer", complete); //low coverage edge remover load(cfg.pos, pt, "pos", complete); // position handler: load(cfg.rm, pt, "resolving_mode", complete); @@ -763,12 +782,23 @@ load(cfg.sensitive_map, pt, "sensitive_mapper", complete); + bool save_gp; + load(save_gp, pt, "save_gp", complete); load(cfg.info_printers, pt, "info_printers", complete); + if (save_gp) { + INFO("Graph pack will be saved before repeat resolution"); + cfg.info_printers[info_printer_pos::before_repeat_resolution].save_graph_pack = true; + } + load(cfg.bwa, pt, "bwa_aligner", complete); load(cfg.series_analysis, pt, "series_analysis", complete); + load(cfg.ss, pt, "strand_specificity", complete); + load(cfg.calculate_coverage_for_each_lib, pt, "calculate_coverage_for_each_lib", complete); + + if (pt.count("plasmid")) { VERIFY_MSG(!cfg.pd, "Option can be loaded only once"); cfg.pd.reset(debruijn_config::plasmid()); diff -Nru 
spades-3.10.1+dfsg/src/common/pipeline/config_struct.hpp spades-3.11.1+dfsg/src/common/pipeline/config_struct.hpp --- spades-3.10.1+dfsg/src/common/pipeline/config_struct.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/pipeline/config_struct.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -185,7 +185,7 @@ std::string reference_genome_filename; std::string reads_filename; - std::string reference_genome; + std::vector reference_genome; dataset(): max_read_length(0), average_coverage(0.0) { } @@ -236,6 +236,7 @@ size_t max_delta; double max_relative_delta; size_t max_number_edges; + size_t dijkstra_vertex_limit; bool parallel; size_t buff_size; double buff_cov_diff; @@ -279,10 +280,17 @@ size_t plausibility_length; }; - struct isolated_edges_remover { + struct low_covered_edge_remover { bool enabled; + double coverage_threshold; + }; + + struct isolated_edge_remover { + bool enabled; + bool use_rl_for_max_length; // max_length will be taken max with read_length size_t max_length; double max_coverage; + bool use_rl_for_max_length_any_cov; size_t max_length_any_cov; }; @@ -320,7 +328,7 @@ bool early_it_only; double activation_cov; - isolated_edges_remover ier; + isolated_edge_remover ier; std::string tip_condition; std::string ec_condition; double disconnect_flank_cov; @@ -343,13 +351,12 @@ tr_based_ec_remover trec; interstrand_ec_remover isec; max_flow_ec_remover mfec; - isolated_edges_remover ier; + isolated_edge_remover ier; complex_bulge_remover cbr; hidden_ec_remover her; - + low_covered_edge_remover lcer; tip_clipper final_tc; bulge_remover final_br; - bulge_remover second_final_br; init_cleaning init_clean; }; @@ -496,9 +503,7 @@ std::string output_base; std::string output_dir; std::string tmp_dir; - std::string output_suffix; std::string output_saves; - std::string final_contigs_file; std::string log_filename; std::string series_analysis; @@ -507,7 +512,6 @@ bool compute_paths_number; bool use_additional_contigs; - bool use_unipaths; std::string additional_contigs; struct scaffold_correction { @@ -528,6 +532,11 @@ output_broken_scaffolds obs_mode; }; + struct strand_specificity { + bool ss_enabled; + bool antisense; + }; + contig_output co; boost::optional sc_cor; @@ -554,7 +563,6 @@ size_t buffer_size; std::string temp_bin_reads_dir; std::string temp_bin_reads_path; - std::string temp_bin_reads_info; std::string paired_read_prefix; std::string single_read_prefix; @@ -588,6 +596,9 @@ boost::optional pd; size_t flanking_range; + bool calculate_coverage_for_each_lib; + strand_specificity ss; + bool need_mapping; debruijn_config() : diff -Nru spades-3.10.1+dfsg/src/common/pipeline/genomic_info_filler.cpp spades-3.11.1+dfsg/src/common/pipeline/genomic_info_filler.cpp --- spades-3.10.1+dfsg/src/common/pipeline/genomic_info_filler.cpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/pipeline/genomic_info_filler.cpp 2017-09-28 09:05:10.000000000 +0000 @@ -6,19 +6,13 @@ //*************************************************************************** #include "genomic_info_filler.hpp" - -#include "utils/coverage_model/kmer_coverage_model.hpp" +#include "modules/coverage_model/kmer_coverage_model.hpp" #include "modules/simplification/ec_threshold_finder.hpp" #include "llvm/Support/YAMLTraits.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" -#include - -#include -#include - using namespace llvm; using namespace debruijn_graph; @@ -123,7 +117,7 @@ gp.ginfo.set_cov_histogram(extract(tmp)); // Fit the coverage model and get the threshold - 
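The renamed isolated_edge_remover gains use_rl_for_max_length and use_rl_for_max_length_any_cov flags; the in-source comment says max_length "will be taken max with read_length". The presumed resolution logic, as a one-liner:

    #include <algorithm>
    #include <cstddef>

    // Assumption based on the struct comment above; the actual consumer of these
    // flags is elsewhere in the pipeline.
    size_t ResolveMaxLength(size_t max_length, size_t read_length, bool use_rl) {
        return use_rl ? std::max(max_length, read_length) : max_length;
    }
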
utils::coverage_model::KMerCoverageModel CovModel(gp.ginfo.cov_histogram(), cfg::get().kcm.probability_threshold, cfg::get().kcm.strong_probability_threshold); + coverage_model::KMerCoverageModel CovModel(gp.ginfo.cov_histogram(), cfg::get().kcm.probability_threshold, cfg::get().kcm.strong_probability_threshold); CovModel.Fit(); gp.ginfo.set_genome_size(CovModel.GetGenomeSize()); diff -Nru spades-3.10.1+dfsg/src/common/pipeline/graphio.hpp spades-3.11.1+dfsg/src/common/pipeline/graphio.hpp --- spades-3.10.1+dfsg/src/common/pipeline/graphio.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/pipeline/graphio.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -171,11 +171,6 @@ } } -// template -// void SaveEdgeAssociatedInfo(const C& c, ostream& out) const { -// SaveEdgeAssociatedInfo(boost::bind(&C::operator[], c, _1), out); -// } - template void SaveEdgeAssociatedInfo(const C& c, ostream& out) const { out << component_.e_size() << endl; @@ -220,7 +215,6 @@ void SaveEdgeSequences(const string& file_name) const { ofstream out(file_name + ".sqn"); //todo switch to general function after its switching to fasta - //SaveEdgeAssociatedInfo(boost::bind(&Graph::EdgeNucls, component_.g(), _1), out); DEBUG("Saving sequences, " << file_name <<" created"); for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) { EdgeId e = *iter; @@ -411,11 +405,6 @@ } } -// template -// void LoadEdgeAssociatedInfo(const C& c, ostream& out) const { -// SaveEdgeAssociatedInfo(boost::bind(&C::operator[], c, _1), out); -// } - public: virtual void LoadGraph(const string& file_name) = 0; @@ -426,7 +415,7 @@ } bool LoadFlankingCoverage(const string& file_name, FlankingCoverage& flanking_cov) { - if (!path::FileExists(file_name + ".flcvr")) { + if (!fs::FileExists(file_name + ".flcvr")) { INFO("Flanking coverage saves are absent"); return false; } @@ -659,7 +648,7 @@ inline std::string MakeSingleReadsFileName(const std::string& file_name, size_t index) { - return file_name + "_paths_" + ToString(index) + ".mpr"; + return file_name + "_paths_" + std::to_string(index) + ".mpr"; } //helper methods @@ -728,21 +717,21 @@ void PrintUnclusteredIndices(const string& file_name, DataPrinter& printer, const UnclusteredPairedInfoIndicesT& paired_indices) { for (size_t i = 0; i < paired_indices.size(); ++i) - PrintUnclusteredIndex(file_name + "_" + ToString(i), printer, paired_indices[i]); + PrintUnclusteredIndex(file_name + "_" + std::to_string(i), printer, paired_indices[i]); } template void PrintClusteredIndices(const string& file_name, DataPrinter& printer, const PairedInfoIndicesT& paired_indices) { for (size_t i = 0; i < paired_indices.size(); ++i) - PrintClusteredIndex(file_name + "_" + ToString(i), printer, paired_indices[i]); + PrintClusteredIndex(file_name + "_" + std::to_string(i), printer, paired_indices[i]); } template void PrintScaffoldingIndices(const string& file_name, DataPrinter& printer, const PairedInfoIndicesT& paired_indices) { for (size_t i = 0; i < paired_indices.size(); ++i) - PrintScaffoldingIndex(file_name + "_" + ToString(i), printer, paired_indices[i]); + PrintScaffoldingIndex(file_name + "_" + std::to_string(i), printer, paired_indices[i]); } template @@ -920,7 +909,7 @@ UnclusteredPairedInfoIndicesT& paired_indices, bool force_exists = true) { for (size_t i = 0; i < paired_indices.size(); ++i) - ScanPairedIndex(file_name + "_" + ToString(i), scanner, paired_indices[i], force_exists); + ScanPairedIndex(file_name + "_" + std::to_string(i), scanner, paired_indices[i], 
force_exists); } template @@ -928,7 +917,7 @@ PairedInfoIndicesT& paired_indices, bool force_exists = true) { for (size_t i = 0; i < paired_indices.size(); ++i) - ScanClusteredIndex(file_name + "_" + ToString(i), scanner, paired_indices[i], force_exists); + ScanClusteredIndex(file_name + "_" + std::to_string(i), scanner, paired_indices[i], force_exists); } template @@ -936,7 +925,7 @@ PairedInfoIndicesT& paired_indices, bool force_exists = true) { for (size_t i = 0; i < paired_indices.size(); ++i) - ScanScaffoldingIndex(file_name + "_" + ToString(i), scanner, paired_indices[i], force_exists); + ScanScaffoldingIndex(file_name + "_" + std::to_string(i), scanner, paired_indices[i], force_exists); } template @@ -944,7 +933,7 @@ PairedInfoIndicesT& scaffold_indices) { for (size_t i = 0; i < scaffold_indices.size(); ++i) { - ScanScaffoldIndex(file_name + "_" + ToString(i), scanner, scaffold_indices[i]); + ScanScaffoldIndex(file_name + "_" + std::to_string(i), scanner, scaffold_indices[i]); } } diff -Nru spades-3.10.1+dfsg/src/common/pipeline/graph_pack.hpp spades-3.11.1+dfsg/src/common/pipeline/graph_pack.hpp --- spades-3.10.1+dfsg/src/common/pipeline/graph_pack.hpp 2017-02-28 14:55:36.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/pipeline/graph_pack.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -7,8 +7,8 @@ #pragma once -#include "utils/indices/edge_position_index.hpp" -#include "utils/indices/storing_traits.hpp" +#include "assembly_graph/index/edge_position_index.hpp" +#include "utils/ph_map/storing_traits.hpp" #include "sequence/genome_storage.hpp" #include "assembly_graph/handlers/id_track_handler.hpp" #include "assembly_graph/handlers/edges_position_handler.hpp" @@ -23,8 +23,9 @@ #include "assembly_graph/graph_support/detail_coverage.hpp" #include "assembly_graph/components/connected_component.hpp" #include "modules/alignment/kmer_mapper.hpp" -#include "common/visualization/position_filler.hpp" -#include "common/assembly_graph/paths/bidirectional_path.hpp" +#include "visualization/position_filler.hpp" +#include "assembly_graph/paths/bidirectional_path_container.hpp" +#include "common/modules/alignment/rna/ss_coverage.hpp" namespace debruijn_graph { @@ -50,6 +51,7 @@ PairedInfoIndicesT clustered_indices; PairedInfoIndicesT scaffolding_indices; LongReadContainerT single_long_reads; + vector ss_coverage; GenomicInfo ginfo; GenomeStorage genome; @@ -59,7 +61,7 @@ path_extend::PathContainer contig_paths; graph_pack(size_t k, const std::string &workdir, size_t lib_count, - const std::string &genome = "", + const std::vector &genome = std::vector(0), size_t flanking_range = 50, size_t max_mapping_gap = 0, size_t max_gap_diff = 0, @@ -71,6 +73,7 @@ clustered_indices(g, lib_count), scaffolding_indices(g, lib_count), single_long_reads(g, lib_count), + ss_coverage(lib_count, SSCoverageStorage(g)), genome(genome), edge_qual(g), edge_pos(g, max_mapping_gap + k, max_gap_diff), @@ -121,8 +124,8 @@ edge_pos.Attach(); } edge_pos.clear(); - visualization::position_filler::FillPos(*this, genome.GetSequence(), "ref0"); - visualization::position_filler::FillPos(*this, !genome.GetSequence(), "ref1"); + visualization::position_filler::FillPos(*this, genome.str(), "ref0"); + visualization::position_filler::FillPos(*this, ReverseComplement(genome.str()), "ref1"); } void EnsureDebugInfo() { diff -Nru spades-3.10.1+dfsg/src/common/pipeline/library.cpp spades-3.11.1+dfsg/src/common/pipeline/library.cpp --- spades-3.10.1+dfsg/src/common/pipeline/library.cpp 2017-02-28 14:55:37.000000000 +0000 +++ 
spades-3.11.1+dfsg/src/common/pipeline/library.cpp 2017-09-28 09:05:10.000000000 +0000 @@ -6,7 +6,7 @@ //*************************************************************************** #include "pipeline/library.hpp" -#include "utils/path_helper.hpp" +#include "utils/filesystem/path_helper.hpp" #include "llvm/Support/YAMLTraits.h" #include "llvm/Support/Errc.h" diff -Nru spades-3.10.1+dfsg/src/common/pipeline/library.hpp spades-3.11.1+dfsg/src/common/pipeline/library.hpp --- spades-3.10.1+dfsg/src/common/pipeline/library.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/pipeline/library.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -8,8 +8,8 @@ #ifndef __IO_LIBRARY_HPP__ #define __IO_LIBRARY_HPP__ -#include "common/adt/chained_iterator.hpp" -#include "common/adt/iterator_range.hpp" +#include "adt/chained_iterator.hpp" +#include "adt/iterator_range.hpp" #include @@ -88,7 +88,7 @@ inner_iterator right_; }; - typedef chained_iterator::const_iterator> single_reads_iterator; + typedef typename adt::chained_iterator::const_iterator> single_reads_iterator; SequencingLibraryBase() : type_(LibraryType::PairedEnd), orientation_(LibraryOrientation::FR) {} @@ -261,8 +261,8 @@ public: typedef typename LibraryStorage::iterator iterator; typedef typename LibraryStorage::const_iterator const_iterator; - typedef chained_iterator single_reads_iterator; - typedef chained_iterator paired_reads_iterator; + typedef adt::chained_iterator single_reads_iterator; + typedef adt::chained_iterator paired_reads_iterator; DataSet() {} explicit DataSet(const std::string &path) { load(path); } diff -Nru spades-3.10.1+dfsg/src/common/pipeline/library.inl spades-3.11.1+dfsg/src/common/pipeline/library.inl --- spades-3.10.1+dfsg/src/common/pipeline/library.inl 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/pipeline/library.inl 2017-09-28 09:05:10.000000000 +0000 @@ -55,7 +55,7 @@ throw; } - std::string input_dir = path::parent_path(filename); + std::string input_dir = fs::parent_path(filename); if (input_dir[input_dir.length() - 1] != '/') input_dir += '/'; diff -Nru spades-3.10.1+dfsg/src/common/pipeline/stage.cpp spades-3.11.1+dfsg/src/common/pipeline/stage.cpp --- spades-3.10.1+dfsg/src/common/pipeline/stage.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/pipeline/stage.cpp 2017-09-28 09:05:10.000000000 +0000 @@ -18,7 +18,7 @@ void AssemblyStage::load(debruijn_graph::conj_graph_pack& gp, const std::string &load_from, const char* prefix) { - std::string p = path::append_path(load_from, prefix == NULL ? id_ : prefix); + std::string p = fs::append_path(load_from, prefix == NULL ? id_ : prefix); INFO("Loading current state from " << p); debruijn_graph::graphio::ScanAll(p, gp, false); @@ -28,7 +28,7 @@ void AssemblyStage::save(const debruijn_graph::conj_graph_pack& gp, const std::string &save_to, const char* prefix) const { - std::string p = path::append_path(save_to, prefix == NULL ? id_ : prefix); + std::string p = fs::append_path(save_to, prefix == NULL ? 
id_ : prefix); INFO("Saving current state to " << p); debruijn_graph::graphio::PrintAll(p, gp); diff -Nru spades-3.10.1+dfsg/src/common/sequence/genome_storage.hpp spades-3.11.1+dfsg/src/common/sequence/genome_storage.hpp --- spades-3.10.1+dfsg/src/common/sequence/genome_storage.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/sequence/genome_storage.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -10,23 +10,68 @@ #include "sequence.hpp" #include "nucl.hpp" +struct Chromosome { + std::string name; + std::string sequence; + Chromosome(string chr_name, string seq): name(chr_name), sequence(seq){} +}; + class GenomeStorage { - std::string s_; +//all chromosomes glued + std::string glued_genome_; + std::vector full_genome_; + + std::string ACGTise(const std::string &s) const { + std::stringstream ss; + std::string tmp(s); + transform(tmp.begin(), tmp.end(), tmp.begin(), ::toupper); + for (size_t i = 0; i < tmp.length(); i++){ + if (is_nucl(tmp[i])) { + ss << tmp[i]; + } + } + return ss.str(); + } + public: GenomeStorage() { } - GenomeStorage(const std::string &s): s_(s) { + GenomeStorage(const std::string &s): glued_genome_(s), full_genome_() { + full_genome_.push_back(Chromosome("genome", ACGTise(glued_genome_))); } + GenomeStorage(const vector &chromosomes): full_genome_(chromosomes) { + std::stringstream ss; + for (const auto &s: chromosomes) { + ss << ACGTise(s.sequence); +//do we need a separator between? + } + glued_genome_ = ss.str(); + } + + GenomeStorage(const vector &chromosomes): full_genome_() { + std::stringstream ss; + int count = 0; + for (const auto &s: chromosomes) { + count ++; + std::string fxd = ACGTise(s); + full_genome_.push_back(Chromosome("chr" + std::to_string(count), fxd)); + ss << fxd; +//do we need a separator between? + } + glued_genome_ = ss.str(); + } + + //TODO exterminate this where possible Sequence GetSequence() const { stringstream ss; size_t l = 0, r = 0; - for(size_t i = 0; i < s_.size(); i++) { - if (!is_nucl(s_[i]) ) { + for (size_t i = 0; i < glued_genome_.size(); i++) { + if (!is_nucl(glued_genome_[i]) ) { if (r > l) { - ss << s_.substr(l, r - l); + ss << glued_genome_.substr(l, r - l); } r = i + 1; l = i + 1; @@ -35,21 +80,25 @@ } } if (r > l) { - ss << s_.substr(l, r - l); + ss << glued_genome_.substr(l, r - l); } return Sequence(ss.str()); } + std::vector GetChromosomes() const{ + return full_genome_; + } + void SetSequence(const Sequence &s) { - s_ = s.str(); + glued_genome_ = s.str(); } std::string str() const { - return s_; + return glued_genome_; } size_t size() const { - return s_.size(); + return glued_genome_.size(); } }; diff -Nru spades-3.10.1+dfsg/src/common/sequence/levenshtein.hpp spades-3.11.1+dfsg/src/common/sequence/levenshtein.hpp --- spades-3.10.1+dfsg/src/common/sequence/levenshtein.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/sequence/levenshtein.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -0,0 +1,241 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
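GenomeStorage above now stores chromosomes individually (full_genome_) as well as glued into one string (glued_genome_), after upper-casing and dropping non-nucleotide characters via ACGTise. A standalone version of that normalisation step, with is_nucl approximated by an explicit ACGT test:

    #include <cctype>
    #include <sstream>
    #include <string>

    std::string Acgtise(const std::string &s) {
        std::stringstream ss;
        for (char raw : s) {
            char c = char(std::toupper(static_cast<unsigned char>(raw)));
            if (c == 'A' || c == 'C' || c == 'G' || c == 'T')  // keep nucleotides only
                ss << c;
        }
        return ss.str();
    }

So gluing "ACgt-N" and "ttAA" yields "ACGT" + "TTAA" == "ACGTTTAA"; whether a separator belongs between chromosomes is left as an open question in the source itself.
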
diff -Nru spades-3.10.1+dfsg/src/common/sequence/levenshtein.hpp spades-3.11.1+dfsg/src/common/sequence/levenshtein.hpp
--- spades-3.10.1+dfsg/src/common/sequence/levenshtein.hpp 1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/sequence/levenshtein.hpp 2017-09-28 09:05:11.000000000 +0000
@@ -0,0 +1,241 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "utils/stl_utils.hpp"
+
+/*
+ * Slightly modified copy-paste from http://www.merriampark.com/ldcpp.htm
+ */
+inline size_t edit_distance(const std::string &source, const std::string &target) {
+
+    // Step 1
+
+    const size_t n = source.length();
+    const size_t m = target.length();
+    if (n == 0) {
+        return m;
+    }
+    if (m == 0) {
+        return n;
+    }
+
+    // Good form to declare a TYPEDEF
+
+    typedef std::vector<std::vector<size_t> > Tmatrix;
+
+    Tmatrix matrix(n + 1);
+
+    // Size the vectors in the 2nd dimension. Unfortunately C++ doesn't
+    // allow for allocation on declaration of 2nd dimension of vec of vec
+
+    for (size_t i = 0; i <= n; i++) {
+        matrix[i].resize(m + 1);
+    }
+
+    // Step 2
+
+    for (size_t i = 0; i <= n; i++) {
+        matrix[i][0] = i;
+    }
+
+    for (size_t j = 0; j <= m; j++) {
+        matrix[0][j] = j;
+    }
+
+    // Step 3
+
+    for (size_t i = 1; i <= n; i++) {
+
+        const char s_i = source[i - 1];
+
+        // Step 4
+
+        for (size_t j = 1; j <= m; j++) {
+
+            const char t_j = target[j - 1];
+
+            // Step 5
+
+            size_t cost;
+            if (s_i == t_j) {
+                cost = 0;
+            }
+            else {
+                cost = 1;
+            }
+
+            // Step 6
+
+            const size_t above = matrix[i - 1][j];
+            const size_t left = matrix[i][j - 1];
+            const size_t diag = matrix[i - 1][j - 1];
+            size_t cell = std::min(above + 1, std::min(left + 1, diag + cost));
+
+            // Step 6A: Cover transposition, in addition to deletion,
+            // insertion and substitution. This step is taken from:
+            // Berghel, Hal ; Roach, David : "An Extension of Ukkonen's
+            // Enhanced Dynamic Programming ASM Algorithm"
+            // (http://www.acm.org/~hlb/publications/asm/asm.html)
+
+            if (i > 2 && j > 2) {
+                size_t trans = matrix[i - 2][j - 2] + 1;
+                if (source[i - 2] != t_j) trans++;
+                if (s_i != target[j - 2]) trans++;
+                if (cell > trans) cell = trans;
+            }
+
+            matrix[i][j] = cell;
+        }
+    }
+
+    // Step 7
+
+    return matrix[n][m];
+}
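Since `edit_distance()` above keeps the Berghel-Roach transposition step (guarded by `i > 2 && j > 2`), adjacent swaps past the first two positions cost 1 instead of 2. A few spot checks of the values the function should return:

#include <cassert>
#include "sequence/levenshtein.hpp" // assumed include path within src/common

int main() {
    assert(edit_distance("kitten", "sitting") == 3); // two substitutions, one insertion
    assert(edit_distance("abcd", "abdc") == 1);      // transposition counted once
    assert(edit_distance("ab", "ba") == 2);          // the i > 2 && j > 2 guard skips front swaps
    assert(edit_distance("", "acgt") == 4);          // pure insertions
    return 0;
}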
+
+inline std::pair<std::pair<size_t, size_t>, std::string> best_edit_distance_cigar(const std::string &source,
+                                                                                  const std::string &target) {
+
+    // Step 1
+
+    const size_t n = source.length();
+    const size_t m = target.length();
+//    if (n == 0) {
+//        return m;
+//    }
+//    if (m == 0) {
+//        return n;
+//    }
+
+    // Good form to declare a TYPEDEF
+
+    typedef std::vector<std::vector<int> > Tmatrix;
+
+    Tmatrix matrix(n + 1);
+
+    // Size the vectors in the 2nd dimension. Unfortunately C++ doesn't
+    // allow for allocation on declaration of 2nd dimension of vec of vec
+
+    for (size_t i = 0; i <= n; i++) {
+        matrix[i].resize(m + 1);
+    }
+
+    // Step 2
+
+    for (size_t i = 0; i <= n; i++) {
+        matrix[i][0] = (int) i;
+    }
+
+    for (size_t j = 0; j <= m; j++) {
+        matrix[0][j] = 0; //free inserts in front
+    }
+
+    // Step 3
+
+    for (size_t i = 1; i <= n; i++) {
+
+        const char s_i = source[i - 1];
+
+        // Step 4
+
+        for (size_t j = 1; j <= m; j++) {
+
+            const char t_j = target[j - 1];
+
+            // Step 5
+
+            int cost;
+            if (s_i == t_j) {
+                cost = 0;
+            }
+            else {
+                cost = 1;
+            }
+
+            // Step 6
+
+            const int above = matrix[i - 1][j];
+            const int left = matrix[i][j - 1];
+            const int diag = matrix[i - 1][j - 1];
+            int cell = std::min(above + 1, std::min(left + 1, diag + cost));
+
+            // Step 6A: Cover transposition, in addition to deletion,
+            // insertion and substitution. This step is taken from:
+            // Berghel, Hal ; Roach, David : "An Extension of Ukkonen's
+            // Enhanced Dynamic Programming ASM Algorithm"
+            // (http://www.acm.org/~hlb/publications/asm/asm.html)
+
+//            if (i>2 && j>2) {
+//                int trans=matrix[i-2][j-2]+1;
+//                if (source[i-2]!=t_j) trans++;
+//                if (s_i!=target[j-2]) trans++;
+//                if (cell>trans) cell=trans;
+//            }
+
+            matrix[i][j] = cell;
+        }
+    }
+
+    // Step 7
+    int min = matrix[n][m];
+    size_t min_m = m;
+
+    for (size_t j = 0; j <= m; j++) {
+        if (min > matrix[n][j]) {
+            min = matrix[n][j];
+            min_m = j;
+        }
+    }
+
+//    INFO("min = " << min);
+    std::string res = "";
+    char last_operation = 0;
+    size_t cnt_last_operation = 0;
+    size_t cur_pos_i = n;
+    size_t cur_pos_j = min_m;
+    char cur_operation = 0;
+
+//    if (min > 0) {
+//        for (int i = 0; i <= n; i++) {
+//            INFO(std::to_string(matrix[i]));
+//        }
+//    }
+
+    while ((cur_pos_i > 0) && (cur_pos_j > 0)) {
+        if (matrix[cur_pos_i - 1][cur_pos_j] < matrix[cur_pos_i][cur_pos_j]) {
+            cur_operation = 'I';
+            cur_pos_i--;
+        }
+        else {
+            if (matrix[cur_pos_i][cur_pos_j - 1] < matrix[cur_pos_i][cur_pos_j]) {
+                cur_operation = 'D';
+                cur_pos_j--;
+            }
+            else {
+                cur_operation = 'M';
+                cur_pos_i--;
+                cur_pos_j--;
+            }
+        }
+        if (cur_operation != last_operation) {
+            if (last_operation != 0)
+                res = std::to_string(cnt_last_operation) + last_operation + res;
+            last_operation = cur_operation;
+            cnt_last_operation = 1;
+        }
+        else {
+            cnt_last_operation++;
+        }
+    }
+    res = std::to_string(cnt_last_operation) + last_operation + res;
+    return std::make_pair(std::make_pair(cur_pos_j, min_m), res);
+}
diff -Nru spades-3.10.1+dfsg/src/common/sequence/range.hpp spades-3.11.1+dfsg/src/common/sequence/range.hpp
--- spades-3.10.1+dfsg/src/common/sequence/range.hpp 1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/sequence/range.hpp 2017-09-28 09:05:10.000000000 +0000
@@ -0,0 +1,87 @@
+#pragma once
+
+#include "utils/verify.hpp"
+
+struct Range {
+private:
+    bool inside(size_t left, size_t right, size_t point) const {
+        return left <= point && point <= right;
+    }
+
+public:
+    //inclusive
+    size_t start_pos;
+    //exclusive
+    size_t end_pos;
+
+    size_t size() const {
+        VERIFY(end_pos >= start_pos);
+        return end_pos - start_pos;
+    }
+
+    void shift(int shift) {
+        VERIFY(shift > 0 || size_t(-shift) <= start_pos);
+        start_pos += shift;
+        end_pos += shift;
+    }
+
+    Range(): start_pos(0), end_pos(0) {
+        VERIFY(end_pos >= start_pos);
+    }
+
+    Range(size_t start_pos, size_t end_pos)
+            : start_pos(start_pos),
+              end_pos(end_pos) {
+        VERIFY(end_pos >= start_pos);
+    }
+
+    bool operator<(const Range &other) const {
+        if (start_pos != other.start_pos)
+            return start_pos < other.start_pos;
+        return end_pos < other.end_pos;
+    }
+
+    bool contains(const Range& that) const {
+        return start_pos <= that.start_pos && end_pos >= that.end_pos;
+    }
+
+    Range Merge(const Range &other) const {
+        return Range(this->start_pos, other.end_pos);
+    }
+
+    Range Invert(size_t base_length) const {
+        VERIFY(base_length >= end_pos);
+        return Range(base_length - end_pos, base_length - start_pos);
+    }
+
+    Range& operator=(const Range& other) {
+        start_pos = other.start_pos;
+        end_pos = other.end_pos;
+        return *this;
+    }
+
+    bool empty() const {
+        return start_pos == end_pos;
+    }
+
+    bool Intersect(const Range &other) const {
+        return inside(start_pos, end_pos, other.start_pos) || inside(start_pos, end_pos, other.end_pos) ||
+                inside(other.start_pos, other.end_pos, start_pos);
+    }
+
+    bool IntersectLeftOf(const Range &other) const {
+        return inside(start_pos, end_pos, other.start_pos) && inside(other.start_pos, other.end_pos, end_pos);
+    }
+
+    bool operator==(const Range &that) const {
+        return start_pos == that.start_pos && end_pos ==
that.end_pos; + } + + bool operator!=(const Range &that) const { + return !(*this == that); + } +}; + +inline std::ostream& operator<<(std::ostream& os, const Range& range) { + return os << "[" << range.start_pos << ", " << range.end_pos << ")"; +} diff -Nru spades-3.10.1+dfsg/src/common/sequence/rtseq.hpp spades-3.11.1+dfsg/src/common/sequence/rtseq.hpp --- spades-3.10.1+dfsg/src/common/sequence/rtseq.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/sequence/rtseq.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -20,7 +20,7 @@ #include #include #include "nucl.hpp" -#include "utils/log.hpp" +#include "math/log.hpp" #include "seq_common.hpp" #include "seq.hpp" #include "simple_seq.hpp" @@ -255,6 +255,9 @@ } } + explicit RuntimeSeq(size_t k, const RuntimeSeq &seq) + : RuntimeSeq(k, seq.data_.data()) {} + template explicit RuntimeSeq(const Seq &seq, bool) : size_(size2_) { VERIFY(size_ <= max_size_); @@ -323,6 +326,10 @@ this->data_[cur] = 0; } + RuntimeSeq start(size_t K) const { + return RuntimeSeq(K, data_.data()); + } + /** * Reads sequence from the file (in the same format as BinWrite writes it) * and returns false if error occured, true otherwise. @@ -668,20 +675,20 @@ return operator[](0); } - static size_t GetHash(const DataType *data, size_t sz, uint32_t seed = 0) { + static size_t GetHash(const DataType *data, size_t sz, uint64_t seed = 0) { return CityHash64WithSeed((const char *) data, sz * sizeof(DataType), 0x9E3779B9 ^ seed); } - size_t GetHash(unsigned seed = 0) const { + size_t GetHash(uint64_t seed = 0) const { return GetHash(data_.data(), GetDataSize(size_), seed); } struct hash { - size_t operator()(const RuntimeSeq &seq, uint32_t seed = 0) const { + size_t operator()(const RuntimeSeq &seq, uint64_t seed = 0) const { return seq.GetHash(seed); } - size_t operator()(const DataType *data, size_t sz, unsigned seed = 0) { + size_t operator()(const DataType *data, size_t sz, uint64_t seed = 0) { return GetHash(data, sz, seed); } }; diff -Nru spades-3.10.1+dfsg/src/common/sequence/seq.hpp spades-3.11.1+dfsg/src/common/sequence/seq.hpp --- spades-3.10.1+dfsg/src/common/sequence/seq.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/sequence/seq.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -36,7 +36,7 @@ #include "utils/verify.hpp" #include "nucl.hpp" -#include "utils/log.hpp" +#include "math/log.hpp" #include "seq_common.hpp" @@ -461,20 +461,20 @@ return operator[](0); } - static size_t GetHash(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) { + static size_t GetHash(const DataType *data, size_t sz = DataSize, uint64_t seed = 0) { return CityHash64WithSeed((const char *) data, sz * sizeof(DataType), 0x9E3779B9 ^ seed); } - size_t GetHash(uint32_t seed = 0) const { + size_t GetHash(uint64_t seed = 0) const { return GetHash(data_.data(), DataSize, seed); } struct hash { - size_t operator()(const Seq &seq, uint32_t seed = 0) const { + size_t operator()(const Seq &seq, uint64_t seed = 0) const { return seq.GetHash(seed); } - size_t operator()(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) { + size_t operator()(const DataType *data, size_t sz = DataSize, uint64_t seed = 0) { return GetHash(data, sz, seed); } }; diff -Nru spades-3.10.1+dfsg/src/common/sequence/sequence.hpp spades-3.11.1+dfsg/src/common/sequence/sequence.hpp --- spades-3.10.1+dfsg/src/common/sequence/sequence.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/sequence/sequence.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -16,6 
+16,9 @@
 #include "seq.hpp"
 #include "rtseq.hpp"
+#include <llvm/ADT/IntrusiveRefCntPtr.h>
+#include <llvm/Support/TrailingObjects.h>
+
 class Sequence {
     // Type to store Seq in Sequences
     typedef seq_element_type ST;
@@ -26,16 +29,36 @@
     // Number of bits in STN (for faster div and mod)
     const static size_t STNBits = log_<STN, 2>::value;
-    template<typename T>
-    struct array_deleter {
-        void operator()(const T *p) { delete[] p; }
+    class ManagedNuclBuffer final : public llvm::ThreadSafeRefCountedBase<ManagedNuclBuffer>,
+                                    protected llvm::TrailingObjects<ManagedNuclBuffer, ST> {
+        friend TrailingObjects;
+
+        ManagedNuclBuffer() {}
+
+        ManagedNuclBuffer(size_t nucls, ST *buf) {
+            std::uninitialized_copy(buf, buf + Sequence::DataSize(nucls), data());
+        }
+
+    public:
+        static ManagedNuclBuffer *create(size_t nucls) {
+            void *mem = ::operator new(totalSizeToAlloc<ST>(Sequence::DataSize(nucls)));
+            return new (mem) ManagedNuclBuffer();
+        }
+
+        static ManagedNuclBuffer *create(size_t nucls, ST *data) {
+            void *mem = ::operator new(totalSizeToAlloc<ST>(Sequence::DataSize(nucls)));
+            return new (mem) ManagedNuclBuffer(nucls, data);
+        }
+
+        const ST *data() const { return getTrailingObjects<ST>(); }
+        ST *data() { return getTrailingObjects<ST>(); }
     };
 private:
     size_t from_;
     size_t size_;
     bool rtl_; // Right to left + complementary (?)
-    std::shared_ptr<ST> data_;
+    llvm::IntrusiveRefCntPtr<ManagedNuclBuffer> data_;
     static size_t DataSize(size_t size) {
         return (size + STN - 1) >> STNBits;
@@ -44,7 +67,7 @@
     template<typename S>
     void InitFromNucls(const S &s, bool rc = false) {
         size_t bytes_size = DataSize(size_);
-        ST *bytes = data_.get();
+        ST *bytes = data_->data();
         VERIFY(is_dignucl(s[0]) || is_nucl(s[0]));
@@ -95,64 +118,64 @@
     }
+    Sequence(size_t size, int)
+            : from_(0), size_(size), rtl_(false), data_(ManagedNuclBuffer::create(size_)) {}
+
 public:
     /**
      * Sequence initialization (arbitrary size string)
      *
      * @param s ACGT or 0123-string
      */
-    explicit Sequence(const char *s, bool rc = false) :
-            from_(0), size_(strlen(s)), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
+    explicit Sequence(const char *s, bool rc = false)
+            : Sequence(strlen(s), 0) {
         InitFromNucls(s, rc);
     }
-    explicit Sequence(char *s, bool rc = false) :
-            from_(0), size_(strlen(s)), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
+    explicit Sequence(char *s, bool rc = false)
+            : Sequence(strlen(s), 0) {
         InitFromNucls(s, rc);
     }
     template<typename S>
-    explicit Sequence(const S &s, bool rc = false) :
-            from_(0), size_(s.size()), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
+    explicit Sequence(const S &s, bool rc = false)
+            : Sequence(s.size(), 0) {
         InitFromNucls(s, rc);
     }
-    Sequence() :
-            from_(0), size_(0), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
-        memset(data_.get(), 0, DataSize(size_));
+    Sequence()
+            : Sequence(size_t(0), 0) {
+        memset(data_->data(), 0, DataSize(size_));
     }
     template<size_t size2_>
-    explicit Sequence(const Seq<size2_> &kmer, size_t) :
-            from_(0), size_(kmer.size()), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
-
-        kmer.copy_data(data_.get());
+    explicit Sequence(const Seq<size2_> &kmer, size_t)
+            : Sequence(kmer.size(), 0) {
+        kmer.copy_data(data_->data());
     }
     template<size_t size2_>
-    explicit Sequence(const RuntimeSeq<size2_> &kmer, size_t) :
-            from_(0), size_(kmer.size()), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
-
-        kmer.copy_data(data_.get());
+    explicit Sequence(const RuntimeSeq<size2_> &kmer, size_t)
+            : Sequence(kmer.size(), 0) {
+        kmer.copy_data(data_->data());
    }
-    Sequence(const Sequence &seq, size_t from, size_t size, bool rtl) :
-            from_(from), size_(size), rtl_(rtl), data_(seq.data_) {
-    }
+    Sequence(const Sequence &seq, size_t from, size_t size,
bool rtl) + : from_(from), size_(size), rtl_(rtl), data_(seq.data_) {} - Sequence(const Sequence &s) : - from_(s.from_), size_(s.size_), rtl_(s.rtl_), data_(s.data_) { - } + Sequence(const Sequence &s) + : Sequence(s, s.from_, s.size_, s.rtl_) {} ~Sequence() { } const Sequence &operator=(const Sequence &rhs) { - if (&rhs != this) { - from_ = rhs.from_; - size_ = rhs.size_; - rtl_ = rhs.rtl_; - data_ = rhs.data_; - } + if (&rhs == this) + return *this; + + from_ = rhs.from_; + size_ = rhs.size_; + rtl_ = rhs.rtl_; + data_ = rhs.data_; return *this; } @@ -160,7 +183,7 @@ char operator[](const size_t index) const { //todo can be put back after switching to distributing release without asserts //VERIFY(index < size_); - const ST *bytes = data_.get(); + const ST *bytes = data_->data(); if (rtl_) { size_t i = from_ + size_ - 1 - index; return complement((bytes[i >> STNBits] >> ((i & (STN - 1)) << 1)) & 3); @@ -171,13 +194,11 @@ } bool operator==(const Sequence &that) const { - if (size_ != that.size_) { + if (size_ != that.size_) return false; - } - if (data_ == that.data_ && from_ == that.from_ && rtl_ == that.rtl_) { + if (data_ == that.data_ && from_ == that.from_ && rtl_ == that.rtl_) return true; - } for (size_t i = 0; i < size_; ++i) { if (this->operator[](i) != that[i]) { @@ -275,12 +296,10 @@ private: inline bool ReadHeader(std::istream &file); - inline bool WriteHeader(std::ostream &file) const; public: inline bool BinRead(std::istream &file); - inline bool BinWrite(std::ostream &file) const; }; @@ -302,7 +321,7 @@ size_t start = from_ >> STNBits; size_t end = (from_ + size_ - 1) >> STNBits; size_t shift = (from_ & (STN - 1)) << 1; - const ST *bytes = data_.get(); + const ST *bytes = data_->data(); for (size_t i = start; i <= end; ++i) { result[i - start] = bytes[i] >> shift; @@ -451,10 +470,10 @@ std::string Sequence::err() const { std::ostringstream oss; - oss << "{ *data=" << data_ << - ", from_=" << from_ << - ", size_=" << size_ << - ", rtl_=" << int(rtl_) << " }"; + oss << "{ *data=" << data_->data() << + ", from_=" << from_ << + ", size_=" << size_ << + ", rtl_=" << int(rtl_) << " }"; return oss.str(); } @@ -485,8 +504,8 @@ bool Sequence::BinRead(std::istream &file) { ReadHeader(file); - data_ = std::shared_ptr(new ST[DataSize(size_)], array_deleter()); - file.read((char *) data_.get(), DataSize(size_) * sizeof(ST)); + data_ = llvm::IntrusiveRefCntPtr(ManagedNuclBuffer::create(size_)); + file.read((char *) data_->data(), DataSize(size_) * sizeof(ST)); return !file.fail(); } @@ -500,7 +519,7 @@ WriteHeader(file); - file.write((const char *) data_.get(), DataSize(size_) * sizeof(ST)); + file.write((const char *) data_->data(), DataSize(size_) * sizeof(ST)); return !file.fail(); } @@ -536,6 +555,10 @@ return buf_.size(); } + void clear() { + return buf_.clear(); + } + char operator[](const size_t index) const { VERIFY(index < buf_.size()); return buf_[index]; diff -Nru spades-3.10.1+dfsg/src/common/sequence/sequence_tools.hpp spades-3.11.1+dfsg/src/common/sequence/sequence_tools.hpp --- spades-3.10.1+dfsg/src/common/sequence/sequence_tools.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/sequence/sequence_tools.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -14,7 +14,7 @@ #include "nucl.hpp" #include "sequence.hpp" -#include "utils/levenshtein.hpp" +#include "levenshtein.hpp" inline const std::string Reverse(const std::string &s) { return std::string(s.rbegin(), s.rend()); @@ -22,7 +22,7 @@ inline const std::string Complement(const std::string &s) { std::string 
res(s.size(), 0); - transform(s.begin(), s.end(), res.begin(), nucl_complement); + std::transform(s.begin(), s.end(), res.begin(), nucl_complement); return res; } diff -Nru spades-3.10.1+dfsg/src/common/sequence/simple_seq.hpp spades-3.11.1+dfsg/src/common/sequence/simple_seq.hpp --- spades-3.10.1+dfsg/src/common/sequence/simple_seq.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/sequence/simple_seq.hpp 2017-09-28 09:05:10.000000000 +0000 @@ -23,7 +23,7 @@ #include "utils/verify.hpp" #include "nucl.hpp" -#include "utils/log.hpp" +#include "math/log.hpp" #include "seq_common.hpp" /** * @param T is max number of nucleotides, type for storage diff -Nru spades-3.10.1+dfsg/src/common/stages/construction.cpp spades-3.11.1+dfsg/src/common/stages/construction.cpp --- spades-3.10.1+dfsg/src/common/stages/construction.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/stages/construction.cpp 2017-09-28 09:05:11.000000000 +0000 @@ -22,7 +22,7 @@ config::debruijn_config::construction params = cfg::get().con; params.early_tc.enable &= !cfg::get().gap_closer_enable; - ReadStatistics stats = ConstructGraphWithCoverage(params, streams, gp.g, + utils::ReadStatistics stats = ConstructGraphWithCoverage(params, streams, gp.g, gp.index, gp.flanking_cov, contigs_stream); size_t rl = stats.max_read_length_; diff -Nru spades-3.10.1+dfsg/src/common/stages/simplification.cpp spades-3.11.1+dfsg/src/common/stages/simplification.cpp --- spades-3.10.1+dfsg/src/common/stages/simplification.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/stages/simplification.cpp 2017-09-28 09:05:11.000000000 +0000 @@ -11,6 +11,7 @@ #include "stages/simplification_pipeline/graph_simplification.hpp" #include "stages/simplification_pipeline/single_cell_simplification.hpp" #include "stages/simplification_pipeline/rna_simplification.hpp" +#include "modules/simplification/cleaner.hpp" #include "simplification.hpp" @@ -59,17 +60,17 @@ return true; } - void RemoveShortPolyATEdges(size_t max_length, - HandlerF removal_handler = 0, size_t chunk_cnt = 1) { + void RemoveShortPolyATEdges(HandlerF removal_handler, size_t chunk_cnt) { INFO("Removing short polyAT"); EdgeRemover er(g_, removal_handler); - ATCondition condition (g_, 0.8, max_length, false); - for (auto iter = g_.SmartEdgeBegin(); !iter.IsEnd(); ++iter){ + ATCondition condition(g_, 0.8, false); + for (auto iter = g_.SmartEdgeBegin(/*canonical only*/true); !iter.IsEnd(); ++iter){ if (g_.length(*iter) == 1 && condition.Check(*iter)) { er.DeleteEdgeNoCompress(*iter); } } - ParallelCompress(g_, chunk_cnt); + omnigraph::CompressAllVertices(g_, chunk_cnt); + omnigraph::CleanIsolatedVertices(g_, chunk_cnt); } void InitialCleaning() { @@ -85,8 +86,12 @@ algos); if (info_container_.mode() == config::pipeline_type::rna){ - RemoveShortPolyATEdges(1, removal_handler_, info_container_.chunk_cnt()); - PushValid(ShortPolyATEdgesRemoverInstance(g_, 1, removal_handler_, info_container_.chunk_cnt()), "Short PolyA/T Edges",algos) ; + //TODO create algo + RemoveShortPolyATEdges(removal_handler_, info_container_.chunk_cnt()); + PushValid(std::make_shared>(g_, func::And(LengthUpperBound(g_, 1), + ATCondition(g_, 0.8, false)), + info_container_.chunk_cnt(), removal_handler_, true), + "Short PolyA/T Edges", algos) ; PushValid(ATTipClipperInstance(g_, removal_handler_, info_container_.chunk_cnt()), "AT Tips", algos); } @@ -113,7 +118,7 @@ removal_handler_), "Initial ec remover", algos); - + PushValid( LowFlankDisconnectorInstance(g_, 
gp_.flanking_cov, simplif_cfg_.init_clean.disconnect_flank_cov, info_container_, @@ -255,12 +260,6 @@ //FIXME need better configuration if (info_container_.mode() == config::pipeline_type::meta) { - PushValid( - BRInstance(g_, simplif_cfg_.second_final_br, - info_container_, removal_handler_), - "Yet another final bulge remover", - algos); - EdgePredicate meta_thorn_condition = And(LengthUpperBound(g_, LengthThresholdFinder::MaxErroneousConnectionLength( g_.k(), simplif_cfg_.isec.max_ec_length_coefficient)), @@ -286,17 +285,25 @@ PushValid(ATTipClipperInstance(g_, removal_handler_, info_container_.chunk_cnt()), "AT Tips", algos); } + PushValid( + LowCoverageEdgeRemoverInstance(g_, + simplif_cfg_.lcer, + info_container_), + "Removing edges with low coverage", + algos); + + const size_t primary_launch_cnt = 2; size_t iteration = 0; bool enable_flag = true; - while (enable_flag) { + while (iteration < primary_launch_cnt || enable_flag) { enable_flag = false; - INFO("Iteration " << iteration); + INFO("Iteration " << iteration + 1); enable_flag |= FinalRemoveErroneousEdges(); cnt_callback_.Report(); - enable_flag |= RunAlgos(algos); + enable_flag |= RunAlgos(algos, iteration < primary_launch_cnt); iteration++; @@ -401,7 +408,7 @@ //cannot stop simply if nothing changed, since threshold changes on every iteration while (iteration < simplif_cfg_.cycle_iter_count || graph_changed) { INFO("PROCEDURE == Simplification cycle, iteration " << iteration + 1); - graph_changed = RunAlgos(algos); + graph_changed = RunAlgos(algos, iteration == simplif_cfg_.cycle_iter_count - 1); ++iteration; } @@ -536,8 +543,10 @@ auto isolated_edge_remover = IsolatedEdgeRemoverInstance(gp.g, cfg::get().simp.ier, info_container, (EdgeRemovalHandlerF)nullptr); - if (isolated_edge_remover != nullptr) + if (isolated_edge_remover != nullptr) { + INFO("Removing isolated edges"); isolated_edge_remover->Run(); + } double low_threshold = gp.ginfo.trusted_bound(); if (math::gr(low_threshold, 0.0)) { diff -Nru spades-3.10.1+dfsg/src/common/stages/simplification_pipeline/graph_simplification.hpp spades-3.11.1+dfsg/src/common/stages/simplification_pipeline/graph_simplification.hpp --- spades-3.10.1+dfsg/src/common/stages/simplification_pipeline/graph_simplification.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/stages/simplification_pipeline/graph_simplification.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -382,11 +382,12 @@ DEBUG("Length bound " << max_length); return AlternativesAnalyzer(g, config.max_coverage, - max_length, - config.max_relative_coverage, - config.max_delta, - config.max_relative_delta, - config.max_number_edges); + max_length, + config.max_relative_coverage, + config.max_delta, + config.max_relative_delta, + config.max_number_edges, + config.dijkstra_vertex_limit); } template @@ -488,17 +489,21 @@ template AlgoPtr IsolatedEdgeRemoverInstance(Graph &g, - config::debruijn_config::simplification::isolated_edges_remover ier, + config::debruijn_config::simplification::isolated_edge_remover ier, const SimplifInfoContainer &info, EdgeRemovalHandlerF removal_handler = 0) { if (!ier.enabled) { return nullptr; } - size_t max_length_any_cov = std::max(info.read_length(), ier.max_length_any_cov); + size_t max_length_any_cov = ier.use_rl_for_max_length_any_cov ? + std::max(info.read_length(), ier.max_length_any_cov) : ier.max_length_any_cov; + size_t max_length = ier.use_rl_for_max_length ? 
+ std::max(info.read_length(), ier.max_length) : ier.max_length; + auto condition = func::And(IsolatedEdgeCondition(g), func::Or(LengthUpperBound(g, max_length_any_cov), - func::And(LengthUpperBound(g, ier.max_length), + func::And(LengthUpperBound(g, max_length), CoverageUpperBound(g, ier.max_coverage)))); return std::make_shared>(g, @@ -608,15 +613,10 @@ auto alternatives_analyzer = ParseBRConfig(g, br_config); - auto candidate_finder = std::make_shared>( - omnigraph::NecessaryBulgeCondition(g, - alternatives_analyzer.max_length(), - alternatives_analyzer.max_coverage()), - info.chunk_cnt()); if (br_config.parallel) { INFO("Creating parallel br instance"); return make_shared>(g, - candidate_finder, + info.chunk_cnt(), br_config.buff_size, br_config.buff_cov_diff, br_config.buff_cov_rel_diff, @@ -627,7 +627,7 @@ } else { INFO("Creating br instance"); return make_shared>(g, - candidate_finder, + info.chunk_cnt(), alternatives_analyzer, nullptr, removal_handler, @@ -657,6 +657,25 @@ } template +AlgoPtr LowCoverageEdgeRemoverInstance(Graph &g, + const config::debruijn_config::simplification::low_covered_edge_remover &lcer_config, + const SimplifInfoContainer &info) { + if (!lcer_config.enabled) { + return nullptr; + } + VERIFY(info.read_length() > g.k()); + double threshold = lcer_config.coverage_threshold * double(info.read_length() - g.k()) / double(info.read_length()); + INFO("Low coverage edge removal (LCER) activated and will remove edges of coverage lower than " << threshold); + return make_shared>> + (g, + CoverageUpperBound(g, threshold), + info.chunk_cnt(), + (EdgeRemovalHandlerF)nullptr, + /*canonical_only*/true, + CoverageComparator(g)); +} + +template bool RemoveHiddenLoopEC(Graph &g, const FlankingCoverage &flanking_cov, double determined_coverage_threshold, diff -Nru spades-3.10.1+dfsg/src/common/stages/simplification_pipeline/rna_simplification.hpp spades-3.11.1+dfsg/src/common/stages/simplification_pipeline/rna_simplification.hpp --- spades-3.10.1+dfsg/src/common/stages/simplification_pipeline/rna_simplification.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/stages/simplification_pipeline/rna_simplification.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -7,15 +7,10 @@ namespace simplification { template -AlgoPtr ShortPolyATEdgesRemoverInstance(Graph &g, size_t max_length, EdgeRemovalHandlerF removal_handler = 0, size_t chunk_cnt = 1) { - auto condition = func::And(ATCondition(g, 0.8, max_length, false), LengthUpperBound(g, 1)); - return std::make_shared>(g, condition, chunk_cnt, removal_handler, true); -} - -template AlgoPtr ATTipClipperInstance(Graph &g, EdgeRemovalHandlerF removal_handler = 0, size_t chunk_cnt = 1) { //TODO: review params 0.8, 200? 
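// Note on LowCoverageEdgeRemoverInstance() above: the configured cutoff is
// rescaled by (read_length - k) / read_length, the usual base-to-k-mer
// coverage correction, before edges are compared against it. A worked example
// (lcer_threshold is an illustrative stand-in, not a SPAdes function):
#include <cassert>
#include <cmath>

double lcer_threshold(double coverage_threshold, double read_length, double k) {
    return coverage_threshold * (read_length - k) / read_length;
}

int main() {
    // 100 bp reads, k = 55, configured cutoff 2.0 => edges below 0.9x are removed
    assert(std::fabs(lcer_threshold(2.0, 100.0, 55.0) - 0.9) < 1e-12);
    return 0;
}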
- return std::make_shared>(g, ATCondition(g, 0.8, 200, true), chunk_cnt, removal_handler, true); + return std::make_shared>(g, func::And(omnigraph::LengthUpperBound(g, 200), ATCondition(g, 0.8, true)), + chunk_cnt, removal_handler, true); } } diff -Nru spades-3.10.1+dfsg/src/common/utils/autocompletion.cpp spades-3.11.1+dfsg/src/common/utils/autocompletion.cpp --- spades-3.10.1+dfsg/src/common/utils/autocompletion.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/autocompletion.cpp 2017-09-28 09:05:11.000000000 +0000 @@ -11,7 +11,7 @@ #include #include -namespace online_visualization { +namespace utils { std::vector commands; diff -Nru spades-3.10.1+dfsg/src/common/utils/autocompletion.hpp spades-3.11.1+dfsg/src/common/utils/autocompletion.hpp --- spades-3.10.1+dfsg/src/common/utils/autocompletion.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/autocompletion.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -9,7 +9,7 @@ #include #include -namespace online_visualization { +namespace utils { void InitAutocompletion(const std::vector& commands); diff -Nru spades-3.10.1+dfsg/src/common/utils/CMakeLists.txt spades-3.11.1+dfsg/src/common/utils/CMakeLists.txt --- spades-3.10.1+dfsg/src/common/utils/CMakeLists.txt 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/CMakeLists.txt 2017-09-28 09:05:11.000000000 +0000 @@ -8,8 +8,8 @@ project(utils CXX) set(utils_src - copy_file.cpp - path_helper.cpp + filesystem/copy_file.cpp + filesystem/path_helper.cpp logger/logger_impl.cpp) if (READLINE_FOUND) diff -Nru spades-3.10.1+dfsg/src/common/utils/copy_file.cpp spades-3.11.1+dfsg/src/common/utils/copy_file.cpp --- spades-3.10.1+dfsg/src/common/utils/copy_file.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/copy_file.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,158 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#include "copy_file.hpp" - -#include "utils/path_helper.hpp" -#include "utils/logger/logger.hpp" - -#include - -#include -#include -#include -#include - -#include -#include - -#include -#include - -namespace path { - -namespace details { - -using namespace path; - -void copy_file(std::string from_path, std::string to_path) { - using namespace std; - - make_full_path(from_path); - make_full_path(to_path ); - - if (from_path == to_path) - return; - - std::ifstream source(from_path, ios::binary); - std::ofstream dest (to_path.c_str() , ios::binary); - - dest << source.rdbuf(); -} - - -void hard_link(std::string from_path, std::string to_path) { - make_full_path(from_path); - make_full_path(to_path ); - - if (from_path == to_path) - return; - - if (link(from_path.c_str(), to_path.c_str()) == -1) { - WARN("Failed to create link. Reason: " << strerror(errno) << ". Error code: " << errno << ". 
Copying instead"); - copy_file(from_path, to_path); - } -} - -files_t files_in_folder(std::string const& path) { - DIR *dp; - if ((dp = opendir(path.c_str())) == NULL) - throw std::runtime_error("can not open folder " + path); - - files_t files; - - struct dirent *dirp; - while ((dirp = readdir(dp)) != NULL) - if (dirp->d_type == DT_REG) - files.push_back(append_path(path, dirp->d_name)); - - closedir(dp); - return files; -} - -files_t folders_in_folder(std::string const& path) { - DIR *dp; - if ((dp = opendir(path.c_str())) == NULL) - throw std::runtime_error("can not open folder " + path); - - files_t folders; - - struct dirent *dirp; - while ((dirp = readdir(dp)) != NULL) - if (dirp->d_type == DT_DIR) { - std::string folder = dirp->d_name; - - if (folder != "." && folder != "..") - folders.push_back(append_path(path, folder)); - } - - closedir(dp); - return folders; -} - -} // details - -path::files_t files_by_prefix(std::string const& path) { - using namespace details; - files_t files; - - std::string folder(parent_path(path)); - std::string prefix = filename(path); - - files_t out_files; - const files_t all_files = files_in_folder(folder); - - for (auto it = all_files.begin(); it != all_files.end(); ++it) // no std::copy_if before C++11 - if (boost::starts_with(filename(*it), prefix)) - out_files.push_back(*it); - - return out_files; -} - -void copy_files_by_prefix(path::files_t const& files, std::string const& to_folder) { - using namespace details; - - for (auto it = files.begin(); it != files.end(); ++it) { - files_t files_to_copy = files_by_prefix(*it); - - for (auto it = files_to_copy.begin(); it != files_to_copy.end(); ++it) - copy_file(*it, append_path(to_folder, filename(*it))); - } -} - -void link_files_by_prefix(path::files_t const& files, std::string const& to_folder) { - using namespace details; - - for (auto it = files.begin(); it != files.end(); ++it) { - files_t files_to_copy = files_by_prefix(*it); - - for (auto it = files_to_copy.begin(); it != files_to_copy.end(); ++it) - hard_link(*it, append_path(to_folder, filename(*it))); - } -} - -void copy_files_by_ext(std::string const& from_folder, std::string const& to_folder, std::string const& ext, bool recursive) { - using namespace details; - - files_t files = files_in_folder(from_folder); - - for (auto it = files.begin(); it != files.end(); ++it) - if (boost::ends_with(*it, ext)) - copy_file(*it, append_path(to_folder, filename(*it))); - - if (recursive) { - files_t folders = folders_in_folder(from_folder); - - for (auto it = folders.begin(); it != folders.end(); ++it) { - std::string subdir = append_path(to_folder, filename(*it)); - path:: make_dir(subdir); - copy_files_by_ext(*it, subdir, ext, recursive); - } - } -} - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/copy_file.hpp spades-3.11.1+dfsg/src/common/utils/copy_file.hpp --- spades-3.10.1+dfsg/src/common/utils/copy_file.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/copy_file.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
-//*************************************************************************** - -#include "utils/path_helper.hpp" -#include - -namespace path { - -path::files_t files_by_prefix(std::string const& path); -void copy_files_by_prefix(path::files_t const& files, std::string const& to_folder); -void link_files_by_prefix(path::files_t const& files, std::string const& to_folder); -void copy_files_by_ext(std::string const& from_folder, std::string const& to_folder, std::string const& ext, bool recursive); - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/coverage_model/CMakeLists.txt spades-3.11.1+dfsg/src/common/utils/coverage_model/CMakeLists.txt --- spades-3.10.1+dfsg/src/common/utils/coverage_model/CMakeLists.txt 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/coverage_model/CMakeLists.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,14 +0,0 @@ -############################################################################ -# Copyright (c) 2015 Saint Petersburg State University -# Copyright (c) 2011-2014 Saint Petersburg Academic University -# All Rights Reserved -# See file LICENSE for details. -############################################################################ - -project(coverage_model CXX) - -add_library(coverage_model STATIC - kmer_coverage_model.cpp) - -target_link_libraries(coverage_model nlopt) - diff -Nru spades-3.10.1+dfsg/src/common/utils/coverage_model/kmer_coverage_model.cpp spades-3.11.1+dfsg/src/common/utils/coverage_model/kmer_coverage_model.cpp --- spades-3.10.1+dfsg/src/common/utils/coverage_model/kmer_coverage_model.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/coverage_model/kmer_coverage_model.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,380 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#include "kmer_coverage_model.hpp" - -#include "utils/logger/logger.hpp" -#include "utils/verify.hpp" -#include "math/xmath.h" -#include "math/smooth.hpp" - -#include -#include -#include -#include -#include - -#include - -#include - -#include -#include -#include -#include - -namespace utils { -namespace coverage_model { - -using std::isfinite; - -static const size_t MaxCopy = 10; - -static double dzeta(double x, double p) { - return pow(x, -p - 1) / boost::math::zeta(p + 1); -} - -static double perr(size_t i, double scale, double shape) { - return pow((1 + shape * ((double) (i - 1)) / scale), -1.0 / shape) - - pow((1 + shape * ((double) i) / scale), -1.0 / shape); -} - -static double pgood(size_t i, double zp, double u, double sd, double shape, - double* mixprobs = NULL) { - double res = 0; - - for (unsigned copy = 0; copy < MaxCopy; ++copy) { - boost::math::skew_normal snormal((copy + 1) * u, sd * sqrt(copy + 1), shape); - // res += (mixprobs ? mixprobs[copy] : dzeta(copy + 1, zp)) * (boost::math::cdf(snormal, i + 1) - boost::math::cdf(snormal, i)); - res += (mixprobs ? 
mixprobs[copy] : dzeta(copy + 1, zp)) * boost::math::pdf(snormal, i); - } - - return res; -} - -class CovModelLogLike { - const std::vector& cov; - -public: - CovModelLogLike(const std::vector& cov) - : cov(cov) {} - - int getN() const { return 7; }; - -private: - - double eval_(const double* x) const { - double zp = x[0], p = x[1], shape = x[2], u = x[3], sd = x[4], scale = x[5], shape2 = x[6]; - - if (zp <= 1 || shape <= 0 || sd <= 0 || p < 1e-9 || p > 1 - 1e-9 || u <= 0 || scale <= 0 || - !isfinite(zp) || !isfinite(shape) || !isfinite(sd) || !isfinite(p) || !isfinite(u) || - !isfinite(scale) || !isfinite(shape2)) - return +std::numeric_limits::infinity(); - - std::vector kmer_probs(cov.size()); - - // Error - for (size_t i = 0; i < kmer_probs.size(); ++i) - kmer_probs[i] += p * perr(i + 1, scale, shape); - - // Good - for (size_t i = 0; i < kmer_probs.size(); ++i) - kmer_probs[i] += (1 - p) * pgood(i + 1, zp, u, sd, shape2); - - double res = 0; - for (size_t i = 0; i < kmer_probs.size(); ++i) - res += (double) (cov[i]) * log(kmer_probs[i]); - - return -res; - } -}; - -struct CovModelLogLikeEMData { - const std::vector& cov; - const std::vector& z; -}; - -static double CovModelLogLikeEM(unsigned, const double* x, double*, void* data) { - double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5]; - - // INFO("Entry: " << x[0] << " " << x[1] << " " << x[2] << " " << x[3] << " " << x[4]); - - if (zp <= 1 || shape <= 0 || sd <= 0 || u <= 0 || scale <= 0 || - !isfinite(zp) || !isfinite(shape) || !isfinite(sd) || !isfinite(u) || - !isfinite(scale) || !isfinite(shape2)) - return -std::numeric_limits::infinity(); - - const std::vector& cov = static_cast(data)->cov; - const std::vector& z = static_cast(data)->z; - - std::vector kmer_probs(cov.size(), 0); - - // Error - for (size_t i = 0; i < kmer_probs.size(); ++i) { - if (cov[i] == 0) - continue; - - kmer_probs[i] += z[i] * log(perr(i + 1, scale, shape)); - } - - // Good - // Pre-compute mixing probabilities - std::vector mixprobs(MaxCopy, 0); - for (unsigned copy = 0; copy < MaxCopy; ++copy) - mixprobs[copy] = dzeta(copy + 1, zp); - - // Compute the density - for (size_t i = 0; i < kmer_probs.size(); ++i) { - if (cov[i] == 0) - continue; - - double val = log(pgood(i + 1, zp, u, sd, shape2, &mixprobs[0])); - if (!isfinite(val)) - val = -1000.0; - kmer_probs[i] += (1 - z[i]) * val; - } - - double res = 0; - for (size_t i = 0; i < kmer_probs.size(); ++i) - res += (double) (cov[i]) * kmer_probs[i]; - - // INFO("f: " << res); - return res; -} - - -static std::vector EStep(const std::vector& x, - double p, size_t N) { - double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5]; - - std::vector res(N); - for (size_t i = 0; i < N; ++i) { - double pe = p * perr(i + 1, scale, shape); - res[i] = pe / (pe + (1 - p) * pgood(i + 1, zp, u, sd, shape2)); - if (!isfinite(res[i])) - res[i] = 1.0; - } - - return res; -} - -// Estimate the coverage mean by finding the max past the -// first valley. 
-size_t KMerCoverageModel::EstimateValley() const { - // Smooth the histogram - std::vector scov; - math::Smooth3RS3R(scov, cov_); - - size_t Valley = scov[0]; - - // Start finding the valley - size_t Idx = 1; - while (scov[Idx] < Valley && Idx < scov.size()) { - Valley = scov[Idx]; - Idx += 1; - } - Idx -= 1; - - INFO("Kmer coverage valley at: " << Idx); - - return Idx; -} - -void KMerCoverageModel::Fit() { - VERIFY_MSG(cov_.size() > 10, "Invalid kmer coverage histogram, make sure that the coverage is indeed uniform"); - - // Find the minimal coverage point using smoothed histogram. - Valley_ = EstimateValley(); - - // First estimate of coverage is the first maximum after the valley. - MaxCov_ = Valley_ + 1; - size_t MaxHist = cov_[MaxCov_]; - for (size_t i = Valley_ + 1; i < cov_.size(); ++i) { - if (cov_[i] > MaxHist) { - MaxHist = cov_[i]; - MaxCov_ = i; - } - } - INFO("K-mer histogram maximum: " << MaxCov_); - - // Refine the estimate via median - size_t AfterValley = 0, SecondValley = std::min(2 * MaxCov_ - Valley_, cov_.size()); - for (size_t i = Valley_ + 1; i < SecondValley; ++i) - AfterValley += cov_[i]; - - size_t ccov = 0; - for (size_t i = Valley_ + 1; i < SecondValley; ++i) { - if (ccov > AfterValley / 2) { - MaxCov_ = std::max(i, MaxCov_); - break; - } - ccov += cov_[i]; - } - - if (MaxCov_ - Valley_ < 3) - WARN("Too many erroneous kmers, the estimates might be unreliable"); - - std::vector mvals(1 + MaxCov_ - Valley_); - mvals[0] = cov_[MaxCov_]; - size_t tmadcov = mvals[0]; - for (size_t i = 1; i < std::min(MaxCov_ - Valley_, cov_.size() - MaxCov_); ++i) { - mvals[i] = cov_[MaxCov_ + i] + cov_[MaxCov_ - i]; - tmadcov += mvals[i]; - } - size_t madcov = 0; - double CovSd = sqrt((double) (5 * MaxCov_)); - for (size_t i = 0; i < MaxCov_ - Valley_; ++i) { - if (madcov > tmadcov / 2) { - CovSd = (double) i; - break; - } - madcov += mvals[i]; - } - CovSd *= 1.4826; - INFO("Estimated median coverage: " << MaxCov_ << ". Coverage mad: " << CovSd); - - // Estimate error probability as ratio of kmers before the valley. - size_t BeforeValley = 0, Total = 0; - double ErrorProb = 0; - for (size_t i = 0; i < cov_.size(); ++i) { - if (i <= Valley_) - BeforeValley += cov_[i]; - Total += cov_[i]; - } - ErrorProb = (double) BeforeValley / (double) Total; - // Allow some erroneous / good kmers. - ErrorProb = std::min(1 - 1e-3, ErrorProb); - ErrorProb = std::max(1e-3, ErrorProb); - - TRACE("Total: " << Total << ". Before: " << BeforeValley); - TRACE("p: " << ErrorProb); - - std::vector x = {3.0, 3.0, (double) MaxCov_, CovSd, 1.0, 0.0}, - lb = {0.0, 0.0, 0.0, (double) (MaxCov_ - Valley_), 0.0, -6.0}, - ub = {2000.0, 2000.0, (double) (2 * MaxCov_), (double) SecondValley, 2000.0, 6.0}; - - INFO("Fitting coverage model"); - // Ensure that there will be at least 2 iterations. 
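// Note on the KMerCoverageModel being removed here: its EStep() assigns each
// coverage bin the posterior probability of being erroneous,
//     z[i] = p * P_err(i) / (p * P_err(i) + (1 - p) * P_good(i)),
// and Fit() alternates that E-step with a Nelder-Mead refit of the mixture
// until the error probability stabilizes. A toy evaluation of the posterior
// (the density values are invented for illustration):
#include <cstdio>

double posterior_error(double p, double perr_i, double pgood_i) {
    double pe = p * perr_i;
    return pe / (pe + (1 - p) * pgood_i);
}

int main() {
    std::printf("%.3f\n", posterior_error(0.3, 0.50, 0.001)); // ~0.995: almost surely an error
    std::printf("%.3f\n", posterior_error(0.3, 0.001, 0.05)); // ~0.008: almost surely genomic
    return 0;
}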
- double PrevErrProb = 2; - const double ErrProbThr = 1e-8; - auto GoodCov = cov_; - GoodCov.resize(std::min(cov_.size(), 5 * MaxCopy * MaxCov_ / 4)); - converged_ = true; - unsigned it = 1; - while (fabs(PrevErrProb - ErrorProb) > ErrProbThr) { - // Recalculate the vector of posterior error probabilities - std::vector z = EStep(x, ErrorProb, GoodCov.size()); - - // Recalculate the probability of error - PrevErrProb = ErrorProb; - ErrorProb = 0; - for (size_t i = 0; i < GoodCov.size(); ++i) - ErrorProb += z[i] * (double) GoodCov[i]; - ErrorProb /= (double) Total; - - bool LastIter = fabs(PrevErrProb - ErrorProb) <= ErrProbThr; - - nlopt::opt opt(nlopt::LN_NELDERMEAD, 6); - CovModelLogLikeEMData data = {GoodCov, z}; - opt.set_max_objective(CovModelLogLikeEM, &data); - if (!LastIter) - opt.set_maxeval(5 * 6 * it); - opt.set_xtol_rel(1e-8); - opt.set_ftol_rel(1e-8); - - double fMin; - nlopt::result Results = nlopt::FAILURE; - try { - Results = opt.optimize(x, fMin); - } catch (nlopt::roundoff_limited&) { - } - - VERBOSE_POWER_T2(it, 1, "... iteration " << it); - TRACE("Results: "); - TRACE("Converged: " << Results << " " << "F: " << fMin); - - double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5]; - TRACE("zp: " << zp << " p: " << ErrorProb << " shape: " << shape << " u: " << u << " sd: " << sd << - " scale: " << scale << " shape2: " << shape2); - - it += 1; - } - - double delta = x[5] / sqrt(1 + x[5] * x[5]); - mean_coverage_ = x[2] + x[3] * delta * sqrt(2 / M_PI); - sd_coverage_ = x[3] * sqrt(1 - 2 * delta * delta / M_PI); - INFO("Fitted mean coverage: " << mean_coverage_ << ". Fitted coverage std. dev: " << sd_coverage_); - - // Now let us check whether we have sane results - for (size_t i = 0; i < x.size(); ++i) - if (!isfinite(x[i])) { - converged_ = false; - break; - } - - if (!isfinite(ErrorProb)) - converged_ = false; - - // See, if we can deduce proper threshold - - // First, check whether initial estimate of Valley was sane. - ErrorThreshold_ = 0; - if (converged_ && Valley_ > x[2] && x[2] > 2) { - Valley_ = (size_t) math::round(x[2] / 2.0); - WARN("Valley value was estimated improperly, reset to " << Valley_); - } - - // If the model converged, then use it to estimate the thresholds. - if (converged_) { - std::vector z = EStep(x, ErrorProb, GoodCov.size()); - - INFO("Probability of erroneous kmer at valley: " << z[Valley_]); - converged_ = false; - for (size_t i = 0; i < z.size(); ++i) - if (z[i] > strong_probability_threshold_) //0.999 - LowThreshold_ = std::min(i + 1, Valley_); - else if (z[i] < probability_threshold_) {//0.05? - ErrorThreshold_ = std::max(i + 1, Valley_); - converged_ = true; - break; - } - -#if 0 - for (size_t i = 0; i < z.size(); ++i) { - double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5]; - double pe = ErrorProb * perr(i + 1, scale, shape); - double pg = (1 - ErrorProb) * pgood(i + 1, zp, u, sd, shape2); - - fprintf(stderr, "%e %e %e %e\n", pe, pg, z[i], perr(i + 1, scale, shape)); - } -#endif - } - - // See, if we have sane ErrorThreshold_ and go down to something convervative, if not. - if (converged_) { - INFO("Preliminary threshold calculated as: " << ErrorThreshold_); - ErrorThreshold_ = (Valley_ < mean_coverage_ ? - std::min(Valley_ + (size_t) (mean_coverage_ - (double) Valley_) / 2, ErrorThreshold_) : - Valley_); - INFO("Threshold adjusted to: " << ErrorThreshold_); - } else { - ErrorThreshold_ = Valley_; - LowThreshold_ = 1; - WARN("Failed to determine erroneous kmer threshold. 
Threshold set to: " << ErrorThreshold_); - } - - // Now the bonus: estimate the genome size! - GenomeSize_ = 0; - for (size_t i = ErrorThreshold_ - 1; i < GoodCov.size(); ++i) - GenomeSize_ += GoodCov[i]; - GenomeSize_ /= 2; - - INFO("Estimated genome size (ignoring repeats): " << GenomeSize_); -} - -} -} diff -Nru spades-3.10.1+dfsg/src/common/utils/coverage_model/kmer_coverage_model.hpp spades-3.11.1+dfsg/src/common/utils/coverage_model/kmer_coverage_model.hpp --- spades-3.10.1+dfsg/src/common/utils/coverage_model/kmer_coverage_model.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/coverage_model/kmer_coverage_model.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,48 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#pragma once - -#include -#include - -namespace utils { -namespace coverage_model { - -class KMerCoverageModel { - const std::vector& cov_; - size_t MaxCov_, Valley_, ErrorThreshold_, LowThreshold_, GenomeSize_; - double probability_threshold_, strong_probability_threshold_, mean_coverage_, sd_coverage_; - bool converged_; - -public: - KMerCoverageModel(const std::vector& cov, double probability_threshold, - double strong_probability_threshold) - : cov_(cov), LowThreshold_(0), probability_threshold_(probability_threshold), - strong_probability_threshold_(strong_probability_threshold), - mean_coverage_(0.0), sd_coverage_(0.0), converged_(false) {} - - void Fit(); - - size_t GetErrorThreshold() const { return ErrorThreshold_; } - - size_t GetLowThreshold() const { return LowThreshold_; } - - size_t GetGenomeSize() const { return GenomeSize_; } - - double GetMeanCoverage() const { return mean_coverage_; } - - double GetSdCoverage() const { return sd_coverage_; } - - bool converged() const { return converged_; } - -private: - size_t EstimateValley() const; -}; - -} -} diff -Nru spades-3.10.1+dfsg/src/common/utils/cpp_utils.hpp spades-3.11.1+dfsg/src/common/utils/cpp_utils.hpp --- spades-3.10.1+dfsg/src/common/utils/cpp_utils.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/cpp_utils.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -37,4 +37,9 @@ return t; } +template +inline bool instanceof(const T *ptr) { + return dynamic_cast(ptr) != nullptr; +} + } // namespace utils diff -Nru spades-3.10.1+dfsg/src/common/utils/debruijn_graph/debruijn_graph_constructor.hpp spades-3.11.1+dfsg/src/common/utils/debruijn_graph/debruijn_graph_constructor.hpp --- spades-3.10.1+dfsg/src/common/utils/debruijn_graph/debruijn_graph_constructor.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/debruijn_graph/debruijn_graph_constructor.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,558 +0,0 @@ -#pragma once -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
-//*************************************************************************** - -#include "assembly_graph/core/graph.hpp" -#include "assembly_graph/core/construction_helper.hpp" -#include "utils/standard_base.hpp" -#include "utils/indices/kmer_extension_index.hpp" -#include "utils/openmp_wrapper.h" -#include "utils/parallel_wrapper.hpp" - -namespace debruijn_graph { - -/* - * Constructs DeBruijnGraph from DeBruijn Graph using "new DeBruijnGraphConstructor(DeBruijn).ConstructGraph(DeBruijnGraph, Index)" - */ -template -class DeBruijnGraphConstructor { -private: - typedef typename Graph::EdgeId EdgeId; - typedef Index DeBruijn; - typedef typename Graph::VertexId VertexId; - typedef typename Index::KMer Kmer; - typedef typename DeBruijn::KeyWithHash KeyWithHash; - typedef typename DeBruijn::kmer_iterator kmer_iterator; - - Graph &graph_; - DeBruijn &origin_; - size_t kmer_size_; - - bool StepRightIfPossible(KeyWithHash &kwh) { - // VERIFY(origin_.contains(edge)); - if (origin_.RivalEdgeCount(kwh) == 1 - && origin_.NextEdgeCount(kwh) == 1) { - kwh = origin_.NextEdge(kwh); - // VERIFY(origin_.contains(next_edge)); - return true; - } - return false; - } - - KeyWithHash &GoRight(KeyWithHash &kwh) { - KeyWithHash initial = kwh; - while (StepRightIfPossible(kwh) && kwh != initial) { - ; - } - return kwh; - } - - KeyWithHash &GoLeft(KeyWithHash &kwh) { - //These strange things are in order to avoid making copies of kwh - kwh = !kwh; - kwh = !GoRight(kwh); - return kwh; - } - - Sequence ConstructSeqGoingRight(KeyWithHash &kwh) { - SequenceBuilder s; - s.append(kwh.key()); - KeyWithHash initial = kwh; - while (StepRightIfPossible(kwh) && kwh != initial) { - s.append(kwh[kmer_size_]); - } - return s.BuildSequence(); - } - - Sequence ConstructSequenceWithEdge(const KeyWithHash &kwh) { - KeyWithHash tmp = kwh; - return ConstructSeqGoingRight(GoLeft(tmp)); - } - - VertexId FindVertexByOutgoingEdges(Kmer kmer) { - for (char c = 0; c < 4; ++c) { - KeyWithHash edge = origin_.ConstructKWH(kmer.pushBack(c)); - if (origin_.contains(edge)) - return graph_.EdgeStart(origin_.get_value(edge).edge_id); - } - return VertexId(NULL); - } - - VertexId FindVertexByIncomingEdges(Kmer kmer) { - for (char c = 0; c < 4; ++c) { - KeyWithHash edge = origin_.ConstructKWH(kmer.pushFront(c)); - if (origin_.contains(edge)) { - return graph_.EdgeEnd(origin_.get_value(edge).edge_id); - } - } - return VertexId(NULL); - } - - VertexId FindVertex(Kmer kmer) { - VertexId v = FindVertexByOutgoingEdges(kmer); - return v == VertexId(NULL) ? FindVertexByIncomingEdges(kmer) : v; - } - - VertexId FindVertexMaybeMissing(Kmer kmer) { - VertexId v = FindVertex(kmer); - return v != VertexId(NULL) ? 
v : graph_.AddVertex(); - } - - VertexId FindEndMaybeMissing(const ConjugateDeBruijnGraph& graph, - VertexId start, Kmer start_kmer, Kmer end_kmer) { - if (start_kmer == end_kmer) { - return start; - } else if (start_kmer == !end_kmer) { - return graph.conjugate(start); - } else { - return FindVertexMaybeMissing(end_kmer); - } - } - - void ConstructPart(const std::vector& kwh_list, - std::vector& sequences) { - for (size_t i = 0; i < sequences.size(); ++i) { - if (origin_.contains(kwh_list[i])) { - continue; - } - - Kmer start_kmer = sequences[i].start < Kmer > (kmer_size_); - Kmer end_kmer = sequences[i].end < Kmer > (kmer_size_); - - VertexId start = FindVertexMaybeMissing(start_kmer); - VertexId end = FindEndMaybeMissing(graph_, start, start_kmer, - end_kmer); - - graph_.AddEdge(start, end, sequences[i]); - } - } - - void AddKmers(kmer_iterator &it, kmer_iterator &end, size_t queueSize, - std::vector& kwh_list) { - for (; kwh_list.size() != queueSize && it != end; ++it) { - KeyWithHash kwh = origin_.ConstructKWH(Kmer(unsigned(kmer_size_ + 1), (*it).data())); - - if (!origin_.contains(kwh)) - kwh_list.push_back(kwh); - } - } - - void CalculateSequences(std::vector &kwh_list, - std::vector &sequences) { - size_t size = kwh_list.size(); - sequences.resize(size); - -# pragma omp parallel for schedule(guided) - for (size_t i = 0; i < size; ++i) { - sequences[i] = ConstructSequenceWithEdge(kwh_list[i]); - } - } - -public: - DeBruijnGraphConstructor(Graph& graph, DeBruijn &origin) : - graph_(graph), origin_(origin), kmer_size_(graph_.k()) { - } - - void ConstructGraph(size_t queueMinSize, size_t queueMaxSize, - double queueGrowthRate) { - kmer_iterator it = origin_.kmer_begin(); - kmer_iterator end = origin_.kmer_end(); - size_t queueSize = queueMinSize; - std::vector kwh_list; - std::vector sequences; - kwh_list.reserve(queueSize); - sequences.reserve(queueMaxSize); - while (it != end) { - AddKmers(it, end, queueSize, kwh_list); // format a queue of kmers that are not in index - CalculateSequences(kwh_list, sequences); // in parallel - ConstructPart(kwh_list, sequences); - kwh_list.clear(); - queueSize = min(size_t(double(queueSize) * queueGrowthRate), queueMaxSize); - } - } - -private: - DECL_LOGGER("DeBruijnGraphConstructor") -}; - -class UnbranchingPathFinder { -private: - typedef DeBruijnExtensionIndex<> Index; - typedef RtSeq Kmer; - typedef Index::kmer_iterator kmer_iterator; - typedef Index::KeyWithHash KeyWithHash; - typedef Index::DeEdge DeEdge; - - Index &origin_; - size_t kmer_size_; - -public: - UnbranchingPathFinder(Index &origin, size_t kmer_size) : origin_(origin), kmer_size_(kmer_size) { - } - - bool StepRightIfPossible(DeEdge &edge) { - if (origin_.CheckUniqueOutgoing(edge.end) && origin_.CheckUniqueIncoming(edge.end)) { - edge = DeEdge(edge.end, origin_.GetUniqueOutgoing(edge.end)); - return true; - } - return false; - } - - Sequence ConstructSeqGoingRight(DeEdge edge) { - SequenceBuilder s; - s.append(edge.start.key()); - s.append(edge.end[kmer_size_ - 1]); - DeEdge initial = edge; - while (StepRightIfPossible(edge) && edge != initial) { - s.append(edge.end[kmer_size_ - 1]); - } - return s.BuildSequence(); - } - - Sequence ConstructSequenceWithEdge(DeEdge edge) { - return ConstructSeqGoingRight(edge); - } - - //Loop consists of 4 parts: 2 selfRC k+1-mers and two sequences of arbitrary length RC to each other; pos is a position of one of selfRC edges - vector SplitLoop(Sequence s, size_t pos) { - return {s.Subseq(pos, pos + kmer_size_ + 1), s.Subseq(pos + 1, s.size() - 
kmer_size_) + s.Subseq(0, pos + kmer_size_)}; - - } - -//TODO Think about what happends to self rc perfect loops - vector ConstructLoopFromVertex(const KeyWithHash &kh) { - DeEdge break_point(kh, origin_.GetUniqueOutgoing(kh)); - Sequence s = ConstructSequenceWithEdge(break_point); - Kmer kmer = s.start(kmer_size_ + 1) >> 'A'; - for(size_t i = kmer_size_; i < s.size(); i++) { - kmer = kmer << s[i]; - if (kmer == !kmer) { - return SplitLoop(s, i - kmer_size_); - } - } - return {s}; - } -}; - -class UnbranchingPathExtractor { -private: - typedef DeBruijnExtensionIndex<> Index; - typedef RtSeq Kmer; - typedef Index::kmer_iterator kmer_iterator; - typedef Index::DeEdge DeEdge; - typedef Index::KeyWithHash KeyWithHash; - - Index &origin_; - size_t kmer_size_; - - bool IsJunction(KeyWithHash kh) const { - return !(origin_.CheckUniqueOutgoing(kh) && origin_.CheckUniqueIncoming(kh)); - } - - void AddStartDeEdgesForVertex(KeyWithHash kh, std::vector& start_edges) const { - for (char next = 0; next < 4; next++) { - if (origin_.CheckOutgoing(kh, next)) { - TRACE("Added to queue " << DeEdge(kh, origin_.GetOutgoing(kh, next))); - start_edges.push_back(DeEdge(kh, origin_.GetOutgoing(kh, next))); - } - } - } - - void AddStartDeEdges(kmer_iterator &it, size_t queueSize, - std::vector& start_edges) const { - for (; start_edges.size() < queueSize && it.good(); ++it) { - KeyWithHash kh = origin_.ConstructKWH(Kmer(kmer_size_, *it)); - if (IsJunction(kh)) { - AddStartDeEdgesForVertex(kh, start_edges); - KeyWithHash kh_inv = !kh; - if(!(kh_inv.is_minimal())) { - AddStartDeEdgesForVertex(kh_inv, start_edges); - } - } - } - } - - void CalculateSequences(std::vector &edges, - std::vector &sequences, UnbranchingPathFinder &finder) const { - size_t size = edges.size(); - size_t start = sequences.size(); - sequences.resize(start + size); - -# pragma omp parallel for schedule(guided) - for (size_t i = 0; i < size; ++i) { - sequences[start + i] = finder.ConstructSequenceWithEdge(edges[i]); - TRACE("From " << edges[i] << " calculated sequence"); - TRACE(sequences[start + i]); - } - } - - void CleanCondensed(const Sequence &sequence) { - Kmer kmer = sequence.start(kmer_size_); - KeyWithHash kwh = origin_.ConstructKWH(kmer); - origin_.IsolateVertex(kwh); - for(size_t pos = kmer_size_; pos < sequence.size(); pos++) { - kwh = kwh << sequence[pos]; - origin_.IsolateVertex(kwh); - } - } - - void CleanCondensed(const std::vector &sequences) { -# pragma omp parallel for schedule(guided) - for (size_t i = 0; i < sequences.size(); ++i) { - CleanCondensed(sequences[i]); - } - } - - //This methods collects all loops that were not extracted by finding unbranching paths because there are no junctions on loops. - //TODO make parallel - const std::vector CollectLoops() { - INFO("Collecting perfect loops"); - UnbranchingPathFinder finder(origin_, kmer_size_); - std::vector result; - for (kmer_iterator it = origin_.kmer_begin(); it.good(); ++it) { - KeyWithHash kh = origin_.ConstructKWH(Kmer(kmer_size_, *it)); - if (!IsJunction(kh)) { - vector loop = finder.ConstructLoopFromVertex(kh); - for(Sequence s: loop) { - result.push_back(s); - CleanCondensed(s); - if(s != (!s)) { - result.push_back(!s); - } - } - } - } - INFO("Collecting perfect loops finished. " << result.size() << " loops collected"); - return result; - } - -public: - UnbranchingPathExtractor(Index &origin, size_t k) : origin_(origin), kmer_size_(k) { - } - - //TODO very large vector is returned. But I hate to make all those artificial changes that can fix it. 
- const std::vector ExtractUnbranchingPaths(size_t queueMinSize, size_t queueMaxSize, - double queueGrowthRate) { - INFO("Extracting unbranching paths"); - UnbranchingPathFinder finder(origin_, kmer_size_); - std::vector result; - size_t queueSize = queueMinSize; - std::vector start_edges; - std::vector sequences; - start_edges.reserve(queueSize); - auto it = origin_.kmer_begin(); - while (it.good()) { - AddStartDeEdges(it, queueSize, start_edges); // format a queue of junction kmers - CalculateSequences(start_edges, sequences, finder); // in parallel - start_edges.clear(); - queueSize = min((size_t) ((double) queueSize * queueGrowthRate), queueMaxSize); - } - INFO("Extracting unbranching paths finished. " << sequences.size() << " sequences extracted"); - return sequences; - } - - const std::vector ExtractUnbranchingPathsAndLoops(size_t queueMinSize, size_t queueMaxSize, - double queueGrowthRate) { - std::vector result = ExtractUnbranchingPaths(queueMinSize, queueMaxSize, queueGrowthRate); - CleanCondensed(result); - std::vector loops = CollectLoops(); - for(auto it = loops.begin(); it != loops.end(); ++it) { - result.push_back(*it); - } - return result; - } - -private: - DECL_LOGGER("UnbranchingPathExtractor") -}; - -/* - * Only works for Conjugate dbg - */ -template -class FastGraphFromSequencesConstructor { -private: - typedef typename Graph::EdgeId EdgeId; - typedef typename Graph::VertexId VertexId; - typedef RtSeq Kmer; - typedef DeBruijnExtensionIndex<> Index; - size_t kmer_size_; - Index &origin_; - - class LinkRecord { - private: - size_t hash_and_mask_; - EdgeId edge_; - - size_t BitBool(bool flag) const { - if(flag) - return 1; - return 0; - } - - public: - size_t GetHash() const { - return hash_and_mask_ >> 2; - } - - bool IsRC() const { - return hash_and_mask_ & 2; - } - - bool IsStart() const { - return hash_and_mask_ & 1; - } - - - EdgeId GetEdge() const { - return edge_; - } - - LinkRecord(size_t hash, EdgeId edge, bool is_start, bool is_rc) : - hash_and_mask_((hash << 2) | (BitBool(is_rc) << 1)| BitBool(is_start)), edge_(edge) { - } - - LinkRecord() : - hash_and_mask_(-1ul), edge_(0) { - } - - bool IsInvalid() { - return hash_and_mask_ + 1 == 0 && edge_ == EdgeId(0); - } - - bool operator<(const LinkRecord &other) const { - if(this->hash_and_mask_ == other.hash_and_mask_) - return this->edge_ < other.edge_; - return this->hash_and_mask_ < other.hash_and_mask_; - } - }; - - LinkRecord StartLink(const EdgeId &edge, const Sequence &sequence) const { - Kmer kmer(kmer_size_, sequence); - Kmer kmer_rc = !kmer; - if(kmer < kmer_rc) - return LinkRecord(origin_.ConstructKWH(kmer).idx(), edge, true, false); - else - return LinkRecord(origin_.ConstructKWH(kmer_rc).idx(), edge, true, true); - } - - LinkRecord EndLink(const EdgeId &edge, const Sequence &sequence) const { - Kmer kmer(kmer_size_, sequence, sequence.size() - kmer_size_); - Kmer kmer_rc = !kmer; - if(kmer < kmer_rc) - return LinkRecord(origin_.ConstructKWH(kmer).idx(), edge, false, false); - else - return LinkRecord(origin_.ConstructKWH(kmer_rc).idx(), edge, false, true); - } - - void CollectLinkRecords(typename Graph::HelperT &helper, const Graph &graph, vector &records, const vector &sequences) const { - size_t size = sequences.size(); - records.resize(size * 2, LinkRecord(0, EdgeId(0), false, false)); - restricted::IdSegmentStorage id_storage = helper.graph().GetGraphIdDistributor().Reserve(size * 2); -# pragma omp parallel for schedule(guided) - for (size_t i = 0; i < size; ++i) { - size_t j = i << 1; - auto 
id_distributor = id_storage.GetSegmentIdDistributor(j, j + 2);//indices for two edges are required - EdgeId edge = helper.AddEdge(DeBruijnEdgeData(sequences[i]), id_distributor); - records[j] = StartLink(edge, sequences[i]); - if(graph.conjugate(edge) != edge) - records[j + 1] = EndLink(edge, sequences[i]); - else - records[j + 1] = LinkRecord(); - } - } - - void LinkEdge(typename Graph::HelperT &helper, const Graph &graph, const VertexId v, const EdgeId edge, const bool is_start, const bool is_rc) const { - VertexId v1 = v; - if(is_rc) { - v1 = graph.conjugate(v); - } - if(is_start) { - helper.LinkOutgoingEdge(v1, edge); - } else { - helper.LinkIncomingEdge(v1, edge); - } - } - -public: - FastGraphFromSequencesConstructor(size_t k, Index &origin) : kmer_size_(k), origin_(origin) { - } - - void ConstructGraph(Graph &graph, const vector &sequences) const { - typename Graph::HelperT helper = graph.GetConstructionHelper(); - vector records; - CollectLinkRecords(helper, graph, records, sequences);//TODO make parallel - parallel::sort(records.begin(), records.end()); - size_t size = records.size(); - vector> vertices_list(omp_get_max_threads()); - restricted::IdSegmentStorage id_storage = helper.graph().GetGraphIdDistributor().Reserve(size * 2); -# pragma omp parallel for schedule(guided) - for(size_t i = 0; i < size; i++) { - if(i != 0 && records[i].GetHash() == records[i - 1].GetHash()) { - continue; - } - if(records[i].IsInvalid()) - continue; - auto id_distributor = id_storage.GetSegmentIdDistributor(i << 1, (i << 1) + 2); - VertexId v = helper.CreateVertex(DeBruijnVertexData(), id_distributor); - vertices_list[omp_get_thread_num()].push_back(v); - for(size_t j = i; j < size && records[j].GetHash() == records[i].GetHash(); j++) { - LinkEdge(helper, graph, v, records[j].GetEdge(), records[j].IsStart(), records[j].IsRC()); - } - } - for(size_t i = 0; i < vertices_list.size(); i++) - helper.AddVerticesToGraph(vertices_list[i].begin(), vertices_list[i].end()); - } -}; - -/* - * Constructs DeBruijnGraph from DeBruijnExtensionIndex using "new DeBruijnGraphExtentionConstructor(DeBruijn).ConstructGraph(DeBruijnGraph, Index)" - */ -template -class DeBruijnGraphExtentionConstructor { -private: - typedef typename Graph::EdgeId EdgeId; - typedef DeBruijnExtensionIndex<> DeBruijn; - typedef typename Graph::VertexId VertexId; - typedef RtSeq Kmer; - - Graph &graph_; - DeBruijn &origin_; - size_t kmer_size_; - - void FilterRC(std::vector &edge_sequences) { - size_t size = 0; - for(size_t i = 0; i < edge_sequences.size(); i++) { - if(!(edge_sequences[i] < !edge_sequences[i])) { - edge_sequences[size] = edge_sequences[i]; - size++; - } - } - edge_sequences.resize(size); - } - -public: - DeBruijnGraphExtentionConstructor(Graph& graph, DeBruijn &origin) : - graph_(graph), origin_(origin), kmer_size_(graph.k()) { - } - - void ConstructGraph(size_t queueMinSize, size_t queueMaxSize, - double queueGrowthRate, bool keep_perfect_loops) { - std::vector edge_sequences; - if(keep_perfect_loops) - edge_sequences = UnbranchingPathExtractor(origin_, kmer_size_).ExtractUnbranchingPathsAndLoops(queueMinSize, queueMaxSize, queueGrowthRate); - else - edge_sequences = UnbranchingPathExtractor(origin_, kmer_size_).ExtractUnbranchingPaths(queueMinSize, queueMaxSize, queueGrowthRate); - FilterRC(edge_sequences); - FastGraphFromSequencesConstructor(kmer_size_, origin_).ConstructGraph(graph_, edge_sequences); - } - -private: - DECL_LOGGER("DeBruijnGraphConstructor") -}; - -} diff -Nru 
spades-3.10.1+dfsg/src/common/utils/debruijn_graph/early_simplification.hpp spades-3.11.1+dfsg/src/common/utils/debruijn_graph/early_simplification.hpp --- spades-3.10.1+dfsg/src/common/utils/debruijn_graph/early_simplification.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/debruijn_graph/early_simplification.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,191 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#pragma once -#include "utils/standard_base.hpp" -#include "utils/indices/perfect_hash_map.hpp" -#include "utils/mph_index/kmer_index.hpp" - -namespace debruijn_graph { - -class LinkCleaner { -private: - typedef DeBruijnExtensionIndex<> Index; - typedef Index::KMer Kmer; - typedef Index::KeyWithHash KeyWithHash; - Index &index_; - - void CleanForwardLinks(KeyWithHash &kh, char i) { - if(index_.CheckOutgoing(kh, i)) { - KeyWithHash next_kh = index_.GetOutgoing(kh, i); - if(!index_.CheckIncoming(next_kh, kh[0])) { - index_.DeleteOutgoing(kh, i); - } - } - } - - void CleanBackwardLinks(KeyWithHash &kh, char i) { - if(index_.CheckIncoming(kh, i)) { - KeyWithHash prev_kh = index_.GetIncoming(kh, i); - if(!index_.CheckOutgoing(prev_kh, kh[index_.k() - 1])) { - index_.DeleteIncoming(kh, i); - } - } - } - -public: - LinkCleaner(Index &index) : index_(index) {} - - //TODO make parallel - void CleanLinks() { - vector iters = index_.kmer_begin(10 * omp_get_max_threads()); -# pragma omp parallel for schedule(guided) - for(size_t i = 0; i < iters.size(); i++) { - for (Index::kmer_iterator &it = iters[i]; it.good(); ++it) { - KeyWithHash kh = index_.ConstructKWH(RtSeq(index_.k(), *it)); - if (kh.is_minimal()) { - KeyWithHash kh = index_.ConstructKWH(RtSeq(index_.k(), *it)); - for (char i = 0; i < 4; i++) { - CleanForwardLinks(kh, i); - CleanBackwardLinks(kh, i); - } - } - } - } - } -}; - -class AlternativeEarlyTipClipper { -private: - typedef DeBruijnExtensionIndex<> Index; - typedef Index::KMer Kmer; - typedef Index::KeyWithHash KeyWithHash; - Index &index_; - size_t length_bound_; - - /* - * This method starts from the kmer that is second in the tip counting from junction vertex. It records all kmers of a tip into tip vector. - * The method returns length of a tip. - * In case it did not end as a tip or if it was too long tip vector is cleared and infinite length is returned. - * Thus tip vector contains only kmers to be removed while returned length value gives reasonable information of what happend. 
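For instance (a hypothetical trace): if a junction has a child B whose unbranching continuation is B -> C with C a dead end, FindForward called on B records [B, C] and returns 2; if the walk instead exceeds length_bound_ or runs into another junction, the vector is cleared and (size_t)-1 is returned, which the callers treat as an infinitely long, and hence unremovable, branch.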
- */ - size_t FindForward(KeyWithHash kh, vector &tip) { - while(tip.size() < length_bound_ && index_.CheckUniqueIncoming(kh) && index_.CheckUniqueOutgoing(kh)) { - tip.push_back(kh); - kh = index_.GetUniqueOutgoing(kh); - } - tip.push_back(kh); - if(index_.CheckUniqueIncoming(kh) && index_.IsDeadEnd(kh)) { - return tip.size(); - } - tip.clear(); - return -1; - } - - size_t FindBackward(KeyWithHash kh, vector &tip) { - while(tip.size() < length_bound_ && index_.CheckUniqueOutgoing(kh) && index_.CheckUniqueIncoming(kh)) { - tip.push_back(kh); - kh = index_.GetUniqueIncoming(kh); - } - tip.push_back(kh); - if(index_.CheckUniqueOutgoing(kh) && index_.IsDeadStart(kh)) { - return tip.size(); - } - tip.clear(); - return -1; - } - - size_t RemoveTip(vector &tip) { - for(size_t i = 0; i < tip.size(); i++) - index_.IsolateVertex(tip[i]); - return tip.size(); - } - - size_t RemoveTips(vector > tips, size_t max) { - size_t result = 0; - for(char c = 0; c < 4; c++) { - if(tips[c].size() < max) { - result += RemoveTip(tips[c]); - } - } - return result; - } - - size_t RemoveForward(KeyWithHash kh) { - vector> tips; - tips.resize(4); - size_t max = 0; - for(char c = 0; c < 4; c++) { - if(index_.CheckOutgoing(kh, c)) { - KeyWithHash khc = index_.GetOutgoing(kh, c); - size_t len = FindForward(khc, tips[c]); - if(len > max) - max = len; - } - } - return RemoveTips(tips, max); - } - - size_t RemoveBackward(KeyWithHash kh) { - vector> tips; - tips.resize(4); - size_t max = 0; - for(char c = 0; c < 4; c++) { - if(index_.CheckIncoming(kh, c)) { - KeyWithHash khc = index_.GetIncoming(kh, c); - size_t len = FindBackward(khc, tips[c]); - if(len > max) - max = len; - } - } - return RemoveTips(tips, max); - } - - //TODO make parallel - size_t RoughClipTips() { - vector iters = index_.kmer_begin(10 * omp_get_max_threads()); - vector result(iters.size()); -# pragma omp parallel for schedule(guided) - for(size_t i = 0; i < iters.size(); i++) { - for(Index::kmer_iterator &it = iters[i]; it.good(); ++it) { - KeyWithHash kh = index_.ConstructKWH(RtSeq(index_.k(), *it)); - if(kh.is_minimal()) { - if (index_.OutgoingEdgeCount(kh) >= 2) { - result[i] += RemoveForward(kh); - } - if (index_.IncomingEdgeCount(kh) >= 2) { - result[i] += RemoveBackward(kh); - } - } - } - } - size_t sum = 0; - for(size_t i = 0; i < result.size(); i++) - sum += result[i]; - return sum; - } - - -public: - AlternativeEarlyTipClipper(Index &index, size_t length_bound) : index_(index), length_bound_(length_bound) { - } - - /* - * Method returns the number of removed edges - */ - size_t ClipTips() { - INFO("Early tip clipping"); - size_t result = RoughClipTips(); - LinkCleaner(index_).CleanLinks(); - INFO(result << " " << (index_.k()+1) <<"-mers were removed by early tip clipper"); - return result; - } -protected: - DECL_LOGGER("Early tip clipping"); -}; - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/extension_index/kmer_extension_index_builder.hpp spades-3.11.1+dfsg/src/common/utils/extension_index/kmer_extension_index_builder.hpp --- spades-3.10.1+dfsg/src/common/utils/extension_index/kmer_extension_index_builder.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/extension_index/kmer_extension_index_builder.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -0,0 +1,109 @@ +//*************************************************************************** +//* Copyright (c) 2016 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#pragma once + +#include "kmer_extension_index.hpp" +#include "utils/kmer_mph/kmer_splitters.hpp" + +namespace utils { + +class DeBruijnExtensionIndexBuilder { +public: + template + size_t FillExtensionsFromStream(ReadStream &stream, Index &index) const { + unsigned k = index.k(); + size_t rl = 0; + + while (!stream.eof()) { + typename ReadStream::read_type r; + stream >> r; + rl = std::max(rl, r.size()); + + const Sequence &seq = r.sequence(); + if (seq.size() < k + 1) + continue; + + typename Index::KeyWithHash kwh = index.ConstructKWH(seq.start(k)); + for (size_t j = k; j < seq.size(); ++j) { + char nnucl = seq[j], pnucl = kwh[0]; + index.AddOutgoing(kwh, nnucl); + kwh <<= nnucl; + index.AddIncoming(kwh, pnucl); + } + } + + return rl; + } + + template + void FillExtensionsFromIndex(const std::string &KPlusOneMersFilename, + Index &index) const { + unsigned KPlusOne = index.k() + 1; + + typename Index::kmer_iterator it(KPlusOneMersFilename, + RtSeq::GetDataSize(KPlusOne)); + for (; it.good(); ++it) { + RtSeq kpomer(KPlusOne, *it); + + char pnucl = kpomer[0], nnucl = kpomer[KPlusOne - 1]; + TRACE("processing k+1-mer " << kpomer); + index.AddOutgoing(index.ConstructKWH(RtSeq(KPlusOne - 1, kpomer)), + nnucl); + // FIXME: This is extremely ugly. Needs to add start / end methods to extract first / last N symbols... + index.AddIncoming(index.ConstructKWH(RtSeq(KPlusOne - 1, kpomer << 0)), + pnucl); + } + } + +public: + template + ReadStatistics BuildExtensionIndexFromStream(Index &index, Streams &streams, io::SingleStream *contigs_stream = 0, + size_t read_buffer_size = 0) const { + unsigned nthreads = (unsigned) streams.size(); + + // First, build a k+1-mer index + DeBruijnReadKMerSplitter> + splitter(index.workdir(), index.k() + 1, 0xDEADBEEF, streams, + contigs_stream, read_buffer_size); + KMerDiskCounter counter(index.workdir(), splitter); + counter.CountAll(nthreads, nthreads, /* merge */false); + + // Now, count unique k-mers from k+1-mers + DeBruijnKMerKMerSplitter > + splitter2(index.workdir(), index.k(), + index.k() + 1, Index::storing_type::IsInvertable(), read_buffer_size); + for (unsigned i = 0; i < nthreads; ++i) + splitter2.AddKMers(counter.GetMergedKMersFname(i)); + KMerDiskCounter counter2(index.workdir(), splitter2); + + BuildIndex(index, counter2, 16, nthreads); + + // Build the kmer extensions + INFO("Building k-mer extensions from k+1-mers"); +# pragma omp parallel for num_threads(nthreads) + for (unsigned i = 0; i < nthreads; ++i) + FillExtensionsFromIndex(counter.GetMergedKMersFname(i), index); + INFO("Building k-mer extensions from k+1-mers finished."); + + return splitter.stats(); + } + +private: + DECL_LOGGER("DeBruijnExtensionIndexBuilder"); +}; + +template +struct ExtensionIndexHelper { + using IndexT = Index; + typedef typename Index::traits_t traits_t; + typedef typename Index::KMer Kmer; + typedef typename Index::KMerIdx KMerIdx; + using DeBruijnExtensionIndexBuilderT = DeBruijnExtensionIndexBuilder; +}; + +} diff -Nru spades-3.10.1+dfsg/src/common/utils/extension_index/kmer_extension_index.hpp spades-3.11.1+dfsg/src/common/utils/extension_index/kmer_extension_index.hpp --- spades-3.10.1+dfsg/src/common/utils/extension_index/kmer_extension_index.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/extension_index/kmer_extension_index.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -0,0 +1,314 @@ 
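Two details are worth keeping in mind while reading the new header below. Each k-mer's value is a single byte: the low nibble holds one outgoing-edge bit per nucleotide and the high nibble one incoming-edge bit. And conjugation (taking the reverse complement) swaps incoming with outgoing while complementing each nucleotide, which on this layout is exactly a bit reversal of the byte; that is what invert_byte precomputes. A sketch of the arithmetic (assuming the usual A=0, C=1, G=2, T=3 encoding; reverse_bits plays the role of invert_byte_slow):

    #include <cassert>
    #include <cstdint>

    static uint8_t reverse_bits(uint8_t a) {   // same job as invert_byte_slow()
        uint8_t r = 0;
        for (int i = 0; i < 8; ++i) {
            r = uint8_t((r << 1) | (a & 1));
            a = uint8_t(a >> 1);
        }
        return r;
    }

    int main() {
        uint8_t mask = 0;
        mask |= uint8_t(1u << 0);        // AddOutgoing('A'): bit n for nucleotide n
        mask |= uint8_t(1u << (4 + 2));  // AddIncoming('G'): bit 4+n
        // Bit p of the conjugate is bit 7-p of the original: the in/out
        // nibbles swap and each nucleotide n maps to its complement 3-n.
        uint8_t conj = reverse_bits(mask);
        assert((conj & 0xF) == (1 << 1));         // outgoing C, complement of incoming G
        assert(((conj >> 4) & 0xF) == (1 << 3));  // incoming T, complement of outgoing A
    }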
+//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "utils/ph_map/perfect_hash_map.hpp" +#include "utils/stl_utils.hpp" +#include "utils/ph_map/storing_traits.hpp" +#include + +namespace utils { + +inline uint8_t invert_byte_slow(uint8_t a) { + size_t res = 0; + for(size_t i = 0; i < 8; i++) { + res <<= 1; + res += a & 1; + a = uint8_t(a >> 1); + } + return uint8_t(res); +} + +inline vector count_invert_byte() { + vector result; + for (size_t a = 0; a < 256; a++) { + result.push_back(invert_byte_slow((uint8_t)a)); + } + return result; +} + +inline uint8_t invert_byte(uint8_t a) { + static vector precalc = count_invert_byte(); + return precalc[a]; +} + +class InOutMask { +private: + uint8_t mask_; + + bool CheckUnique(uint8_t mask) const { + static bool unique[] = + { 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 }; + return unique[mask]; + } + + char GetUnique(uint8_t mask) const { + static char next[] = { -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, + -1, -1, -1 }; + return next[mask]; + } + + size_t Count(uint8_t mask) const { + static char count[] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; + return count[mask]; + } + + char inv_position(char nucl, bool as_is) const { + if (as_is) + return nucl; + else + return char(7 - nucl); + } + + uint8_t outgoing() const { + return mask_ & 0xF; + } + + uint8_t incoming() const { + return (mask_ >> 4) & 0xF; + } + +public: + explicit InOutMask(uint8_t mask = 0) + : mask_(mask) {} + + uint8_t get_mask() const { + return mask_; + } + + template + InOutMask conjugate(const Key & /*k*/) const { + return InOutMask(invert_byte(mask_)); + } + + void AddOutgoing(char nnucl, bool as_is) { + unsigned nmask = (unsigned) (1 << inv_position(nnucl, as_is)); + if (!(mask_ & nmask)) { +# pragma omp atomic + mask_ |= (unsigned char) nmask; + } + } + + void AddIncoming(char pnucl, bool as_is) { + unsigned pmask = (unsigned) (1 << inv_position(char(pnucl + 4), as_is)); + if (!(mask_ & pmask)) { +# pragma omp atomic + mask_ |= (unsigned char) pmask; + } + } + + void DeleteOutgoing(char nnucl, bool as_is) { + unsigned nmask = (1 << inv_position(nnucl, as_is)); + if (mask_ & nmask) { +# pragma omp atomic + mask_ &= (unsigned char) ~nmask; + } + } + + void DeleteIncoming(char pnucl, bool as_is) { + unsigned pmask = (1 << inv_position(char(pnucl + 4), as_is)); + if (mask_ & pmask) { +# pragma omp atomic + mask_ &= (unsigned char) ~pmask; + } + } + + void IsolateVertex() { + mask_ = 0; + } + + bool CheckOutgoing(char nucl) const { + return mask_ & (1 << nucl); + } + + bool CheckIncoming(char nucl) const { + return mask_ & (1 << (4 + nucl)); + } + + bool IsDeadEnd() const { + return outgoing() == 0; + } + + bool IsDeadStart() const { + return incoming() == 0; + } + + bool CheckUniqueOutgoing() const { + return CheckUnique(outgoing()); + } + + bool CheckUniqueIncoming() const { + return CheckUnique(incoming()); + } + + char GetUniqueOutgoing() const { + return GetUnique(outgoing()); + } + + char GetUniqueIncoming() const { + return GetUnique(incoming()); + } + + size_t OutgoingEdgeCount() const { + return Count(outgoing()); + } + + size_t IncomingEdgeCount() const { + return Count(incoming()); + } +}; + +template +Stream &operator<<(Stream& stream, 
const InOutMask &mask) { + return stream << std::bitset<8>(mask.get_mask()); +} + +template +struct slim_kmer_index_traits : public utils::kmer_index_traits { + typedef utils::kmer_index_traits __super; + + typedef MMappedRecordReader FinalKMerStorage; + + template + static void raw_serialize(Writer&, typename __super::RawKMerStorage*) { + VERIFY(false && "Cannot save extension index"); + } + + template + static typename __super::RawKMerStorage *raw_deserialize( + Reader&, const std::string &) { + VERIFY(false && "Cannot load extension index"); + return NULL; + } + +}; + +template +struct AbstractDeEdge { + KeyWithHash start; + KeyWithHash end; + AbstractDeEdge(KeyWithHash s, KeyWithHash e) + : start(std::move(s)), end(std::move(e)) {} + + AbstractDeEdge &operator=(const AbstractDeEdge &that) { + this->start = that.start; + this->end = that.end; + return *this; + } + + bool operator==(const AbstractDeEdge &other) { + return start == other.start && end == other.end; + } + + bool operator!=(const AbstractDeEdge &other) { + return !(*this == other); + } +}; + +template +stream &operator<<(stream &s, const AbstractDeEdge de_edge) { + return s << "DeEdge[" << de_edge.start << ", " << de_edge.end << "]"; +} + +template, class StoringType = DefaultStoring> +class DeBruijnExtensionIndex : public KeyIteratingMap { + typedef KeyIteratingMap base; + +public: + typedef typename base::traits_t traits_t; + typedef StoringType storing_type; + typedef typename base::KeyType KMer; + typedef typename base::IdxType KMerIdx; + typedef typename base::KeyWithHash KeyWithHash; + typedef AbstractDeEdge DeEdge; + using base::ConstructKWH; + + DeBruijnExtensionIndex(unsigned K, const std::string &workdir) + : base((size_t) K, workdir) { + } + + void AddOutgoing(const KeyWithHash &kwh, char nucl) { + TRACE("Add outgoing " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal()); + this->get_raw_value_reference(kwh).AddOutgoing(nucl, kwh.is_minimal()); + } + + void AddIncoming(const KeyWithHash &kwh, char nucl) { + TRACE("Add incoming " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal()); + this->get_raw_value_reference(kwh).AddIncoming(nucl, kwh.is_minimal()); + } + + void DeleteOutgoing(const KeyWithHash &kwh, char nucl) { + TRACE("Delete outgoing " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal()); + this->get_raw_value_reference(kwh).DeleteOutgoing(nucl, kwh.is_minimal()); + } + + void DeleteIncoming(const KeyWithHash &kwh, char nucl) { + TRACE("Delete incoming " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal()); + this->get_raw_value_reference(kwh).DeleteIncoming(nucl, kwh.is_minimal()); + } + + void IsolateVertex(const KeyWithHash &kwh) { + TRACE("Isolate vertex " << kwh); + this->get_raw_value_reference(kwh).IsolateVertex(); + } + + bool CheckOutgoing(const KeyWithHash &kwh, char nucl) const { + return this->get_value(kwh).CheckOutgoing(nucl); + } + + KeyWithHash GetOutgoing(const KeyWithHash &kwh, char nucl) const { + return kwh << nucl; + } + + bool CheckIncoming(const KeyWithHash &kwh, char nucl) const { + return this->get_value(kwh).CheckIncoming(nucl); + } + + KeyWithHash GetIncoming(const KeyWithHash &kwh, char nucl) const { + return kwh >> nucl; + } + + bool IsDeadEnd(const KeyWithHash &kwh) const { + return this->get_value(kwh).IsDeadEnd(); + } + + bool IsDeadStart(const KeyWithHash &kwh) const { + return this->get_value(kwh).IsDeadStart(); + } + + bool CheckUniqueOutgoing(const KeyWithHash &kwh) const { + return this->get_value(kwh).CheckUniqueOutgoing(); + } + + KeyWithHash 
GetUniqueOutgoing(const KeyWithHash &kwh) const { + return GetOutgoing(kwh, this->get_value(kwh).GetUniqueOutgoing()); + } + + bool CheckUniqueIncoming(const KeyWithHash &kwh) const { + return this->get_value(kwh).CheckUniqueIncoming(); + } + + KeyWithHash GetUniqueIncoming(const KeyWithHash &kwh) const { + return GetIncoming(kwh, this->get_value(kwh).GetUniqueIncoming()); + } + + size_t OutgoingEdgeCount(const KeyWithHash &kwh) const { + return this->get_value(kwh).OutgoingEdgeCount(); + } + + size_t IncomingEdgeCount(const KeyWithHash &kwh) const { + return this->get_value(kwh).IncomingEdgeCount(); + } + + ~DeBruijnExtensionIndex() { } + +private: + DECL_LOGGER("ExtentionIndex"); +}; + +} diff -Nru spades-3.10.1+dfsg/src/common/utils/file_limit.hpp spades-3.11.1+dfsg/src/common/utils/file_limit.hpp --- spades-3.10.1+dfsg/src/common/utils/file_limit.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/file_limit.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,33 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#pragma once - -#include -#include -#include - -#include "utils/verify.hpp" - -inline rlim_t limit_file(size_t limit) { - struct rlimit rl; - - int res = getrlimit(RLIMIT_NOFILE, &rl); - VERIFY_MSG(res == 0, - "getrlimit(2) call failed, errno = " << errno); - - // We cannot go beyond hard limit and we might not have enough privileges to - // increase the hard limit - limit = std::max(limit, rl.rlim_cur); - rl.rlim_cur = std::min(limit, rl.rlim_max); - res = setrlimit(RLIMIT_NOFILE, &rl); - VERIFY_MSG(res == 0, - "setrlimit(2) call failed, errno = " << errno); - INFO("Open file limit set to " << rl.rlim_cur); - - return rl.rlim_cur; -} diff -Nru spades-3.10.1+dfsg/src/common/utils/filesystem/copy_file.cpp spades-3.11.1+dfsg/src/common/utils/filesystem/copy_file.cpp --- spades-3.10.1+dfsg/src/common/utils/filesystem/copy_file.cpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/filesystem/copy_file.cpp 2017-09-28 09:05:11.000000000 +0000 @@ -0,0 +1,155 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "copy_file.hpp" + +#include "path_helper.hpp" +#include "utils/logger/logger.hpp" + +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace fs { + +namespace details { + +void copy_file(std::string from_path, std::string to_path) { + using namespace std; + + from_path = make_full_path(from_path); + to_path = make_full_path(to_path); + + if (from_path == to_path) + return; + + std::ifstream source(from_path, ios::binary); + std::ofstream dest(to_path.c_str(), ios::binary); + + dest << source.rdbuf(); +} + +void hard_link(std::string from_path, std::string to_path) { + from_path = make_full_path(from_path); + to_path = make_full_path(to_path); + + if (from_path == to_path) + return; + + if (link(from_path.c_str(), to_path.c_str()) == -1) { + WARN("Failed to create link. 
Reason: " << strerror(errno) << ". Error code: " << errno << ". Copying instead"); + copy_file(from_path, to_path); + } +} + +files_t files_in_folder(std::string const& path) { + DIR *dp; + if ((dp = opendir(path.c_str())) == NULL) + throw std::runtime_error("can not open folder " + path); + + files_t files; + + struct dirent *dirp; + while ((dirp = readdir(dp)) != NULL) + if (dirp->d_type == DT_REG) + files.push_back(append_path(path, dirp->d_name)); + + closedir(dp); + return files; +} + +files_t folders_in_folder(std::string const& path) { + DIR *dp; + if ((dp = opendir(path.c_str())) == NULL) + throw std::runtime_error("can not open folder " + path); + + files_t folders; + + struct dirent *dirp; + while ((dirp = readdir(dp)) != NULL) + if (dirp->d_type == DT_DIR) { + std::string folder = dirp->d_name; + + if (folder != "." && folder != "..") + folders.push_back(append_path(path, folder)); + } + + closedir(dp); + return folders; +} + +} // details + +fs::files_t files_by_prefix(std::string const& path) { + using namespace details; + files_t files; + + std::string folder(parent_path(path)); + std::string prefix = filename(path); + + files_t out_files; + const files_t all_files = files_in_folder(folder); + + for (auto it = all_files.begin(); it != all_files.end(); ++it) // no std::copy_if before C++11 + if (boost::starts_with(filename(*it), prefix)) + out_files.push_back(*it); + + return out_files; +} + +void copy_files_by_prefix(fs::files_t const& files, std::string const& to_folder) { + using namespace details; + + for (auto it = files.begin(); it != files.end(); ++it) { + files_t files_to_copy = files_by_prefix(*it); + + for (auto it = files_to_copy.begin(); it != files_to_copy.end(); ++it) + copy_file(*it, append_path(to_folder, filename(*it))); + } +} + +void link_files_by_prefix(fs::files_t const& files, std::string const& to_folder) { + using namespace details; + + for (auto it = files.begin(); it != files.end(); ++it) { + files_t files_to_copy = files_by_prefix(*it); + + for (auto it = files_to_copy.begin(); it != files_to_copy.end(); ++it) + hard_link(*it, append_path(to_folder, filename(*it))); + } +} + +void copy_files_by_ext(std::string const& from_folder, std::string const& to_folder, std::string const& ext, bool recursive) { + using namespace details; + + files_t files = files_in_folder(from_folder); + + for (auto it = files.begin(); it != files.end(); ++it) + if (boost::ends_with(*it, ext)) + copy_file(*it, append_path(to_folder, filename(*it))); + + if (recursive) { + files_t folders = folders_in_folder(from_folder); + + for (auto it = folders.begin(); it != folders.end(); ++it) { + std::string subdir = append_path(to_folder, filename(*it)); + fs:: make_dir(subdir); + copy_files_by_ext(*it, subdir, ext, recursive); + } + } +} + +} diff -Nru spades-3.10.1+dfsg/src/common/utils/filesystem/copy_file.hpp spades-3.11.1+dfsg/src/common/utils/filesystem/copy_file.hpp --- spades-3.10.1+dfsg/src/common/utils/filesystem/copy_file.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/filesystem/copy_file.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -0,0 +1,18 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#include "path_helper.hpp" +#include + +namespace fs { + +files_t files_by_prefix(std::string const& path); +void copy_files_by_prefix(files_t const& files, std::string const& to_folder); +void link_files_by_prefix(files_t const& files, std::string const& to_folder); +void copy_files_by_ext(std::string const& from_folder, std::string const& to_folder, std::string const& ext, bool recursive); + +} diff -Nru spades-3.10.1+dfsg/src/common/utils/filesystem/file_limit.hpp spades-3.11.1+dfsg/src/common/utils/filesystem/file_limit.hpp --- spades-3.10.1+dfsg/src/common/utils/filesystem/file_limit.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/filesystem/file_limit.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -0,0 +1,37 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include +#include +#include + +#include "utils/verify.hpp" + +namespace utils { + +inline rlim_t limit_file(size_t limit) { + struct rlimit rl; + + int res = getrlimit(RLIMIT_NOFILE, &rl); + VERIFY_MSG(res == 0, + "getrlimit(2) call failed, errno = " << errno); + + // We cannot go beyond hard limit and we might not have enough privileges to + // increase the hard limit + limit = std::max(limit, rl.rlim_cur); + rl.rlim_cur = std::min(limit, rl.rlim_max); + res = setrlimit(RLIMIT_NOFILE, &rl); + VERIFY_MSG(res == 0, + "setrlimit(2) call failed, errno = " << errno); + INFO("Open file limit set to " << rl.rlim_cur); + + return rl.rlim_cur; +} + +} diff -Nru spades-3.10.1+dfsg/src/common/utils/filesystem/path_helper.cpp spades-3.11.1+dfsg/src/common/utils/filesystem/path_helper.cpp --- spades-3.10.1+dfsg/src/common/utils/filesystem/path_helper.cpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/filesystem/path_helper.cpp 2017-09-28 09:05:11.000000000 +0000 @@ -0,0 +1,250 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#include "path_helper.hpp" + +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace fs { + +bool make_dir(std::string const& folder) { + return mkdir(folder.c_str(), 0755) == 0; +} + +std::string make_temp_dir(std::string const& prefix, + std::string const& suffix) { + std::string name = append_path(prefix, suffix + "_XXXXXX"); + char* actual; + if ((actual = ::mkdtemp(strcpy(new char[name.length() + 1], name.c_str()))) + == NULL) + throw std::runtime_error("Cannot create temporary dir " + name); + + std::string result(actual); + if (result == name) + throw std::runtime_error("Cannot create temporary dir " + name); + + delete[] actual; + + return result; +} + +void remove_dir(std::string const& folder) { + DIR *dp; + if ((dp = opendir(folder.c_str())) == NULL) + throw std::runtime_error("can not open folder " + folder); + + struct dirent *dirp; + while ((dirp = readdir(dp)) != NULL) { + std::string full_path = folder + "/" + dirp->d_name; + + if (dirp->d_type == DT_DIR) { + if (std::string(".") != dirp->d_name + && std::string("..") != dirp->d_name) { + remove_dir(full_path); + } + } else + remove(full_path.c_str()); + } + + closedir(dp); + remove(folder.c_str()); +} + +bool is_regular_file(std::string const& path) { + struct stat st; + return (stat(path.c_str(), &st) == 0) && (S_ISREG(st.st_mode)); +} + +std::string append_path(std::string const& prefix, std::string const& suffix) { + std::string delimiter = ""; + + if (!boost::ends_with(prefix, "/") && !boost::starts_with(suffix, "/") + && !prefix.empty()) { + delimiter = "/"; + } + + return prefix + delimiter + suffix; +} + +std::string current_dir() { + char* cwd = getcwd(NULL, 0); + std::string result = cwd; + + free(cwd); + return result; +} + +std::string make_full_path(std::string const& path) { + if (!boost::starts_with(path, "/")) // relative path + return append_path(current_dir(), path); + return path; +} + +std::string filename(std::string const& path) { + size_t pos = path.find_last_of('/'); + return pos != std::string::npos ? path.substr(pos + 1) : path; +} + +std::string basename(std::string const& path) { + size_t slash = path.find_last_of('/'); + size_t after_slash = slash == std::string::npos ? 0 : slash + 1; + + size_t dot = path.find_last_of('.'); + if (dot < after_slash) + dot = std::string::npos; + + return path.substr(after_slash, dot - after_slash); +} + +std::string extension(std::string const& path) { + size_t slash = path.find_last_of('/'); + size_t after_slash = slash == std::string::npos ? 0 : slash + 1; + size_t dot = path.find_last_of('.'); + + if (dot < after_slash || dot == std::string::npos || dot + 1 == path.size()) + return std::string(); + + return path.substr(dot); +} + +std::string parent_path(std::string const& path) { + std::string cpath(path); + + cpath = make_full_path(cpath); + size_t slash_pos = cpath.find_last_of('/'); + + return (slash_pos == 0 ? std::string("/") : cpath.substr(0, slash_pos)); +} + +bool check_existence(std::string const& path) { + struct stat st_buf; + return stat(path.c_str(), &st_buf) == 0 + && (S_ISREG(st_buf.st_mode) || S_ISDIR(st_buf.st_mode)); // exists and (file or dir) +} + +void remove_if_exists(std::string const& path) { + if (check_existence(path)) { + if (is_regular_file(path)) // file + remove(path.c_str()); + else // dir + remove_dir(path); + } +} + +//TODO do we need to screen anything but whitespaces? 
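(The TODO above refers to screen_whitespaces, which follows this sketch.) The pure string helpers defined so far are easiest to pin down by example; the sketch below uses hypothetical paths and assumes the fs declarations from path_helper.hpp are in scope:

    #include <cassert>
    #include <string>
    #include "path_helper.hpp"  // hypothetical include path

    int main() {
        std::string p = "/data/run1/archive.tar.gz";
        assert(fs::filename(p)    == "archive.tar.gz");  // everything after the last '/'
        assert(fs::basename(p)    == "archive.tar");     // strips only the last extension
        assert(fs::extension(p)   == ".gz");             // keeps the dot
        assert(fs::parent_path(p) == "/data/run1");
        assert(fs::append_path("/data", "run1") == "/data/run1");  // inserts a single '/'
    }

All of these are purely lexical: nothing here touches the filesystem, so the results are the same whether or not the paths exist.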
+std::string screen_whitespaces(std::string const &path) { + std::string to_search = " "; + std::string res = ""; + for (size_t i = 0; i < path.size(); i++) { + if ((i == 0) || (path[i] != ' ') || (path[i - 1] == '\\')) { + res += path[i]; + } else { + res +='\\'; + res +=' '; + } + } +// res += "'"; + return res; +} + +//todo reduce code duplication!!! +bool FileExists(std::string const &filename) { + struct stat st_buf; + return stat(filename.c_str(), &st_buf) == 0 && S_ISREG(st_buf.st_mode); +} + +void CheckFileExistenceFATAL(std::string const &filename) { + if (!FileExists(filename)) FATAL_ERROR("File " << filename << " doesn't exist or can't be read!"); +} + +void make_dirs(std::string const &path) { + VERIFY(!path.empty()); + + size_t slash_pos = 0; + while ((slash_pos = path.find_first_of('/', slash_pos + 1)) != std::string::npos) { + make_dir(path.substr(0, slash_pos)); + } + if (path[path.size() - 1] != '/') { + make_dir(path); + } +} + +// doesn't support symlinks +std::string resolve(std::string const& path) { + typedef boost::char_delimiters_separator separator_t; + typedef boost::tokenizer tokenizer_t; + + tokenizer_t tok(path, separator_t(false, "", "/")); + + std::string result = "/"; + for (auto it = tok.begin(); it != tok.end(); ++it) { + if (*it == "..") + result = parent_path(result); + + else if (*it == ".") + ; // Ignore + + else + // Just cat other path entries + result = append_path(result, *it); + } + + return result; +} + +std::string make_relative_path(std::string p, std::string base) { + p = resolve(p); + base = resolve(base); + + std::string pp = parent_path(p); + + typedef boost::char_delimiters_separator separator_t; + typedef boost::tokenizer tokenizer_t; + + tokenizer_t pp_tok(pp, separator_t(false, "", "/")); + tokenizer_t base_tok(base, separator_t(false, "", "/")); + + auto i = pp_tok.begin(); + auto j = base_tok.begin(); + + while (i != pp_tok.end() && j != base_tok.end() && *i == *j) { + ++i; + ++j; + } + + std::string result; + for (; j != base_tok.end(); ++j) + result = append_path("..", result); + + for (; i != pp_tok.end(); ++i) + result = append_path(result, *i); + + return append_path(result, filename(p)); +} + +std::string MakeLaunchTimeDirName() { + time_t rawtime; + struct tm * timeinfo; + char buffer[80]; + + time(&rawtime); + timeinfo = localtime(&rawtime); + + strftime(buffer, 80, "%m.%d_%H.%M.%S", timeinfo); + return std::string(buffer); +} + +} diff -Nru spades-3.10.1+dfsg/src/common/utils/filesystem/path_helper.hpp spades-3.11.1+dfsg/src/common/utils/filesystem/path_helper.hpp --- spades-3.10.1+dfsg/src/common/utils/filesystem/path_helper.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/filesystem/path_helper.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -0,0 +1,90 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include "utils/logger/logger.hpp" +#include "utils/verify.hpp" + +namespace fs { +//todo review and make names consistent! 
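To make the contracts of the declarations below concrete, a worked trace of the two non-obvious algorithms defined above (hypothetical inputs): resolve("/a/./b/../c") yields "/a/c", since "." tokens are skipped and ".." pops one component via parent_path; and make_relative_path("/a/b/c.txt", "/a/d") tokenizes the parent path "/a/b" against the base "/a/d", skips the common prefix "a", emits one ".." for the leftover base component "d", re-descends into "b", and appends the filename, giving "../b/c.txt". Neither function consults the filesystem, which is why, as the comment says, symlinks are not supported.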
+ +typedef std::vector files_t; + +bool make_dir(std::string const &folder); + +std::string make_temp_dir(std::string const &prefix, std::string const &suffix); + +void remove_dir(std::string const &folder); + +bool is_regular_file(std::string const &path); + +std::string append_path(std::string const &prefix, std::string const &suffix); + +std::string current_dir(); + +std::string make_full_path(std::string const &path); + +std::string filename(std::string const &path); + +std::string basename(std::string const &path); + +std::string extension(std::string const &path); + +std::string parent_path(std::string const &path); + +bool check_existence(std::string const &path); + +void remove_if_exists(std::string const &path); + +std::string screen_whitespaces(std::string const &path); + +/** +* Checks if file exists. +* Analogs: http://www.techbytes.ca/techbyte103.html , http://www.gamedev.net/topic/211918-determining-if-a-file-exists-c/ +*/ +bool FileExists(std::string const &filename); + +/** +* Exit(1) if file doesn't exists, writes FATAL log message. +*/ +void CheckFileExistenceFATAL(std::string const &filename); + +void make_dirs(std::string const &path); + +// doesn't support symlinks +std::string resolve(std::string const &path); + +std::string make_relative_path(std::string p, std::string base = current_dir()); + +std::string MakeLaunchTimeDirName(); + +class TmpFolderFixture +{ + std::string tmp_folder_; + +public: + TmpFolderFixture(std::string tmp_folder = "tmp") : + tmp_folder_(tmp_folder) + { + fs::make_dirs(tmp_folder_); + } + + ~TmpFolderFixture() + { + fs::remove_dir(tmp_folder_); + } +}; + +} diff -Nru spades-3.10.1+dfsg/src/common/utils/indices/edge_index_builders.hpp spades-3.11.1+dfsg/src/common/utils/indices/edge_index_builders.hpp --- spades-3.10.1+dfsg/src/common/utils/indices/edge_index_builders.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/indices/edge_index_builders.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,174 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
-//*************************************************************************** - -#pragma once - -#include "edge_info_updater.hpp" -#include "perfect_hash_map_builder.hpp" - -namespace debruijn_graph { - -template -class GraphPositionFillingIndexBuilder { -public: - typedef Index IndexT; - typedef typename Index::KMer Kmer; - - template - void BuildIndexFromGraph(Index &index, - const Graph/*T*/ &g, size_t read_buffer_size = 0) const { - debruijn_graph::BuildIndexFromGraph(index, g, read_buffer_size); - - // Now use the index to fill the coverage and EdgeId's - INFO("Collecting k-mer coverage information from graph, this takes a while."); - EdgeInfoUpdater updater(g, index); - updater.UpdateAll(); - } - -}; - -template struct Void { typedef void type; }; - -template -struct has_contains: std::false_type {}; - -template -struct has_contains< - T - , typename Void< - //decltype( std::declval().contains(typename T::KMerIdx(0), typename T::KMer()) ) - decltype( ((T*)(0))->contains(*((typename T::KeyWithHash*)(0))) ) - >::type ->: std::true_type {}; - -template -class CoverageFillingEdgeIndexBuilder : public Builder { - typedef Builder base; - public: - typedef typename Builder::IndexT IndexT; - typedef typename IndexT::KMer Kmer; - typedef typename IndexT::KMerIdx KmerIdx; - typedef typename IndexT::KeyWithHash KeyWithHash; - - private: - - - bool ContainsWrap(bool check_contains, IndexT& index, const KeyWithHash &kwh, std::true_type) const { - return !check_contains || index.contains(kwh); - } - - bool ContainsWrap(bool /*check_contains*/, IndexT&/* index*/, const KeyWithHash &/*kwh*/, std::false_type) const { - VERIFY(false); -// VERIFY(!check_contains); - return true; - } - - template - size_t FillCoverageFromStream(ReadStream &stream, - IndexT &index, bool check_contains) const { - unsigned k = index.k(); - size_t rl = 0; - - while (!stream.eof()) { - typename ReadStream::ReadT r; - stream >> r; - rl = std::max(rl, r.size()); - - const Sequence &seq = r.sequence(); - if (seq.size() < k) - continue; - - KeyWithHash kwh = index.ConstructKWH(seq.start(k) >> 'A'); - for (size_t j = k - 1; j < seq.size(); ++j) { - kwh <<= seq[j]; - //contains is not used since index might be still empty here - if (kwh.is_minimal() && index.valid(kwh) && ContainsWrap(check_contains, index, kwh, has_contains())) { -# pragma omp atomic - index.get_raw_value_reference(kwh).count += 1; - } - } - } - - return rl; - } - - public: - - template - size_t ParallelFillCoverage(IndexT &index, - Streams &streams, - bool check_contains = true) const { - INFO("Collecting k-mer coverage information from reads, this takes a while."); - unsigned nthreads = (unsigned) streams.size(); - size_t rl = 0; - streams.reset(); -#pragma omp parallel for num_threads(nthreads) shared(rl) - for (size_t i = 0; i < nthreads; ++i) { - size_t crl = FillCoverageFromStream(streams[i], index, check_contains); - - // There is no max reduction in C/C++ OpenMP... Only in FORTRAN :( -#pragma omp flush(rl) - if (crl > rl) -#pragma omp critical - { - rl = std::max(rl, crl); - } - } - - // Contigs have zero coverage! 
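The lament a few lines up about max reductions dates the code: OpenMP 3.1 added min and max reduction operators for C/C++, so on current compilers the flush/critical pattern can be written directly. A sketch (not the project's code):

    #include <algorithm>
    #include <cstddef>

    size_t max_read_length(const size_t *per_stream, size_t n) {
        size_t rl = 0;
    #pragma omp parallel for reduction(max : rl)
        for (size_t i = 0; i < n; ++i)
            rl = std::max(rl, per_stream[i]);
        return rl;
    }

The hand-rolled version stays correct on pre-3.1 toolchains, which is presumably why it survived here.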
-#if 0 - if (contigs_stream) { - contigs_stream->reset(); - FillCoverageFromStream(*contigs_stream, index, check_contains); - } -#endif - -//todo if this verify is neede, put it outside -//#ifndef NDEBUG -// for (auto idx = index.kmer_idx_begin(), eidx = index.kmer_idx_end(); -// idx != eidx; ++idx) { -// -// Kmer k = index.kmer(idx); -// -// VERIFY(index[k].count == index[!k].count); -// } -//#endif - - return rl; - } - - template - size_t BuildIndexFromStream(IndexT &index, - Streams &streams, - io::SingleStream* contigs_stream = 0) const { - debruijn_graph::BuildIndexFromStream(index, streams, contigs_stream); - - return ParallelFillCoverage(index, streams, false); - } - -// template -// size_t BuildIndexWithCoverageFromGraph( -// GraphT &graph, IndexT &index, -// Streams &streams, -// SingleReadStream* contigs_stream = 0) const { -// this->BuildIndexFromGraph(index, graph); -// -// return ParallelFillCoverage(index, streams, contigs_stream, true); -// } -}; - -template -struct EdgeIndexHelper { - typedef typename Index::KMer Kmer; - typedef typename Index::KMerIdx KMerIdx; - typedef typename Index::traits_t traits_t; - typedef CoverageFillingEdgeIndexBuilder CoverageFillingEdgeIndexBuilderT; - typedef GraphPositionFillingIndexBuilder GraphPositionFillingIndexBuilderT; - typedef CoverageFillingEdgeIndexBuilder CoverageAndGraphPositionFillingIndexBuilderT; -}; - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/indices/edge_info_updater.hpp spades-3.11.1+dfsg/src/common/utils/indices/edge_info_updater.hpp --- spades-3.10.1+dfsg/src/common/utils/indices/edge_info_updater.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/indices/edge_info_updater.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,109 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#pragma once - -#include "utils/standard_base.hpp" -#include "utils/openmp_wrapper.h" -#include "sequence/sequence.hpp" -#include "assembly_graph/core/graph_iterators.hpp" -#include "utils/indices/edge_position_index.hpp" - -namespace debruijn_graph { - -template -class EdgeInfoUpdater { - typedef typename Index::KMer Kmer; - typedef typename Graph::EdgeId EdgeId; - typedef typename Index::KeyWithHash KeyWithHash; - - const Graph &g_; - Index &index_; - -// void PutInIndex(const KeyWithHash &kwh, EdgeId id, size_t offset) { -// if (index_.valid(kwh)) { -// auto &entry = index_.get_raw_value_reference(kwh); -// if (!entry.valid() || index_.contains(kwh)) { -// index_.put_value(kwh, EdgeInfo(id, (unsigned)offset, entry.count)); -// } -// } -// } - - //todo why do we need to check equality???!!! 
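A plausible answer to the TODO above, read off the implementation that follows: the index maps each k-mer to a single (edge, offset) record, and distinct edges may share k-mers, so by the time edge e is deleted a k-mer's record can already point at some other edge; clearing unconditionally would corrupt that edge's entry. Comparing edge_id against e makes the deletion a no-op for records the edge no longer owns.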
- bool DeleteIfEqual(const KeyWithHash& kwh, EdgeId e) { - if (!index_.contains(kwh)) - return false; - if (index_.get_value(kwh).edge_id == e) { - index_.get_raw_value_reference(kwh).clear(); - return true; - } - return false; - } - - void UpdateKMers(const Sequence &nucls, EdgeId e) { - VERIFY(nucls.size() >= index_.k()); - KeyWithHash kwh = index_.ConstructKWH(Kmer(index_.k(), nucls)); - if (kwh.is_minimal()) - index_.PutInIndex(kwh, e, 0); - for (size_t i = index_.k(), n = nucls.size(); i < n; ++i) { - kwh <<= nucls[i]; - if (kwh.is_minimal()) - index_.PutInIndex(kwh, e, i - index_.k() + 1); - } - } - - void DeleteKMers(const Sequence &nucls, EdgeId e) { - VERIFY(nucls.size() >= index_.k()); - KeyWithHash kwh = index_.ConstructKWH(Kmer(index_.k(), nucls)); - DeleteIfEqual(kwh, e); - for (size_t i = index_.k(), n = nucls.size(); i < n; ++i) { - kwh <<= nucls[i]; - DeleteIfEqual(kwh, e); - } - } - - public: - /** - * Creates DataHashRenewer for specified graph and index - * @param g graph to be indexed - * @param index index to be synchronized with graph - */ - EdgeInfoUpdater(const Graph& g, Index& index) - : g_(g), - index_(index) { - } - - void UpdateKmers(EdgeId e) { - Sequence nucls = g_.EdgeNucls(e); - UpdateKMers(nucls, e); - } - - void DeleteKmers(EdgeId e) { - Sequence nucls = g_.EdgeNucls(e); - DeleteKMers(nucls, e); - } - - void UpdateAll() { - unsigned nthreads = omp_get_max_threads(); - - omnigraph::IterationHelper edges(g_); - auto iters = edges.Chunks(16 * nthreads); - - #pragma omp parallel for schedule(guided) - for (size_t i = 0; i < iters.size() - 1; ++i) { - TRACE("Processing chunk #" << i); - for (auto it = iters[i]; it != iters[i + 1]; ++it) { - UpdateKmers(*it); - } - } - } - - private: - DECL_LOGGER("EdgeInfoUpdater") -}; - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/indices/edge_multi_index.hpp spades-3.11.1+dfsg/src/common/utils/indices/edge_multi_index.hpp --- spades-3.10.1+dfsg/src/common/utils/indices/edge_multi_index.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/indices/edge_multi_index.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,155 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
-//*************************************************************************** - -#pragma once - -#include "perfect_hash_map.hpp" -#include "edge_info_updater.hpp" -#include "edge_position_index.hpp" - -#include - -namespace debruijn_graph { - -template -class EdgeInfoStorage { -public: - typedef vector> Content; - typedef typename Content::iterator iterator; - typedef typename Content::const_iterator const_iterator; - Content content_; - folly::MicroSpinLock lock_; - - EdgeInfoStorage(const Content &content) : content_(content) { - lock_.init(); - } - - EdgeInfoStorage() { - lock_.init(); - } - - EdgeInfo &operator[](size_t i) { - return content_[i]; - } - - iterator begin() { - return content_.begin(); - } - - iterator end() { - return content_.end(); - } - - const_iterator begin() const { - return content_.cbegin(); - } - - const_iterator end() const { - return content_.cend(); - } - - iterator find(const EdgeInfo &info) { - return content_.find(info); - } - - const_iterator find(const EdgeInfo &info) const { - return content_.find(info); - } - - void push_back(const EdgeInfo &info) { - folly::MSLGuard g(lock_); - content_.push_back(info); - } - - template - void emplace_back(Args&&... args) { - folly::MSLGuard g(lock_); - content_.emplace_back(std::forward(args)...); - } - - size_t size() const{ - return content_.size(); - } - - bool valid() const { - //what's invalid edge info storage? - return true; - } - - EdgeInfoStorage conjugate(size_t k) const { - EdgeInfoStorage result; - for(auto it = content_.rbegin(); it != content_.rend(); ++it) { - result.push_back(it->conjugate(k)); - } - return result; - } -}; - -//todo it is not handling graph events!!! -template, class StoringType = SimpleStoring > -class DeBruijnEdgeMultiIndex : public KeyStoringMap, traits, StoringType > { - typedef KeyStoringMap, traits, StoringType > base; - public: - typedef StoringType storing_type; - typedef typename base::traits_t traits_t; - typedef typename base::KMer KMer; - typedef typename base::KMerIdx KMerIdx; - typedef typename base::KeyWithHash KeyWithHash; - typedef EdgeInfoStorage Value; - - using base::ConstructKWH; -// typedef typename base::IdType IdType; - //todo move this typedef up in hierarchy (need some c++ tricks) - - DeBruijnEdgeMultiIndex(unsigned k, const std::string &workdir) - : base(k, workdir) { - INFO("Constructing multi-kmer index"); - } - - ~DeBruijnEdgeMultiIndex() {} - - - Value get(const KeyWithHash &kwh) const { - VERIFY(contains(kwh)); - return base::get_value(kwh); - } - - bool contains(const KeyWithHash &kwh) const { - if (!base::valid(kwh)) - return false; - return this->get_raw_value_reference(kwh).valid(); - } - - bool valid(const KMer &kmer) const { - KeyWithHash kwh = base::ConstructKWH(kmer); - return base::valid(kwh); - } - - void PutInIndex(const KeyWithHash &kwh, IdType id, size_t offset) { - if (!contains(kwh)) - return; - - EdgeInfoStorage &entry = this->get_raw_value_reference(kwh); - entry.emplace_back(id, (unsigned int)offset); - } - - const EdgeInfoStorage get(const KMer& kmer) const { - auto kwh = base::ConstructKWH(kmer); - auto entry = this->get_value(kwh); - return entry; - } - - //todo delete if equal seems to work improperly!!! 
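Setting that TODO aside, the concurrency scheme of EdgeInfoStorage above deserves a note: push_back and emplace_back take a one-byte folly::MicroSpinLock so that parallel index filling can append to the same k-mer's record list while keeping the per-k-mer footprint small. A dependency-free sketch of the same pattern, with std::atomic_flag standing in for the folly type:

    #include <atomic>
    #include <vector>

    template <class T>
    class LockedVector {
        std::vector<T> content_;
        std::atomic_flag lock_ = ATOMIC_FLAG_INIT;

    public:
        void push_back(const T &value) {
            while (lock_.test_and_set(std::memory_order_acquire)) {}  // spin
            content_.push_back(value);  // critical section: the actual append
            lock_.clear(std::memory_order_release);
        }
        // Reads are only safe once all writers have finished, as in the
        // construction phase above.
        const std::vector<T> &content() const { return content_; }
    };

A lock this small only pays off because the critical section is a few instructions; it also avoids the per-entry size of a full std::mutex, which is presumably the motivation, given that one storage object exists per indexed k-mer.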
- bool DeleteIfEqual(const KeyWithHash &, IdType) { - VERIFY(false); - return false; - } - -}; - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/indices/edge_position_index.hpp spades-3.11.1+dfsg/src/common/utils/indices/edge_position_index.hpp --- spades-3.10.1+dfsg/src/common/utils/indices/edge_position_index.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/indices/edge_position_index.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,216 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#pragma once - -#include "perfect_hash_map.hpp" -#include "io/reads/single_read.hpp" - -namespace debruijn_graph { - -template -struct EdgeInfo { - IdType edge_id; - unsigned offset; - unsigned count; - - EdgeInfo(IdType edge_id_ = IdType(), unsigned offset_ = unsigned(-1), unsigned count_ = 0) : - edge_id(edge_id_), offset(offset_), count(count_) { - VERIFY(edge_id != IdType() || clean()); - } - - template - EdgeInfo conjugate(const KWH &kwh) const { - return conjugate(kwh.key().size()); - } - - EdgeInfo conjugate(size_t k) const { - if(!valid()) { - return EdgeInfo(IdType(0), unsigned(-1), count); - } else { - return EdgeInfo(edge_id->conjugate(), (unsigned)edge_id->length(k) - offset, count); - } - } - - void clear() { - offset = unsigned(-1); - } - - bool clean() const { - return offset == unsigned(-1); - } - - void remove() { - offset = unsigned(-2); - } - - bool removed() const { - return offset == unsigned(-2); - } - - bool valid() const { - return !clean() && !removed(); - } -}; - -template -stream &operator<<(stream &s, const EdgeInfo &info) { - return s << "EdgeInfo[" << info.edge_id.int_id() << ", " << info.offset << ", " << info.count << "]"; -} - -template -class KmerFreeEdgeIndex : public KeyIteratingMap, - kmer_index_traits, StoringType> { - typedef KeyIteratingMap, - kmer_index_traits, StoringType> base; - const Graph &graph_; - -public: - typedef typename base::traits_t traits_t; - typedef StoringType storing_type; - typedef typename base::KMer KMer; - typedef typename base::KMerIdx KMerIdx; - typedef Graph GraphT; - typedef typename Graph::EdgeId IdType; - typedef typename base::KeyWithHash KeyWithHash; - typedef EdgeInfo KmerPos; - using base::valid; - using base::ConstructKWH; - -public: - - KmerFreeEdgeIndex(const Graph &graph, const std::string &workdir) - : base(unsigned(graph.k() + 1), workdir), graph_(graph) {} - - /** - * Shows if kmer has some entry associated with it - */ - bool contains(const KeyWithHash &kwh) const { - // Sanity check - if (!valid(kwh)) - return false; - - KmerPos entry = base::get_value(kwh); - if (!entry.valid()) - return false; - return graph_.EdgeNucls(entry.edge_id).contains(kwh.key(), entry.offset); - } - - void PutInIndex(KeyWithHash &kwh, IdType id, size_t offset) { - if (!valid(kwh)) - return; - - KmerPos &entry = this->get_raw_value_reference(kwh); - if (entry.removed()) { - //VERIFY(false); - return; - } - if (entry.clean()) { - //put verify on this conversion! 
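// The conversion flagged above narrows a size_t offset to unsigned; an
// offset past UINT_MAX would silently wrap, hence the wish for a VERIFY.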
- this->put_value(kwh, KmerPos(id, (unsigned)offset, entry.count)); - } else if (contains(kwh)) { - //VERIFY(false); - entry.remove(); - } else { - //VERIFY(false); - //FIXME bad situation; some other kmer is there; think of putting verify - } - } - - //Only coverage is loaded - template - void BinWrite(Writer &writer) const { - this->index_ptr_->serialize(writer); - size_t sz = this->data_.size(); - writer.write((char*)&sz, sizeof(sz)); - for (size_t i = 0; i < sz; ++i) - writer.write((char*)&(this->data_[i].count), sizeof(this->data_[0].count)); - } - - template - void BinRead(Reader &reader, const std::string/* &FileName*/) { - this->clear(); - this->index_ptr_->deserialize(reader); - size_t sz = 0; - reader.read((char*)&sz, sizeof(sz)); - this->data_.resize(sz); - for (size_t i = 0; i < sz; ++i) - reader.read((char*)&(this->data_[i].count), sizeof(this->data_[0].count)); - } -}; - -template -class KmerStoringEdgeIndex : public KeyStoringMap, - kmer_index_traits, StoringType> { - typedef KeyStoringMap, - kmer_index_traits, StoringType> base; - -public: - typedef typename base::traits_t traits_t; - typedef StoringType storing_type; - typedef typename base::KMer KMer; - typedef typename base::KMerIdx KMerIdx; - typedef Graph GraphT; - typedef typename Graph::EdgeId IdType; - typedef typename base::KeyWithHash KeyWithHash; - typedef EdgeInfo KmerPos; - using base::valid; - using base::ConstructKWH; - - - KmerStoringEdgeIndex(const Graph& g, const std::string &workdir) - : base(unsigned(g.k() + 1), workdir) {} - - ~KmerStoringEdgeIndex() {} - - /** - * Shows if kmer has some entry associated with it - */ - bool contains(const KeyWithHash &kwh) const { - if (!base::valid(kwh)) - return false; - return this->get_raw_value_reference(kwh).valid(); - } - - template - void BinWrite(Writer &writer) const { - this->index_ptr_->serialize(writer); - size_t sz = this->data_.size(); - writer.write((char*)&sz, sizeof(sz)); - for (size_t i = 0; i < sz; ++i) - writer.write((char*)&(this->data_[i].count), sizeof(this->data_[0].count)); - this->BinWriteKmers(writer); - } - - template - void BinRead(Reader &reader, const std::string &FileName) { - this->clear(); - this->index_ptr_->deserialize(reader); - size_t sz = 0; - reader.read((char*)&sz, sizeof(sz)); - this->data_.resize(sz); - for (size_t i = 0; i < sz; ++i) - reader.read((char*)&(this->data_[i].count), sizeof(this->data_[0].count)); - this->BinReadKmers(reader, FileName); - } - - void PutInIndex(KeyWithHash &kwh, IdType id, size_t offset) { - //here valid already checks equality of query-kmer and stored-kmer sequences - if (base::valid(kwh)) { - KmerPos &entry = this->get_raw_value_reference(kwh); - if (entry.removed()) - return; - if (!entry.clean()) { - this->put_value(kwh, KmerPos(id, (unsigned)offset, entry.count)); - } else { - entry.remove(); - } - } - } -}; - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/indices/editable_index.hpp spades-3.11.1+dfsg/src/common/utils/indices/editable_index.hpp --- spades-3.10.1+dfsg/src/common/utils/indices/editable_index.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/indices/editable_index.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,270 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
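
The PutInIndex logic above hinges on the sentinel offsets of EdgeInfo: unsigned(-1) marks a never-filled ("clean") slot, unsigned(-2) a tombstoned ("removed") one, and any other value a live position on an edge. A self-contained model of that three-state encoding (SlotState is a made-up name):

    // Minimal model of the EdgeInfo offset sentinels:
    // unsigned(-1) = clean, unsigned(-2) = removed, anything else = valid.
    #include <cassert>

    struct SlotState {
        unsigned offset = unsigned(-1);

        bool clean()   const { return offset == unsigned(-1); }
        bool removed() const { return offset == unsigned(-2); }
        bool valid()   const { return !clean() && !removed(); }

        void clear()  { offset = unsigned(-1); }
        void remove() { offset = unsigned(-2); }
    };

    int main() {
        SlotState s;
        assert(s.clean() && !s.valid());
        s.offset = 42;          // filled by PutInIndex-style code
        assert(s.valid());
        s.remove();             // ambiguous k-mer: tombstone it
        assert(s.removed() && !s.valid());
    }
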
-//*************************************************************************** - -#pragma once - -#include "perfect_hash_map.hpp.hpp" - -namespace debruijn_graph { - -//template -//class EditableDeBruijnKMerIndex: public DeBruijnKMerIndex { -//public: -// typedef size_t KMerIdx; -//private: -// typedef typename traits::SeqType KMer; -// typedef KMerIndex KMerIndexT; -// typedef ValueType KMerIndexValueType; -// typedef std::vector KMerIndexStorageType; -// typedef boost::bimap KMerPushBackIndexType; -// -// KMerPushBackIndexType push_back_index_; -// KMerIndexStorageType push_back_buffer_; -// -// using DeBruijnKMerIndex::index_; -// using DeBruijnKMerIndex::data_; -// using DeBruijnKMerIndex::kmers; -// using DeBruijnKMerIndex::K_; -// using DeBruijnKMerIndex::InvalidKMerIdx; -//public: -// EditableDeBruijnKMerIndex(unsigned K, const std::string &workdir) : -// DeBruijnKMerIndex(K, workdir) { -// } -// -// KMerIdx seq_idx(const KMer &s) const { -// KMerIdx idx = index_.seq_idx(s); -// -// // First, check whether we're insert index itself. -// if (contains(idx, s, /* check push back */false)) -// return idx; -// -// // Maybe we're inside push_back buffer then? -// auto it = push_back_index_.left.find(s); -// if (it != push_back_index_.left.end()) -// return data_.size() + it->second; -// -// return InvalidKMerIdx; -// } -// -// KMerIndexValueType &operator[](const KMer &s) { -// return operator[](index_.seq_idx(s)); -// } -// -// const KMerIndexValueType &operator[](const KMer &s) const { -// return operator[](index_.seq_idx(s)); -// } -// -// -// const KMerIndexValueType &operator[](KMerIdx idx) const { -// if (idx < this->data_.size()) -// return this->data_[idx]; -// return push_back_buffer_[idx - this->data_.size()]; -// } -// -// KMerIndexValueType &operator[](KMerIdx idx) { -// if (idx < this->data_.size()) -// return this->data_[idx]; -// -// return push_back_buffer_[idx - this->data_.size()]; -// } -// -// size_t size() const { -// return this->data_.size() + push_back_buffer_.size(); -// } -// -// bool contains(const KMer &k) const { -// KMerIdx idx = seq_idx(k); -// -// return idx != InvalidKMerIdx; -// } -// bool contains(KMerIdx idx) const { -// return idx < size(); -// } -// -// size_t insert(const KMer &s, const KMerIndexValueType &value) { -// size_t idx = push_back_buffer_.size(); -// push_back_index_.insert( -// typename KMerPushBackIndexType::value_type(s, idx)); -// push_back_buffer_.push_back(value); -// -// return idx; -// } -// -// KMer kmer(KMerIdx idx) const { -// VERIFY(contains(idx)); -// -// if (idx < this->data_.size()) { -// auto it = kmers->begin() + idx; -// return (typename traits::raw_create()(K_, *it)); -// } -// -// idx -= this->data_.size(); -// return push_back_index_.right.find(idx)->second; -// } -// -// template -// void BinWrite(Writer &writer) const { -// index_.serialize(writer); -// size_t sz = this->data_.size(); -// writer.write((char*) &sz, sizeof(sz)); -// writer.write((char*) &this->data_[0], sz * sizeof(data_[0])); -// sz = push_back_buffer_.size(); -// writer.write((char*) &sz, sizeof(sz)); -// writer.write((char*) &push_back_buffer_[0], -// sz * sizeof(push_back_buffer_[0])); -// for (auto it = push_back_index_.left.begin(), e = -// push_back_index_.left.end(); it != e; ++it) { -// size_t idx = it->second; -// KMer::BinWrite(writer, it->first); -// writer.write((char*) &idx, sizeof(idx)); -// sz -= 0; -// } -// VERIFY(sz == 0); -// traits::raw_serialize(writer, kmers); -// } -// -// template -// void BinRead(Reader &reader, const 
std::string &FileName) { -// clear(); -// index_.deserialize(reader); -// size_t sz = 0; -// reader.read((char*) &sz, sizeof(sz)); -// data_.resize(sz); -// reader.read((char*) &data_[0], sz * sizeof(data_[0])); -// reader.read((char*) &sz, sizeof(sz)); -// push_back_buffer_.resize(sz); -// reader.read((char*) &push_back_buffer_[0], -// sz * sizeof(push_back_buffer_[0])); -// for (size_t i = 0; i < sz; ++i) { -// KMer s(K_); -// size_t idx; -// -// s.BinRead(reader); -// reader.read((char*) &idx, sizeof(idx)); -// -// push_back_index_.insert( -// typename KMerPushBackIndexType::value_type(s, idx)); -// } -// -// kmers = traits::raw_deserialize(reader, FileName); -// } -// -// void clear() { -// index_.clear(); -// this->data_.clear(); -// KMerIndexStorageType().swap(data_); -// push_back_index_.clear(); -// push_back_buffer_.clear(); -// delete kmers; -// kmers = NULL; -// } -// -//protected: -// bool contains(KMerIdx idx, const KMer &k, -// bool check_push_back = true) const { -// // Sanity check -// if (idx == InvalidKMerIdx || idx >= size()) -// return false; -// -// if (idx < data_.size()) { -// auto it = kmers->begin() + idx; -// return (typename traits::raw_equal_to()(k, *it)); -// } -// -// if (check_push_back) { -// auto it = push_back_index_.right.find(idx - data_.size()); -// return (it != push_back_index_.right.end() && it->second == k); -// } -// -// return false; -// } -// -//}; - -//template -//class EditableDeBruijnKMerIndexBuilder { -// public: -// template -// size_t BuildIndexFromStream(EditableDeBruijnKMerIndex &index, -// io::ReadStreamVector > &streams, -// SingleReadStream* contigs_stream = 0) const; -// -// template -// void BuildIndexFromGraph(EditableDeBruijnKMerIndex &index, -// const Graph &g) const; -// -// protected: -// template -// void SortUniqueKMers(KMerCounter &counter, Index &index) const; -// -// protected: -// DECL_LOGGER("K-mer Index Building"); -//}; - -//template <> -//class EditableDeBruijnKMerIndexBuilder> { -// public: -// template -// size_t BuildIndexFromStream(EditableDeBruijnKMerIndex> &index, -// io::ReadStreamVector > &streams, -// SingleReadStream* contigs_stream = 0) const { -// DeBruijnReadKMerSplitter splitter(index.workdir(), -// index.K(), 0, -// streams, contigs_stream); -// KMerDiskCounter counter(index.workdir(), splitter); -// KMerIndexBuilder>::KMerIndexT> builder(index.workdir(), 16, streams.size()); -// size_t sz = builder.BuildIndex(index.index_, counter, /* save final */ true); -// index.data_.resize(sz); -// -// if (!index.kmers) -// index.kmers = counter.GetFinalKMers(); -// -// SortUniqueKMers(counter, index); -// -// return 0; -// } -// -// template -// void BuildIndexFromGraph(EditableDeBruijnKMerIndex &index, -// const Graph &g) const { -// DeBruijnGraphKMerSplitter splitter(index.workdir(), index.K(), g); -// KMerDiskCounter counter(index.workdir(), splitter); -// KMerIndexBuilder>::KMerIndexT> builder(index.workdir(), 16, 1); -// size_t sz = builder.BuildIndex(index.index_, counter, /* save final */ true); -// index.data_.resize(sz); -// -// if (!index.kmers) -// index.kmers = counter.GetFinalKMers(); -// -// SortUniqueKMers(counter, index); -// } -// -// protected: -// template -// void SortUniqueKMers(KMerCounter &counter, Index &index) const { -// size_t swaps = 0; -// INFO("Arranging kmers in hash map order"); -// for (auto I = index.kmers->begin(), E = index.kmers->end(); I != E; ++I) { -// size_t cidx = I - index.kmers->begin(); -// size_t kidx = index.raw_seq_idx(*I); -// while (cidx != kidx) { -// auto J = 
index.kmers->begin() + kidx; -// using std::swap; -// swap(*I, *J); -// swaps += 1; -// -// kidx = index.raw_seq_idx(*I); -// } -// } -// INFO("Done. Total swaps: " << swaps); -// } -// -// protected: -// DECL_LOGGER("K-mer Index Building"); -//}; - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/indices/key_with_hash.hpp spades-3.11.1+dfsg/src/common/utils/indices/key_with_hash.hpp --- spades-3.10.1+dfsg/src/common/utils/indices/key_with_hash.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/indices/key_with_hash.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,229 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#pragma once - -#include "storing_traits.hpp" - -namespace debruijn_graph { - -template -class SimpleKeyWithHash { -public: - typedef Key KeyType; -private: - typedef typename HashFunction::IdxType IdxType; - const HashFunction &hash_; - Key key_; - mutable IdxType idx_; //lazy computation - mutable bool ready_; - - void CountIdx() const { - ready_ = true; - idx_ = hash_.seq_idx(key_); - } - - void SetKey(const Key &key) { - ready_ = false; - key_ = key; - } -public: - - SimpleKeyWithHash(Key key, const HashFunction &hash) : hash_(hash), key_(key), idx_(0), ready_(false) { - } - - Key key() const { - return key_; - } - - IdxType idx() const { - if(!ready_) { - CountIdx(); - } - return idx_; - } - - SimpleKeyWithHash &operator=(const SimpleKeyWithHash &that) { - VERIFY(&this->hash_ == &that.hash_); - this->key_= that.key_; - this->idx_ = that.idx_; - this->ready_ = that.ready_; - return *this; - } - - bool operator==(const SimpleKeyWithHash &that) const { - VERIFY(&this->hash_ == &that.hash_); - if (this->ready_ && that.ready_) - return this->idx_ == that.idx_ && this->is_minimal_ == that.is_minimal_; - return this->key_ == that.key_; - } - - bool operator!=(const SimpleKeyWithHash &that) const { - VERIFY(&this->hash_ == &that.hash_); - return this->key_ != that.key_; - } - - SimpleKeyWithHash operator!() const { - return SimpleKeyWithHash(!key_, hash_); - } - - SimpleKeyWithHash operator<<(char nucl) const { - return SimpleKeyWithHash(key_ << nucl, hash_); - } - - SimpleKeyWithHash operator>>(char nucl) const { - return SimpleKeyWithHash(key_ >> nucl, hash_); - } - - void operator<<=(char nucl) { - SetKey(key_ << nucl); - } - - void operator>>=(char nucl) { - SetKey(key_ >> nucl); - } - - char operator[](size_t i) const { - return key_[i]; - } - - bool is_minimal() const { - return true; - } -}; - -template -stream &operator<<(stream &s, const SimpleKeyWithHash &kwh) { - return s << "SKWH[" << kwh.key() << ", " << kwh.idx() << "]"; -} - -//Would it make sense to also store inverted kmer for not minimal kwh? 
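
SimpleKeyWithHash above computes hash_.seq_idx(key_) lazily: the index is cached on first use of idx() and invalidated whenever the key is shifted. A compact sketch of that caching idea, with a toy StdHasher standing in for the perfect-hash index (all names illustrative):

    // Sketch of the lazy-index caching behind SimpleKeyWithHash;
    // Hasher is any type with size_t seq_idx(const Key&).
    #include <cstddef>
    #include <functional>
    #include <string>
    #include <utility>

    template<class Key, class Hasher>
    class LazyKeyWithHash {
        const Hasher &hash_;
        Key key_;
        mutable std::size_t idx_ = 0;
        mutable bool ready_ = false;   // idx_ computed on first use only
    public:
        LazyKeyWithHash(Key key, const Hasher &hash)
            : hash_(hash), key_(std::move(key)) {}

        const Key &key() const { return key_; }

        std::size_t idx() const {
            if (!ready_) {             // shifting the key resets ready_
                idx_ = hash_.seq_idx(key_);
                ready_ = true;
            }
            return idx_;
        }
    };

    struct StdHasher {                 // toy stand-in for the perfect hash
        std::size_t seq_idx(const std::string &s) const {
            return std::hash<std::string>()(s);
        }
    };

    int main() {
        StdHasher h;
        LazyKeyWithHash<std::string, StdHasher> kwh("ACGT", h);
        return kwh.idx() == kwh.idx() ? 0 : 1;   // second call hits the cache
    }
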
-template -class InvertableKeyWithHash { -private: - typedef typename HashFunction::IdxType IdxType; - - const HashFunction &hash_; - Key key_; - mutable IdxType idx_; //lazy computation - mutable bool is_minimal_; - mutable bool ready_; - - void CountIdx() const { - ready_ = true; - is_minimal_ = key_.IsMinimal(); - if(is_minimal_) - idx_ = hash_.seq_idx(key_); - else{ - idx_ = hash_.seq_idx(!key_); - } - } - - InvertableKeyWithHash(Key key, const HashFunction &hash, bool is_minimal, - size_t idx, bool ready) - : hash_(hash), key_(key), idx_(idx), - is_minimal_(is_minimal), ready_(ready) { - } - public: - - InvertableKeyWithHash(Key key, const HashFunction &hash) - : hash_(hash), key_(key), idx_(0), is_minimal_(false), ready_(false) {} - - const Key &key() const { - return key_; - } - - IdxType idx() const { - if (!ready_) - CountIdx(); - - return idx_; - } - - bool is_minimal() const { - if(!ready_) { - return key_.IsMinimal(); - } - return is_minimal_; - } - - bool ready() const { - return ready_; - } - - InvertableKeyWithHash &operator=(const InvertableKeyWithHash &that) { - VERIFY(&this->hash_ == &that.hash_); - this->key_= that.key_; - this->idx_ = that.idx_; - this->ready_ = that.ready_; - this->is_minimal_ = that.is_minimal_; - return *this; - } - - bool operator==(const InvertableKeyWithHash &that) const { - VERIFY(&this->hash_ == &that.hash_); - return this->key_ == that.key_; - } - - bool operator!=(const InvertableKeyWithHash &that) const { - VERIFY(&this->hash_ == &that.hash_); - return this->key_ != that.key_; - } - - InvertableKeyWithHash operator!() const { - if(!ready_) - return InvertableKeyWithHash(!key_, hash_); - return InvertableKeyWithHash(!key_, hash_, !is_minimal_, idx_, ready_); - } - - InvertableKeyWithHash operator<<(char nucl) const { - return InvertableKeyWithHash(key_ << nucl, hash_); - } - - InvertableKeyWithHash operator>>(char nucl) const { - return InvertableKeyWithHash(key_ >> nucl, hash_); - } - - void operator<<=(char nucl) { - key_ <<= nucl; - ready_ = false; - } - - void operator>>=(char nucl) { - key_ >>= nucl; - ready_ = false; - } - - char operator[](size_t i) const { - return key_[i]; - } -}; - -template -stream &operator<<(stream &s, const InvertableKeyWithHash &kwh) { - s << "IKWH[" << kwh.key(); - if(kwh.ready()) { - return s << ", " << kwh.is_minimal() << ", " << kwh.idx() << "]"; - } else { - return s << ", not ready]"; - } -} - -template -struct StoringTraits; - -template -struct StoringTraits { - typedef SimpleKeyWithHash KeyWithHash; -}; - -template -struct StoringTraits { - typedef InvertableKeyWithHash KeyWithHash; -}; - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/indices/kmer_extension_index_builder.hpp spades-3.11.1+dfsg/src/common/utils/indices/kmer_extension_index_builder.hpp --- spades-3.10.1+dfsg/src/common/utils/indices/kmer_extension_index_builder.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/indices/kmer_extension_index_builder.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,106 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2016 Saint Petersburg State University -//* All Rights Reserved -//* See file LICENSE for details. 
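
InvertableKeyWithHash above hashes only the "minimal" of a k-mer and its reverse complement, so both strands share one index slot. A toy version of that canonical-k-mer convention on strings; note that SPAdes' IsMinimal() predicate need not be plain lexicographic order as assumed here:

    // Toy canonical-k-mer convention: a k-mer and its reverse complement
    // map to one representative. Lexicographic min is an assumption.
    #include <algorithm>
    #include <cassert>
    #include <string>

    static char complement(char c) {
        switch (c) {
            case 'A': return 'T';
            case 'C': return 'G';
            case 'G': return 'C';
            default:  return 'A';  // 'T'
        }
    }

    static std::string reverse_complement(std::string s) {
        std::reverse(s.begin(), s.end());
        std::transform(s.begin(), s.end(), s.begin(), complement);
        return s;
    }

    // The representative that both strands map to.
    static std::string canonical(const std::string &kmer) {
        std::string rc = reverse_complement(kmer);
        return std::min(kmer, rc);
    }

    int main() {
        assert(canonical("AAGT") == canonical(reverse_complement("AAGT")));
    }
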
-//*************************************************************************** - -#pragma once - -#include "kmer_extension_index.hpp" -#include "kmer_splitters.hpp" - -class DeBruijnExtensionIndexBuilder { -public: - template - size_t FillExtensionsFromStream(ReadStream &stream, Index &index) const { - unsigned k = index.k(); - size_t rl = 0; - - while (!stream.eof()) { - typename ReadStream::read_type r; - stream >> r; - rl = std::max(rl, r.size()); - - const Sequence &seq = r.sequence(); - if (seq.size() < k + 1) - continue; - - typename Index::KeyWithHash kwh = index.ConstructKWH(seq.start(k)); - for (size_t j = k; j < seq.size(); ++j) { - char nnucl = seq[j], pnucl = kwh[0]; - index.AddOutgoing(kwh, nnucl); - kwh <<= nnucl; - index.AddIncoming(kwh, pnucl); - } - } - - return rl; - } - - template - void FillExtensionsFromIndex(const std::string &KPlusOneMersFilename, - Index &index) const { - unsigned KPlusOne = index.k() + 1; - - typename Index::kmer_iterator it(KPlusOneMersFilename, - RtSeq::GetDataSize(KPlusOne)); - for (; it.good(); ++it) { - RtSeq kpomer(KPlusOne, *it); - - char pnucl = kpomer[0], nnucl = kpomer[KPlusOne - 1]; - TRACE("processing k+1-mer " << kpomer); - index.AddOutgoing(index.ConstructKWH(RtSeq(KPlusOne - 1, kpomer)), - nnucl); - // FIXME: This is extremely ugly. Needs to add start / end methods to extract first / last N symbols... - index.AddIncoming(index.ConstructKWH(RtSeq(KPlusOne - 1, kpomer << 0)), - pnucl); - } - } - -public: - template - ReadStatistics BuildExtensionIndexFromStream(Index &index, Streams &streams, io::SingleStream* contigs_stream = 0, - size_t read_buffer_size = 0) const { - unsigned nthreads = (unsigned) streams.size(); - - // First, build a k+1-mer index - DeBruijnReadKMerSplitter> - splitter(index.workdir(), index.k() + 1, 0xDEADBEEF, streams, - contigs_stream, read_buffer_size); - KMerDiskCounter counter(index.workdir(), splitter); - counter.CountAll(nthreads, nthreads, /* merge */false); - - // Now, count unique k-mers from k+1-mers - DeBruijnKMerKMerSplitter > - splitter2(index.workdir(), index.k(), - index.k() + 1, Index::storing_type::IsInvertable(), read_buffer_size); - for (unsigned i = 0; i < nthreads; ++i) - splitter2.AddKMers(counter.GetMergedKMersFname(i)); - KMerDiskCounter counter2(index.workdir(), splitter2); - - BuildIndex(index, counter2, 16, nthreads); - - // Build the kmer extensions - INFO("Building k-mer extensions from k+1-mers"); -# pragma omp parallel for num_threads(nthreads) - for (unsigned i = 0; i < nthreads; ++i) - FillExtensionsFromIndex(counter.GetMergedKMersFname(i), index); - INFO("Building k-mer extensions from k+1-mers finished."); - - return splitter.stats(); - } - -private: - DECL_LOGGER("DeBruijnExtensionIndexBuilder"); -}; - -template -struct ExtensionIndexHelper { - using IndexT = Index; - typedef typename Index::traits_t traits_t; - typedef typename Index::KMer Kmer; - typedef typename Index::KMerIdx KMerIdx; - using DeBruijnExtensionIndexBuilderT = DeBruijnExtensionIndexBuilder; -}; - diff -Nru spades-3.10.1+dfsg/src/common/utils/indices/kmer_extension_index.hpp spades-3.11.1+dfsg/src/common/utils/indices/kmer_extension_index.hpp --- spades-3.10.1+dfsg/src/common/utils/indices/kmer_extension_index.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/indices/kmer_extension_index.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,309 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University 
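
FillExtensionsFromStream above slides a (k+1)-window over each read: the window's last nucleotide becomes an outgoing extension of the prefix k-mer, and its first nucleotide an incoming extension of the suffix k-mer. A rough model of that loop, with std::map standing in for the perfect-hash index:

    // Rough model of extension filling: every (k+1)-mer of a read adds one
    // outgoing bit to its prefix k-mer and one incoming bit to its suffix.
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <string>

    struct Ext { std::uint8_t out = 0, in = 0; };  // one bit per nucleotide

    static int code(char c) { return c == 'A' ? 0 : c == 'C' ? 1 : c == 'G' ? 2 : 3; }

    static void fill_extensions(const std::string &read, unsigned k,
                                std::map<std::string, Ext> &index) {
        if (read.size() < k + 1) return;
        for (std::size_t j = k; j < read.size(); ++j) {
            std::string prefix = read.substr(j - k, k);      // k-mer before read[j]
            std::string suffix = read.substr(j - k + 1, k);  // k-mer after read[j-k]
            index[prefix].out |= std::uint8_t(1u << code(read[j]));
            index[suffix].in  |= std::uint8_t(1u << code(read[j - k]));
        }
    }

    int main() {
        std::map<std::string, Ext> index;
        fill_extensions("ACGTACGT", 4, index);
        std::cout << index.size() << " k-mers with recorded extensions\n";
    }
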
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#pragma once - -#include "perfect_hash_map.hpp" -#include "utils/simple_tools.hpp" -#include "storing_traits.hpp" -#include - -namespace debruijn_graph { - -inline uint8_t invert_byte_slow(uint8_t a) { - size_t res = 0; - for(size_t i = 0; i < 8; i++) { - res <<= 1; - res += a & 1; - a = uint8_t(a >> 1); - } - return uint8_t(res); -} - -inline vector count_invert_byte() { - vector result; - for (size_t a = 0; a < 256; a++) { - result.push_back(invert_byte_slow((uint8_t)a)); - } - return result; -} - -inline uint8_t invert_byte(uint8_t a) { - static vector precalc = count_invert_byte(); - return precalc[a]; -} - -class InOutMask { -private: - uint8_t mask_; - - bool CheckUnique(uint8_t mask) const { - static bool unique[] = - { 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 }; - return unique[mask]; - } - - char GetUnique(uint8_t mask) const { - static char next[] = { -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, - -1, -1, -1 }; - VERIFY(next[mask] != -1) - return next[mask]; - } - - size_t Count(uint8_t mask) const { - static char count[] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; - return count[mask]; - } - - - char inv_position(char nucl, bool as_is) const { - if(as_is) - return nucl; - else - return char(7 - nucl); - } - -public: - explicit InOutMask(uint8_t mask = 0) : mask_(mask){ - } - - uint8_t get_mask() const { - return mask_; - } - - template - InOutMask conjugate(const Key & /*k*/) const { - return InOutMask(invert_byte(mask_)); - } - - void AddOutgoing(char nnucl, bool as_is) { - unsigned nmask = (unsigned) (1 << inv_position(nnucl, as_is)); - if (!(mask_ & nmask)) { -# pragma omp atomic - mask_ |= (unsigned char) nmask; - } - } - - void AddIncoming(char pnucl, bool as_is) { - unsigned pmask = (unsigned) (1 << inv_position(char(pnucl + 4), as_is)); - if (!(mask_ & pmask)) { -# pragma omp atomic - mask_|= (unsigned char) pmask; - } - } - - void DeleteOutgoing(char nnucl, bool as_is) { - unsigned nmask = (1 << inv_position(nnucl, as_is)); - if (mask_ & nmask) { -# pragma omp atomic - mask_ &= (unsigned char) ~nmask; - } - } - - void DeleteIncoming(char pnucl, bool as_is) { - unsigned pmask = (1 << inv_position(char(pnucl + 4), as_is)); - if (mask_ & pmask) { -# pragma omp atomic - mask_ &= (unsigned char) ~pmask; - } - } - - void IsolateVertex() { - mask_ = 0; - } - - bool CheckOutgoing(char nucl) const { - return mask_ & (1 << nucl); - } - - bool CheckIncoming(char nucl) const { - return mask_ & (1 << (4 + nucl)); - } - - bool IsDeadEnd() const { - return !(mask_ & 15); - } - - bool IsDeadStart() const { - return !(mask_ >> 4); - } - - bool CheckUniqueOutgoing() const { - return CheckUnique(mask_ & 15); - } - - bool CheckUniqueIncoming() const { - return CheckUnique(uint8_t(mask_ >> 4)); - } - - char GetUniqueOutgoing() const { - return GetUnique(mask_ & 15); - } - - char GetUniqueIncoming() const { - return GetUnique(uint8_t(mask_ >> 4)); - } - - size_t OutgoingEdgeCount() const { - return Count(mask_ & 15); - } - - size_t IncomingEdgeCount() const { - return Count(uint8_t(mask_ >> 4)); - } -}; - -template -Stream &operator<<(Stream& stream, const InOutMask &mask) { - return stream << std::bitset<8>(mask.get_mask()); -} - -template -struct slim_kmer_index_traits : public kmer_index_traits { - typedef kmer_index_traits __super; - - typedef MMappedRecordReader 
FinalKMerStorage; - - template - static void raw_serialize(Writer&, typename __super::RawKMerStorage*) { - VERIFY(false && "Cannot save extension index"); - } - - template - static typename __super::RawKMerStorage *raw_deserialize( - Reader&, const std::string &) { - VERIFY(false && "Cannot load extension index"); - return NULL; - } - -}; - -template -struct AbstractDeEdge { - KeyWithHash start; - KeyWithHash end; - AbstractDeEdge(KeyWithHash _start, KeyWithHash _end) : start(_start), end(_end) { - } - - AbstractDeEdge &operator=(const AbstractDeEdge &that) { - this->start = that.start; - this->end = that.end; - return *this; - } - - bool operator==(const AbstractDeEdge &other) { - return start == other.start && end == other.end; - } - - bool operator!=(const AbstractDeEdge &other) { - return !(*this == other); - } -}; - -template -stream &operator<<(stream &s, const AbstractDeEdge de_edge) { - return s << "DeEdge[" << de_edge.start << ", " << de_edge.end << "]"; -} - -template, class StoringType = DefaultStoring> -class DeBruijnExtensionIndex : public KeyIteratingMap { - typedef KeyIteratingMap base; - -public: - typedef typename base::traits_t traits_t; - typedef StoringType storing_type; - typedef typename base::KeyType KMer; - typedef typename base::IdxType KMerIdx; - typedef typename base::KeyWithHash KeyWithHash; - typedef AbstractDeEdge DeEdge; - using base::ConstructKWH; - - DeBruijnExtensionIndex(unsigned K, const std::string &workdir) - : base((size_t) K, workdir) { - } - - void AddOutgoing(const KeyWithHash &kwh, char nucl) { - TRACE("Add outgoing " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal()); - this->get_raw_value_reference(kwh).AddOutgoing(nucl, kwh.is_minimal()); - } - - void AddIncoming(const KeyWithHash &kwh, char nucl) { - TRACE("Add incoming " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal()); - this->get_raw_value_reference(kwh).AddIncoming(nucl, kwh.is_minimal()); - } - - void DeleteOutgoing(const KeyWithHash &kwh, char nucl) { - TRACE("Delete outgoing " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal()); - this->get_raw_value_reference(kwh).DeleteOutgoing(nucl, kwh.is_minimal()); - } - - void DeleteIncoming(const KeyWithHash &kwh, char nucl) { - TRACE("Delete incoming " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal()); - this->get_raw_value_reference(kwh).DeleteIncoming(nucl, kwh.is_minimal()); - } - - void IsolateVertex(const KeyWithHash &kwh) { - TRACE("Isolate vertex " << kwh); - this->get_raw_value_reference(kwh).IsolateVertex(); - } - - bool CheckOutgoing(const KeyWithHash &kwh, char nucl) const { - return this->get_value(kwh).CheckOutgoing(nucl); - } - - KeyWithHash GetOutgoing(const KeyWithHash &kwh, char nucl) const { - return kwh << nucl; - } - - bool CheckIncoming(const KeyWithHash &kwh, char nucl) const { - return this->get_value(kwh).CheckIncoming(nucl); - } - - KeyWithHash GetIncoming(const KeyWithHash &kwh, char nucl) const { - return kwh >> nucl; - } - - bool IsDeadEnd(const KeyWithHash &kwh) const { - return this->get_value(kwh).IsDeadEnd(); - } - - bool IsDeadStart(const KeyWithHash &kwh) const { - return this->get_value(kwh).IsDeadStart(); - } - - bool CheckUniqueOutgoing(const KeyWithHash &kwh) const { - return this->get_value(kwh).CheckUniqueOutgoing(); - } - - KeyWithHash GetUniqueOutgoing(const KeyWithHash &kwh) const { - return GetOutgoing(kwh, this->get_value(kwh).GetUniqueOutgoing()); - } - - bool CheckUniqueIncoming(const KeyWithHash &kwh) const { - return this->get_value(kwh).CheckUniqueIncoming(); - } - 
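
The DeBruijnExtensionIndex stores these per-k-mer extensions via InOutMask: one byte per k-mer, low nibble for outgoing A/C/G/T bits, high nibble for incoming, with the conjugate k-mer's mask obtained by reversing the byte (which is what invert_byte above precomputes). A compact sketch of the layout:

    // Sketch of the InOutMask byte layout; names are illustrative.
    #include <bitset>
    #include <cstdint>
    #include <iostream>

    struct Mask8 {
        std::uint8_t m = 0;

        void add_outgoing(int nucl)  { m |= std::uint8_t(1u << nucl); }        // bits 0..3
        void add_incoming(int nucl)  { m |= std::uint8_t(1u << (4 + nucl)); }  // bits 4..7
        bool dead_end()   const { return (m & 0x0F) == 0; }
        bool dead_start() const { return (m >> 4) == 0; }

        static std::uint8_t reverse_bits(std::uint8_t a) {  // invert_byte_slow, same idea
            std::uint8_t r = 0;
            for (int i = 0; i < 8; ++i) { r = std::uint8_t(r << 1 | (a & 1)); a >>= 1; }
            return r;
        }
        // Bit reversal swaps outgoing bit n with incoming bit of the
        // complement nucleotide, exactly the conjugate's adjacency.
        Mask8 conjugate() const { return Mask8{reverse_bits(m)}; }
    };

    int main() {
        Mask8 v;
        v.add_outgoing(2);  // G out
        v.add_incoming(0);  // A in
        std::cout << std::bitset<8>(v.m) << " -> "
                  << std::bitset<8>(v.conjugate().m) << "\n";
    }
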
- KeyWithHash GetUniqueIncoming(const KeyWithHash &kwh) const { - return GetIncoming(kwh, this->get_value(kwh).GetUniqueIncoming()); - } - - size_t OutgoingEdgeCount(const KeyWithHash &kwh) const { - return this->get_value(kwh).OutgoingEdgeCount(); - } - - size_t IncomingEdgeCount(const KeyWithHash &kwh) const { - return this->get_value(kwh).IncomingEdgeCount(); - } - - ~DeBruijnExtensionIndex() { - } - -private: - DECL_LOGGER("ExtentionIndex"); -}; - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/indices/kmer_splitters.hpp spades-3.11.1+dfsg/src/common/utils/indices/kmer_splitters.hpp --- spades-3.10.1+dfsg/src/common/utils/indices/kmer_splitters.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/indices/kmer_splitters.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,317 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#pragma once - -#include "io/reads/io_helper.hpp" -#include "storing_traits.hpp" - -#include "utils/file_limit.hpp" -#include "utils/mph_index/kmer_index_builder.hpp" - -namespace debruijn_graph { - -template -struct StoringTypeFilter { -}; - -template<> -struct StoringTypeFilter { - template - bool filter(const Kmer &/*kmer*/) const { - return true; - } -}; - -template<> -struct StoringTypeFilter { - template - bool filter(const Kmer &kmer) const { - return kmer.IsMinimal(); - } -}; - -using RtSeqKMerSplitter = ::KMerSortingSplitter; - -template -class DeBruijnKMerSplitter : public RtSeqKMerSplitter { - private: - KmerFilter kmer_filter_; - protected: - size_t read_buffer_size_; - protected: - bool FillBufferFromSequence(const Sequence &seq, - unsigned thread_id) { - if (seq.size() < this->K_) - return false; - - RtSeq kmer = seq.start(this->K_) >> 'A'; - bool stop = false; - for (size_t j = this->K_ - 1; j < seq.size(); ++j) { - kmer <<= seq[j]; - if (!kmer_filter_.filter(kmer)) - continue; - - stop |= this->push_back_internal(kmer, thread_id); - } - - return stop; - } - - public: - DeBruijnKMerSplitter(const std::string &work_dir, - unsigned K, KmerFilter kmer_filter, size_t read_buffer_size = 0, uint32_t seed = 0) - : RtSeqKMerSplitter(work_dir, K, seed), kmer_filter_(kmer_filter), read_buffer_size_(read_buffer_size) { - } - protected: - DECL_LOGGER("DeBruijnKMerSplitter"); -}; - -struct ReadStatistics { - size_t reads_; - size_t max_read_length_; - size_t bases_; -}; - -template -class DeBruijnReadKMerSplitter : public DeBruijnKMerSplitter { - io::ReadStreamList &streams_; - io::SingleStream *contigs_; - - template - ReadStatistics - FillBufferFromStream(ReadStream& stream, unsigned thread_id); - - ReadStatistics rs_; - - public: - DeBruijnReadKMerSplitter(const std::string &work_dir, - unsigned K, uint32_t seed, - io::ReadStreamList& streams, - io::SingleStream* contigs_stream = 0, - size_t read_buffer_size = 0) - : DeBruijnKMerSplitter(work_dir, K, KmerFilter(), read_buffer_size, seed), - streams_(streams), contigs_(contigs_stream), rs_({0 ,0 ,0}) {} - - path::files_t Split(size_t num_files) override; - - size_t read_length() const { return rs_.max_read_length_; } - ReadStatistics stats() const { return rs_; } -}; - -template template -ReadStatistics -DeBruijnReadKMerSplitter::FillBufferFromStream(ReadStream &stream, - unsigned thread_id) { 
- typename ReadStream::ReadT r; - size_t reads = 0, rl = 0, bases = 0; - - while (!stream.eof()) { - stream >> r; - rl = std::max(rl, r.size()); - reads += 1; - bases += r.size(); - - if (this->FillBufferFromSequence(r.sequence(), thread_id)) - break; - } - return { reads, rl, bases }; -} - -template -path::files_t DeBruijnReadKMerSplitter::Split(size_t num_files) { - unsigned nthreads = (unsigned) streams_.size(); - - INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while."); - path::files_t out = this->PrepareBuffers(num_files, nthreads, this->read_buffer_size_); - - size_t counter = 0, rl = 0, bases = 0, n = 15; - streams_.reset(); - while (!streams_.eof()) { -# pragma omp parallel for num_threads(nthreads) reduction(+ : counter) reduction(+ : bases) shared(rl) - for (unsigned i = 0; i < nthreads; ++i) { - ReadStatistics stats = FillBufferFromStream(streams_[i], i); - counter += stats.reads_; - bases += stats.bases_; - - // There is no max reduction in C/C++ OpenMP... Only in FORTRAN :( -# pragma omp flush(rl) - if (stats.max_read_length_ > rl) -# pragma omp critical - { - rl = std::max(rl, stats.max_read_length_); - } - } - - this->DumpBuffers(out); - - if (counter >> n) { - INFO("Processed " << counter << " reads"); - n += 1; - } - } - - if (contigs_) { - INFO("Adding contigs from previous K"); - unsigned cnt = 0; - contigs_->reset(); - while (!contigs_->eof()) { - FillBufferFromStream(*contigs_, cnt); - this->DumpBuffers(out); - if (++cnt >= nthreads) - cnt = 0; - } - } - - this->ClearBuffers(); - - INFO("Used " << counter << " reads. Maximum read length " << rl); - INFO("Average read length " << double(bases) / double(counter)); - rs_ = { counter, rl, bases }; - - return out; -} - -template -class DeBruijnGraphKMerSplitter : public DeBruijnKMerSplitter { - typedef typename Graph::ConstEdgeIt EdgeIt; - typedef typename Graph::EdgeId EdgeId; - - const Graph &g_; - - size_t FillBufferFromEdges(EdgeIt &edge, unsigned thread_id); - - public: - DeBruijnGraphKMerSplitter(const std::string &work_dir, - unsigned K, const Graph &g, size_t read_buffer_size = 0) - : DeBruijnKMerSplitter(work_dir, K, KmerFilter(), read_buffer_size), g_(g) {} - - path::files_t Split(size_t num_files) override; -}; - -template -size_t -DeBruijnGraphKMerSplitter::FillBufferFromEdges(EdgeIt &edge, - unsigned thread_id) { - size_t seqs = 0; - for (; !edge.IsEnd(); ++edge) { - const Sequence &nucls = g_.EdgeNucls(*edge); - - seqs += 1; - if (this->FillBufferFromSequence(nucls, thread_id)) - break; - } - - return seqs; -} - -template -path::files_t DeBruijnGraphKMerSplitter::Split(size_t num_files) { - INFO("Splitting kmer instances into " << num_files << " buckets. 
This might take a while."); - - path::files_t out = this->PrepareBuffers(num_files, 1, this->read_buffer_size_); - - size_t counter = 0, n = 10; - for (auto it = g_.ConstEdgeBegin(); !it.IsEnd(); ) { - counter += FillBufferFromEdges(it, 0); - - this->DumpBuffers(out); - - if (counter >> n) { - INFO("Processed " << counter << " edges"); - n += 1; - } - } - - INFO("Used " << counter << " sequences."); - - this->ClearBuffers(); - - return out; -} - - -template -class DeBruijnKMerKMerSplitter : public DeBruijnKMerSplitter { - typedef MMappedFileRecordArrayIterator kmer_iterator; - - unsigned K_source_; - std::vector kmers_; - bool add_rc_; - - size_t FillBufferFromKMers(kmer_iterator &kmer, - unsigned thread_id); - - public: - DeBruijnKMerKMerSplitter(const std::string &work_dir, - unsigned K_target, unsigned K_source, bool add_rc, size_t read_buffer_size = 0) - : DeBruijnKMerSplitter(work_dir, K_target, KmerFilter(), read_buffer_size), - K_source_(K_source), add_rc_(add_rc) {} - - void AddKMers(const std::string &file) { - kmers_.push_back(file); - } - - path::files_t Split(size_t num_files) override; -}; - -template -inline size_t DeBruijnKMerKMerSplitter::FillBufferFromKMers(kmer_iterator &kmer, - unsigned thread_id) { - size_t seqs = 0; - for (; kmer.good(); ++kmer) { - Sequence nucls(RtSeq(K_source_, *kmer)); - seqs += 1; - - bool stop = this->FillBufferFromSequence(nucls, thread_id); - if (add_rc_) - stop |= this->FillBufferFromSequence(!nucls, thread_id); - - if (stop) - break; - } - - return seqs; -} - -template -path::files_t DeBruijnKMerKMerSplitter::Split(size_t num_files) { - unsigned nthreads = (unsigned) kmers_.size(); - - INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while."); - - path::files_t out = this->PrepareBuffers(num_files, nthreads, this->read_buffer_size_); - - size_t counter = 0, n = 10; - std::vector its; - its.reserve(nthreads); - for (auto it = kmers_.begin(), et = kmers_.end(); it != et; ++it) - its.emplace_back(*it, RtSeq::GetDataSize(K_source_)); - - while (std::any_of(its.begin(), its.end(), - [](const kmer_iterator &it) { return it.good(); })) { -# pragma omp parallel for num_threads(nthreads) reduction(+ : counter) - for (unsigned i = 0; i < nthreads; ++i) - counter += FillBufferFromKMers(its[i], i); - - this->DumpBuffers(out); - - if (counter >> n) { - INFO("Processed " << counter << " kmers"); - n += 1; - } - } - - INFO("Used " << counter << " kmers."); - - this->ClearBuffers(); - - return out; -} - - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/indices/perfect_hash_map_builder.hpp spades-3.11.1+dfsg/src/common/utils/indices/perfect_hash_map_builder.hpp --- spades-3.10.1+dfsg/src/common/utils/indices/perfect_hash_map_builder.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/indices/perfect_hash_map_builder.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,102 +0,0 @@ -#pragma once -//*************************************************************************** -//* Copyright (c) 2016 Saint Petersburg State University -//* All Rights Reserved -//* See file LICENSE for details. 
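
All the splitters above share one strategy: stream k-mers to disk into hash-determined buckets so that each bucket can later be sorted and deduplicated independently and within a bounded memory budget. A schematic, much simplified version (text files and std::hash instead of the mmap-ed binary buffers; the file naming is hypothetical):

    // Schematic bucket split: each k-mer goes to the file its hash selects.
    #include <cstddef>
    #include <fstream>
    #include <functional>
    #include <string>
    #include <vector>

    static void split_kmers(const std::vector<std::string> &kmers,
                            std::size_t num_files, const std::string &prefix) {
        std::vector<std::ofstream> buckets;
        for (std::size_t i = 0; i < num_files; ++i)
            buckets.emplace_back(prefix + "." + std::to_string(i));

        for (const std::string &kmer : kmers) {
            std::size_t b = std::hash<std::string>()(kmer) % num_files;  // bucket choice
            buckets[b] << kmer << '\n';
        }
    }

    int main() {
        split_kmers({"ACGT", "CGTA", "GTAC"}, 4, "kmers_bucket");
    }
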
-//*************************************************************************** - -#include "utils/mph_index/kmer_index_builder.hpp" - -#include "perfect_hash_map.hpp" -#include "kmer_splitters.hpp" - -namespace debruijn_graph { - -struct PerfectHashMapBuilder { - template - void BuildIndex(PerfectHashMap &index, - Counter& counter, size_t bucket_num, - size_t thread_num, bool save_final = true) const { - using KMerIndex = typename PerfectHashMap::KMerIndexT; - - KMerIndexBuilder builder(index.workdir(), - (unsigned) bucket_num, - (unsigned) thread_num); - size_t sz = builder.BuildIndex(*index.index_ptr_, counter, save_final); - index.resize(sz); - } -}; - -struct KeyStoringIndexBuilder { - template - void BuildIndex(KeyStoringMap &index, - Counter& counter, size_t bucket_num, - size_t thread_num, bool save_final = true) const { - phm_builder_.BuildIndex(index, counter, bucket_num, thread_num, save_final); - VERIFY(!index.kmers_.get()); - index.kmers_ = counter.GetFinalKMers(); - VERIFY(index.kmers_.get()); - index.SortUniqueKMers(); - } - - private: - PerfectHashMapBuilder phm_builder_; -}; - -struct KeyIteratingIndexBuilder { - template - void BuildIndex(KeyIteratingMap &index, - Counter& counter, size_t bucket_num, - size_t thread_num, bool save_final = true) const { - phm_builder_.BuildIndex(index, counter, bucket_num, thread_num, save_final); - index.KMersFilename_ = counter.GetFinalKMersFname(); - } - - private: - PerfectHashMapBuilder phm_builder_; -}; - -template -void BuildIndex(KeyIteratingMap &index, - Counter& counter, size_t bucket_num, - size_t thread_num, bool save_final = true) { - KeyIteratingIndexBuilder().BuildIndex(index, counter, bucket_num, thread_num, save_final); -} - - -template -void BuildIndex(KeyStoringMap &index, - Counter& counter, size_t bucket_num, - size_t thread_num, bool save_final = true) { - KeyStoringIndexBuilder().BuildIndex(index, counter, bucket_num, thread_num, save_final); -} - -template -void BuildIndex(PerfectHashMap &index, - Counter& counter, size_t bucket_num, - size_t thread_num, bool save_final = true) { - PerfectHashMapBuilder().BuildIndex(index, counter, bucket_num, thread_num, save_final); -} - -template -size_t BuildIndexFromStream(Index &index, - Streams &streams, - io::SingleStream* contigs_stream = 0) { - DeBruijnReadKMerSplitter> - splitter(index.workdir(), index.k(), 0, streams, contigs_stream); - KMerDiskCounter counter(index.workdir(), splitter); - BuildIndex(index, counter, 16, streams.size()); - return 0; -} - -template -void BuildIndexFromGraph(Index &index, const Graph &g, size_t read_buffer_size = 0) { - DeBruijnGraphKMerSplitter> - splitter(index.workdir(), index.k(), g, read_buffer_size); - KMerDiskCounter counter(index.workdir(), splitter); - BuildIndex(index, counter, 16, 1); -} - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/indices/perfect_hash_map.hpp spades-3.11.1+dfsg/src/common/utils/indices/perfect_hash_map.hpp --- spades-3.10.1+dfsg/src/common/utils/indices/perfect_hash_map.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/indices/perfect_hash_map.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,339 +0,0 @@ -#pragma once -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
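
The free BuildIndex overloads above let generic code call BuildIndex(index, ...) and have the map's static type select the right builder (plain perfect-hash, key-storing, or key-iterating). The dispatch pattern in miniature, with stub types in place of the real maps:

    // Overload dispatch on the static map type, purely illustrative.
    #include <iostream>

    struct PerfectHashMapT {};
    struct KeyStoringMapT : PerfectHashMapT {};

    void BuildIndex(PerfectHashMapT &) { std::cout << "plain PHM build\n"; }
    void BuildIndex(KeyStoringMapT &)  { std::cout << "PHM build + keep sorted keys\n"; }

    int main() {
        KeyStoringMapT m;
        BuildIndex(m);   // the most specific overload wins at compile time
    }
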
-//*************************************************************************** - -#include "utils/openmp_wrapper.h" -#include "utils/path_helper.hpp" -#include "io/kmers/kmer_iterator.hpp" - -#include "utils/mph_index/kmer_index.hpp" - -#include "key_with_hash.hpp" -#include "values.hpp" -#include "storing_traits.hpp" - -#include -#include -#include - -namespace debruijn_graph { - -template -class IndexWrapper { - static const size_t InvalidIdx = size_t(-1); -public: - typedef size_t IdxType; - typedef K KeyType; - typedef traits traits_t; -protected: - typedef KMerIndex KMerIndexT; - //these fields are protected only for reduction of storage in edge indices BinWrite - std::shared_ptr index_ptr_; -private: - std::string workdir_; - unsigned k_; - -protected: - size_t raw_seq_idx(const typename KMerIndexT::KMerRawReference s) const { - return index_ptr_->raw_seq_idx(s); - } - - bool valid(const size_t idx) const { - return idx != InvalidIdx && idx < index_ptr_->size(); - } -public: - IndexWrapper(size_t k, const std::string &workdir) - : index_ptr_(std::make_shared()) - , k_((unsigned) k) { - //fixme string literal - workdir_ = path::make_temp_dir(workdir, "kmeridx"); - } - - IndexWrapper(size_t k, const std::string &workdir, std::shared_ptr index_ptr) - : IndexWrapper(k, workdir) { - index_ptr_ = index_ptr; - } - - ~IndexWrapper() { - path::remove_dir(workdir_); - } - - void clear() { - index_ptr_->clear(); - } - - unsigned k() const { return k_; } - -public: - template - void BinWrite(Writer &writer) const { - index_ptr_->serialize(writer); - } - - template - void BinRead(Reader &reader, const std::string &) { - clear(); - index_ptr_->deserialize(reader); - } - - const std::string &workdir() const { - return workdir_; - } -}; - -template, class StoringType = SimpleStoring> -class PerfectHashMap : public ValueArray, public IndexWrapper { -public: - typedef size_t IdxType; - typedef K KeyType; - typedef ValueArray ValueBase; - typedef IndexWrapper KeyBase; - using KeyBase::index_ptr_; - typedef typename KeyBase::KMerIndexT KMerIndexT; - typedef typename StoringTraits::KeyWithHash KeyWithHash; - - KeyWithHash ConstructKWH(const KeyType &key) const { - return KeyWithHash(key, *index_ptr_); - } - - bool valid(const KeyWithHash &kwh) const { - return KeyBase::valid(kwh.idx()); - } - - PerfectHashMap(size_t k, const std::string &workdir) : KeyBase(k, workdir) { - } - - PerfectHashMap(size_t k, const std::string &workdir, std::shared_ptr index_ptr) - : KeyBase(k, workdir, index_ptr) { - ValueBase::resize(index_ptr_->size()); - } - - ~PerfectHashMap() { - } - - void clear() { - KeyBase::clear(); - ValueBase::clear(); - } - - const V get_value(const KeyWithHash &kwh) const { - return StoringType::get_value(*this, kwh); - } - - template - const V get_value(const KeyWithHash &kwh, const F& inverter) const { - return StoringType::get_value(*this, kwh, inverter); - } - - //Think twice or ask AntonB if you want to use it! 
- V &get_raw_value_reference(const KeyWithHash &kwh) { - return ValueBase::operator[](kwh.idx()); - } - - const V &get_raw_value_reference(const KeyWithHash &kwh) const { - return ValueBase::operator[](kwh.idx()); - } - - void put_value(const KeyWithHash &kwh, const V &value) { - StoringType::set_value(*this, kwh, value); - } - - template - void put_value(const KeyWithHash &kwh, const V &value, const F& inverter) { - StoringType::set_value(*this, kwh, value, inverter); - } - - template - void BinWrite(Writer &writer) const { - KeyBase::BinWrite(writer); - ValueBase::BinWrite(writer); - } - - template - void BinRead(Reader &reader, const std::string &tmp) { - KeyBase::BinRead(reader, tmp); - ValueBase::BinRead(reader, tmp); - } - - friend struct PerfectHashMapBuilder; -}; - - -template, class StoringType = SimpleStoring> -class KeyStoringMap : public PerfectHashMap { -private: - typedef PerfectHashMap base; - -public: - typedef traits traits_t; - typedef K KMer; - typedef typename base::IdxType KMerIdx; - typedef typename traits::FinalKMerStorage::iterator kmer_iterator; - typedef typename traits::FinalKMerStorage::const_iterator const_kmer_iterator; - typedef typename base::KeyWithHash KeyWithHash; - using base::ConstructKWH; - -private: - std::unique_ptr kmers_; - - void SortUniqueKMers() const { - size_t swaps = 0; - INFO("Arranging kmers in hash map order"); - for (auto I = kmers_->begin(), E = kmers_->end(); I != E; ++I) { - size_t cidx = I - kmers_->begin(); - size_t kidx = this->raw_seq_idx(*I); - while (cidx != kidx) { - auto J = kmers_->begin() + kidx; - using std::swap; - swap(*I, *J); - swaps += 1; - kidx = this->raw_seq_idx(*I); - } - } - INFO("Done. Total swaps: " << swaps); - } - -protected: - template - void BinWriteKmers(Writer &writer) const { - traits::raw_serialize(writer, this->kmers_); - } - - template - void BinReadKmers(Reader &reader, const std::string &FileName) { - this->kmers_ = traits_t::raw_deserialize(reader, FileName); - } - -public: - template - void BinWrite(Writer &writer) const { - base::BinWrite(writer); - BinWriteKmers(writer); - } - - template - void BinRead(Reader &reader, const std::string &FileName) { - base::BinRead(reader, FileName); - BinReadKmers(reader, FileName); - } - - KeyStoringMap(size_t k, const std::string &workdir) - : base(k, workdir), kmers_(nullptr) {} - - ~KeyStoringMap() {} - - KMer true_kmer(KeyWithHash kwh) const { - VERIFY(this->valid(kwh)); - - auto it = this->kmers_->begin() + kwh.idx(); - return (typename traits_t::raw_create()(this->k(), *it)); - } - - void clear() { - base::clear(); - kmers_ = nullptr; - } - - kmer_iterator kmer_begin() { - return kmers_->begin(); - } - const_kmer_iterator kmer_begin() const { - return kmers_->cbegin(); - } - - kmer_iterator kmer_end() { - return kmers_->end(); - } - const_kmer_iterator kmer_end() const { - return kmers_->cend(); - } - - bool valid(const KeyWithHash &kwh) const { - if (!base::valid(kwh)) - return false; - - auto it = this->kmers_->begin() + kwh.idx(); - if (!kwh.is_minimal()) - return (typename traits_t::raw_equal_to()(!kwh.key(), *it)); - else - return (typename traits_t::raw_equal_to()(kwh.key(), *it)); - } - - /** - * Number of edges going out of the param edge's end - */ - unsigned NextEdgeCount(const KeyWithHash &kwh) const { - unsigned res = 0; - for (char c = 0; c < 4; ++c) - if (valid(kwh << c)) - res += 1; - - return res; - } - - KeyWithHash NextEdge(const KeyWithHash &kwh) const { // returns any next edge - for (char c = 0; c < 4; ++c) { - if (valid(kwh << c)) - 
//hack for this code to work with long seqs! (oterwise return s is totally fine) - return ConstructKWH(true_kmer(kwh));//s; - } - - VERIFY_MSG(false, "Couldn't find requested edge!"); - return ConstructKWH(KMer(this->k())); - // no next edges (we should request one here). - } - - /** - * Number of edges coming into param edge's end - */ - unsigned RivalEdgeCount(const KeyWithHash &kwh) const { - KeyWithHash next = kwh << 'A'; - unsigned res = 0; - for (char c = 0; c < 4; ++c) - if (valid(next >> c)) - res += 1; - - return res; - } - - friend struct KeyStoringIndexBuilder; -}; - -template, class StoringType = SimpleStoring> -class KeyIteratingMap : public PerfectHashMap { - typedef PerfectHashMap base; - - std::string KMersFilename_; - -public: - typedef StoringType storing_type; - typedef typename base::traits_t traits_t; - typedef typename base::KeyType KMer; - typedef typename base::IdxType KMerIdx; - using base::ConstructKWH; - -public: - - KeyIteratingMap(size_t k, const std::string &workdir) - : base(k, workdir), KMersFilename_("") {} - - ~KeyIteratingMap() {} - - typedef MMappedFileRecordArrayIterator kmer_iterator; - - kmer_iterator kmer_begin() const { - return kmer_iterator(this->KMersFilename_, KMer::GetDataSize(base::k())); - } - - std::vector kmer_begin(size_t parts) const { - return io::make_kmer_iterator(this->KMersFilename_, base::k(), parts); - } - - friend struct KeyIteratingIndexBuilder; -}; - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/indices/storing_traits.hpp spades-3.11.1+dfsg/src/common/utils/indices/storing_traits.hpp --- spades-3.10.1+dfsg/src/common/utils/indices/storing_traits.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/indices/storing_traits.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,81 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
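
SortUniqueKMers above rearranges the stored k-mers into perfect-hash order in place, following swap cycles instead of allocating a second array. The same cycle-chasing permutation on integers, with the target position supplied by a precomputed slot table:

    // In-place permutation: element with value v must end at slot_of[v].
    #include <cassert>
    #include <cstddef>
    #include <utility>
    #include <vector>

    static std::size_t arrange_by_slot(std::vector<int> &a,
                                       const std::vector<std::size_t> &slot_of) {
        std::size_t swaps = 0;
        for (std::size_t cidx = 0; cidx < a.size(); ++cidx) {
            std::size_t kidx = slot_of[std::size_t(a[cidx])];
            while (cidx != kidx) {            // chase the cycle until home
                std::swap(a[cidx], a[kidx]);
                ++swaps;
                kidx = slot_of[std::size_t(a[cidx])];
            }
        }
        return swaps;
    }

    int main() {
        std::vector<std::size_t> slot_of = {2, 0, 1};  // value v -> position
        std::vector<int> a = {0, 1, 2};
        arrange_by_slot(a, slot_of);
        assert(a[2] == 0 && a[0] == 1 && a[1] == 2);
    }
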
-//*************************************************************************** - -#pragma once -/* - * key_with_hash.hpp - * - * Created on: Nov 7, 2013 - * Author: anton - */ - -#include "values.hpp" - -namespace debruijn_graph { - - -struct SimpleStoring { - template - static V get_value(const ValueArray &values, const K& key) { - return values[key.idx()]; - } - - template - static void set_value(ValueArray &values, const K& key, const V& value) { - values[key.idx()] = value; - } - - static bool IsInvertable() { - return false; - } -}; - -struct InvertableStoring { - template - struct default_inverter { - template - V operator()(const V& v, const K& k) const { - return v.conjugate(k); - } - }; - - template - struct trivial_inverter { - template - V operator()(const V& v, const K& /*k*/) const { - return v; - } - }; - - template> - static V get_value(const ValueArray &values, const K& key, - const F& inverter = F()) { - if (key.is_minimal()) - return values[key.idx()]; - else - return inverter(values[key.idx()], key); - } - - template> - static void set_value(ValueArray& values, const K& key, const V& value, - const F& inverter = F()) { - VERIFY(key.idx() < values.size()); - if (key.is_minimal()) { - values[key.idx()] = value; - } else { - values[key.idx()] = inverter(value, key); - } - } - - static bool IsInvertable() { - return true; - } -}; - -typedef InvertableStoring DefaultStoring; - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/indices/values.hpp spades-3.11.1+dfsg/src/common/utils/indices/values.hpp --- spades-3.10.1+dfsg/src/common/utils/indices/values.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/indices/values.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,97 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
-//*************************************************************************** - -#pragma once -/* - * key_with_hash.hpp - * - * Created on: Nov 7, 2013 - * Author: anton - */ - -namespace debruijn_graph { - -template -class ValueArray { - static const size_t InvalidIdx = SIZE_MAX; -public: - typedef size_t IdxType; - typedef V ValueType; - -protected: - typedef std::vector StorageT; - StorageT data_; - - void resize(size_t size) { - data_.resize(size); - } - -public: - typedef typename StorageT::iterator value_iterator; - typedef typename StorageT::const_iterator const_value_iterator; - - ValueArray() { - } - - ~ValueArray() { - } - - void clear() { - data_.clear(); - StorageT().swap(data_); - } - - const V &operator[](size_t idx) const { - return data_[idx]; - } - - V &operator[](size_t idx) { - return data_[idx]; - } - -public: - size_t size() const { - return data_.size(); - } - - value_iterator value_begin() { - return data_.begin(); - } - const_value_iterator value_begin() const { - return data_.begin(); - } - const_value_iterator value_cbegin() const { - return data_.cbegin(); - } - value_iterator value_end() { - return data_.end(); - } - const_value_iterator value_end() const { - return data_.end(); - } - const_value_iterator value_cend() const { - return data_.cend(); - } - - template - void BinWrite(Writer &writer) const { - size_t sz = data_.size(); - writer.write((char*) &sz, sizeof(sz)); - writer.write((char*) &data_[0], sz * sizeof(data_[0])); - } - - template - void BinRead(Reader &reader, const std::string &) { - clear(); - size_t sz = 0; - reader.read((char*) &sz, sizeof(sz)); - data_.resize(sz); - reader.read((char*) &data_[0], sz * sizeof(data_[0])); - } -}; - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/kmer_mph/CMakeLists.txt spades-3.11.1+dfsg/src/common/utils/kmer_mph/CMakeLists.txt --- spades-3.10.1+dfsg/src/common/utils/kmer_mph/CMakeLists.txt 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/kmer_mph/CMakeLists.txt 2017-09-28 09:05:11.000000000 +0000 @@ -0,0 +1,13 @@ +############################################################################ +# Copyright (c) 2015 Saint Petersburg State University +# Copyright (c) 2011-2014 Saint Petersburg Academic University +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +project(mph_index CXX) + +add_library(mph_index STATIC 1.cpp) + +target_link_libraries(mph_index cityhash) + diff -Nru spades-3.10.1+dfsg/src/common/utils/kmer_mph/kmer_index_builder.hpp spades-3.11.1+dfsg/src/common/utils/kmer_mph/kmer_index_builder.hpp --- spades-3.10.1+dfsg/src/common/utils/kmer_mph/kmer_index_builder.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/kmer_mph/kmer_index_builder.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -0,0 +1,324 @@ +#pragma once +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
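
ValueArray's BinWrite/BinRead above are a size-prefixed raw dump of the value vector, which is sound for trivially copyable element types. The same pattern, self-contained:

    // Size-prefixed POD vector (de)serialization, as used by ValueArray.
    #include <cassert>
    #include <cstddef>
    #include <fstream>
    #include <istream>
    #include <ostream>
    #include <vector>

    template<class T>
    void bin_write(std::ostream &os, const std::vector<T> &v) {
        std::size_t sz = v.size();
        os.write(reinterpret_cast<const char*>(&sz), sizeof(sz));
        os.write(reinterpret_cast<const char*>(v.data()), sz * sizeof(T));
    }

    template<class T>
    void bin_read(std::istream &is, std::vector<T> &v) {
        std::size_t sz = 0;
        is.read(reinterpret_cast<char*>(&sz), sizeof(sz));
        v.resize(sz);
        is.read(reinterpret_cast<char*>(v.data()), sz * sizeof(T));
    }

    int main() {
        std::vector<unsigned> counts = {1, 2, 3};
        { std::ofstream f("values.bin", std::ios::binary); bin_write(f, counts); }
        std::vector<unsigned> restored;
        { std::ifstream f("values.bin", std::ios::binary); bin_read(f, restored); }
        assert(restored == counts);
    }
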
+//*************************************************************************** + +#include "kmer_index.hpp" + +#include "io/kmers/mmapped_reader.hpp" +#include "io/kmers/mmapped_writer.hpp" +#include "common/adt/kmer_vector.hpp" + +#include "utils/parallel/openmp_wrapper.h" + +#include "utils/logger/logger.hpp" +#include "utils/filesystem/path_helper.hpp" + +#include "utils/perf/memory_limit.hpp" +#include "utils/filesystem/file_limit.hpp" + +#include "adt/iterator_range.hpp" +#include "adt/loser_tree.hpp" + +#include "boomphf/BooPHF.h" + +#include + +#include +#ifdef USE_GLIBCXX_PARALLEL +#include +#endif + +#include "config.hpp" + +#ifdef SPADES_USE_JEMALLOC +# include +#endif + +#include +#include +#include + +#include "kmer_splitters.hpp" + +namespace utils { + + +template > +class KMerCounter { + public: + typedef typename traits::raw_data_iterator iterator; + typedef typename traits::raw_data_const_iterator const_iterator; + typedef typename traits::RawKMerStorage RawKMerStorage; + typedef typename traits::FinalKMerStorage FinalKMerStorage; + + virtual size_t kmer_size() const = 0; + + virtual size_t Count(unsigned num_buckets, unsigned num_threads) = 0; + virtual size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) = 0; + virtual void MergeBuckets(unsigned num_buckets) = 0; + + virtual std::unique_ptr GetBucket(size_t idx, bool unlink = true) = 0; + virtual std::unique_ptr GetFinalKMers() = 0; + + virtual ~KMerCounter() {} + +protected: + DECL_LOGGER("K-mer Counting"); +}; + +template > +class KMerDiskCounter : public KMerCounter { + typedef KMerCounter __super; + typedef typename traits::RawKMerStorage BucketStorage; +public: + KMerDiskCounter(const std::string &work_dir, KMerSplitter &splitter) + : work_dir_(work_dir), splitter_(splitter) { + std::string prefix = fs::append_path(work_dir, "kmers_XXXXXX"); + char *tempprefix = strcpy(new char[prefix.length() + 1], prefix.c_str()); + VERIFY_MSG(-1 != (fd_ = ::mkstemp(tempprefix)), "Cannot create temporary file"); + kmer_prefix_ = tempprefix; + delete[] tempprefix; + } + + ~KMerDiskCounter() { + ::close(fd_); + ::unlink(kmer_prefix_.c_str()); + } + + size_t kmer_size() const override { + return Seq::GetDataSize(splitter_.K()) * sizeof(typename Seq::DataType); + } + + std::unique_ptr GetBucket(size_t idx, bool unlink = true) override { + unsigned K = splitter_.K(); + return std::unique_ptr(new BucketStorage(GetMergedKMersFname((unsigned)idx), Seq::GetDataSize(K), unlink)); + } + + size_t Count(unsigned num_buckets, unsigned num_threads) override { + unsigned K = splitter_.K(); + unsigned num_files = num_buckets * num_threads; + + // Split k-mers into buckets. + INFO("Splitting kmer instances into " << num_files << " buckets using " << num_threads << " threads. This might take a while."); + fs::files_t raw_kmers = splitter_.Split(num_files, num_threads); + + INFO("Starting k-mer counting."); + size_t kmers = 0; +# pragma omp parallel for shared(raw_kmers) num_threads(num_threads) schedule(dynamic) reduction(+:kmers) + for (unsigned iFile = 0; iFile < raw_kmers.size(); ++iFile) { + kmers += MergeKMers(raw_kmers[iFile], GetUniqueKMersFname(iFile), K); + } + INFO("K-mer counting done. There are " << kmers << " kmers in total. "); + if (!kmers) { + FATAL_ERROR("No kmers were extracted from reads. 
+      FATAL_ERROR("No kmers were extracted from reads. Check the read lengths and k-mer length settings");
+      exit(-1);
+    }
+
+    INFO("Merging temporary buckets.");
+    for (unsigned i = 0; i < num_buckets; ++i) {
+      std::string ofname = GetMergedKMersFname(i);
+      std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary);
+      for (unsigned j = 0; j < num_threads; ++j) {
+        BucketStorage ins(GetUniqueKMersFname(i + j * num_buckets), Seq::GetDataSize(K), /* unlink */ true);
+        ofs.write((const char*)ins.data(), ins.data_size());
+      }
+    }
+
+    return kmers;
+  }
+
+  void MergeBuckets(unsigned num_buckets) override {
+    unsigned K = splitter_.K();
+
+    INFO("Merging final buckets.");
+
+    MMappedRecordArrayWriter<typename Seq::DataType> os(GetFinalKMersFname(), Seq::GetDataSize(K));
+    std::string ofname = GetFinalKMersFname();
+    std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary);
+    for (unsigned j = 0; j < num_buckets; ++j) {
+      auto bucket = GetBucket(j, /* unlink */ true);
+      ofs.write((const char*)bucket->data(), bucket->data_size());
+    }
+    ofs.close();
+  }
+
+  size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) override {
+    size_t kmers = Count(num_buckets, num_threads);
+    if (merge)
+      MergeBuckets(num_buckets);
+
+    return kmers;
+  }
+
+  std::unique_ptr<typename __super::FinalKMerStorage> GetFinalKMers() override {
+    unsigned K = splitter_.K();
+    return std::unique_ptr<typename __super::FinalKMerStorage>(new typename __super::FinalKMerStorage(GetFinalKMersFname(), Seq::GetDataSize(K), /* unlink */ true));
+  }
+
+  std::string GetMergedKMersFname(unsigned suffix) const {
+    return kmer_prefix_ + ".merged." + std::to_string(suffix);
+  }
+
+  std::string GetFinalKMersFname() const {
+    return kmer_prefix_ + ".final";
+  }
+
+private:
+  std::string work_dir_;
+  KMerSplitter<Seq> &splitter_;
+  int fd_;
+  std::string kmer_prefix_;
+
+  std::string GetUniqueKMersFname(unsigned suffix) const {
+    return kmer_prefix_ + ".unique." + std::to_string(suffix);
+  }
+
+  size_t MergeKMers(const std::string &ifname, const std::string &ofname,
+                    unsigned K) {
+    MMappedRecordArrayReader<typename Seq::DataType> ins(ifname, Seq::GetDataSize(K), /* unlink */ true);
+
+    std::string IdxFileName = ifname + ".idx";
+    if (FILE *f = fopen(IdxFileName.c_str(), "rb")) {
+      fclose(f);
+      MMappedRecordReader<size_t> index(ifname + ".idx", true, -1ULL);
+
+      // INFO("Total runs: " << index.size());
+
+      // Prepare runs
+      std::vector<adt::iterator_range<decltype(ins.begin())>> ranges;
+      auto beg = ins.begin();
+      for (size_t sz : index) {
+        auto end = std::next(beg, sz);
+        ranges.push_back(adt::make_range(beg, end));
+        VERIFY(std::is_sorted(beg, end, adt::array_less<typename Seq::DataType>()));
+        beg = end;
+      }
+
+      // Construct tree on top entries of runs
+      adt::loser_tree<decltype(beg), adt::array_less<typename Seq::DataType>> tree(ranges);
+
+      if (tree.empty()) {
+        FILE *g = fopen(ofname.c_str(), "ab");
+        VERIFY_MSG(g, "Cannot open temporary file to write");
+        fclose(g);
+        return 0;
+      }
+
+      // Write it down!
+      adt::KMerVector<Seq> buf(K, 1024*1024);
+      auto pval = tree.pop();
+      size_t total = 0;
+      while (!tree.empty()) {
+        buf.clear();
+        for (size_t cnt = 0; cnt < buf.capacity() && !tree.empty(); ) {
+          auto cval = tree.pop();
+          if (!adt::array_equal_to<typename Seq::DataType>()(pval, cval)) {
+            buf.push_back(pval);
+            pval = cval;
+            cnt += 1;
+          }
+        }
+        total += buf.size();
+
+        FILE *g = fopen(ofname.c_str(), "ab");
+        VERIFY_MSG(g, "Cannot open temporary file to write");
+        fwrite(buf.data(), buf.el_data_size(), buf.size(), g);
+        fclose(g);
+      }
+
+      // Handle very last value
+      {
+        FILE *g = fopen(ofname.c_str(), "ab");
+        VERIFY_MSG(g, "Cannot open temporary file to write");
+        fwrite(pval.data(), pval.data_size(), 1, g);
+        fclose(g);
+        total += 1;
+      }
+
+      return total;
+    } else {
+      // Sort the stuff
+      libcxx::sort(ins.begin(), ins.end(), adt::array_less<typename Seq::DataType>());
+
+      // FIXME: Use something like parallel version of unique_copy but with explicit
+      // resizing.
+      auto it = std::unique(ins.begin(), ins.end(), adt::array_equal_to<typename Seq::DataType>());
+
+      MMappedRecordArrayWriter<typename Seq::DataType> os(ofname, Seq::GetDataSize(K));
+      os.resize(it - ins.begin());
+      std::copy(ins.begin(), it, os.begin());
+
+      return it - ins.begin();
+    }
+  }
+};
+
+template<class Index>
+class KMerIndexBuilder {
+  typedef typename Index::KMerSeq Seq;
+  typedef typename Index::kmer_index_traits kmer_index_traits;
+
+  std::string work_dir_;
+  unsigned num_buckets_;
+  unsigned num_threads_;
+
+ public:
+  KMerIndexBuilder(const std::string &workdir,
+                   unsigned num_buckets, unsigned num_threads)
+      : work_dir_(workdir), num_buckets_(num_buckets), num_threads_(num_threads) {}
+  size_t BuildIndex(Index &out, KMerCounter<Seq> &counter,
+                    bool save_final = false);
+
+  unsigned num_buckets() const { return num_buckets_; }
+
+ private:
+
+  DECL_LOGGER("K-mer Index Building");
+};
+
+template<class Index>
+size_t KMerIndexBuilder<Index>::BuildIndex(Index &index, KMerCounter<Seq> &counter,
+                                           bool save_final) {
+  index.clear();
+
+  INFO("Building kmer index ");
+
+  // First, count the unique k-mers
+  size_t kmers = counter.Count(num_buckets_, num_threads_);
+
+  index.num_buckets_ = num_buckets_;
+  index.bucket_starts_.resize(num_buckets_ + 1);
+  index.index_ = new typename KMerIndex<kmer_index_traits>::KMerDataIndex[num_buckets_];
+
+  INFO("Building perfect hash indices");
+
+# pragma omp parallel for shared(index) num_threads(num_threads_)
+  for (unsigned iFile = 0; iFile < num_buckets_; ++iFile) {
+    typename KMerIndex<kmer_index_traits>::KMerDataIndex &data_index = index.index_[iFile];
+    auto bucket = counter.GetBucket(iFile, !save_final);
+    size_t sz = bucket->end() - bucket->begin();
+    index.bucket_starts_[iFile + 1] = sz;
+
+    data_index = typename Index::KMerDataIndex(sz,
+                                               boomphf::range(bucket->begin(), bucket->end()),
+                                               1, 2.0, false, false);
+  }
+
+  // Finally, record the sizes of buckets.
+  for (unsigned iFile = 1; iFile < num_buckets_; ++iFile)
+    index.bucket_starts_[iFile] += index.bucket_starts_[iFile - 1];
+
+  if (save_final)
+    counter.MergeBuckets(num_buckets_);
+
+  double bits_per_kmer = 8.0 * (double)index.mem_size() / (double)kmers;
+  INFO("Index built. Total " << index.mem_size() << " bytes occupied (" << bits_per_kmer << " bits per kmer).");
+  index.count_size();
+  return kmers;
+}
+}
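Taken together, the new kmer_mph headers form a pipeline: a KMerSplitter shards raw k-mer occurrences into hash-addressed files on disk, KMerDiskCounter sorts and deduplicates each bucket, and KMerIndexBuilder builds one BooPHF perfect hash per bucket. A minimal usage sketch of the API added above (the concrete splitter instance and the bucket/thread counts are illustrative, not mandated by the patch):

    // Sketch only: count unique k-mers, then build a perfect-hash index.
    typedef utils::KMerIndex<utils::kmer_index_traits<RtSeq>> Index;

    size_t build_index_sketch(const std::string &workdir,
                              utils::KMerSplitter<RtSeq> &splitter) {
        utils::KMerDiskCounter<RtSeq> counter(workdir, splitter);
        Index index;
        utils::KMerIndexBuilder<Index> builder(workdir, /*num_buckets*/ 16, /*num_threads*/ 4);
        return builder.BuildIndex(index, counter, /*save_final*/ false);
    }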
diff -Nru spades-3.10.1+dfsg/src/common/utils/kmer_mph/kmer_index.hpp spades-3.11.1+dfsg/src/common/utils/kmer_mph/kmer_index.hpp
--- spades-3.10.1+dfsg/src/common/utils/kmer_mph/kmer_index.hpp	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/kmer_mph/kmer_index.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -0,0 +1,138 @@
+#pragma once
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "kmer_index_traits.hpp"
+
+#include <city/city.h>
+#include <boomphf/BooPHF.h>
+
+#include <vector>
+#include <utility>
+
+namespace utils {
+
+template<class Index>
+class KMerIndexBuilder;
+
+template<class traits>
+class KMerIndex {
+public:
+  typedef traits kmer_index_traits;
+  typedef typename traits::SeqType KMerSeq;
+  typedef typename traits::hash_function hash_function;
+  typedef typename traits::KMerRawData KMerRawData;
+  typedef typename traits::KMerRawReference KMerRawReference;
+  typedef size_t IdxType;
+
+private:
+  struct hash_function128 {
+    std::pair<uint64_t, uint64_t> operator()(const KMerSeq &k) const{
+      return CityHash128((const char *)k.data(), k.data_size() * sizeof(typename KMerSeq::DataType));
+    }
+    std::pair<uint64_t, uint64_t> operator()(const KMerRawReference k) const {
+      return CityHash128((const char *)k.data(), k.size() * sizeof(typename KMerSeq::DataType));
+    }
+  };
+  typedef KMerIndex __self;
+  typedef boomphf::mphf<hash_function128> KMerDataIndex;
+
+public:
+  KMerIndex(): index_(NULL), num_buckets_(0), size_(0) {}
+
+  KMerIndex(const KMerIndex&) = delete;
+  KMerIndex& operator=(const KMerIndex&) = delete;
+
+  ~KMerIndex() { clear(); }
+
+  void clear() {
+    num_buckets_ = 0;
+    bucket_starts_.clear();
+
+    delete[] index_;
+    index_ = NULL;
+  }
+
+  size_t mem_size() {
+    size_t sz = 0;
+    for (size_t i = 0; i < num_buckets_; ++i)
+      sz += index_[i].mem_size();
+
+    return sz;
+  }
+
+  void count_size() {
+    if (index_ == NULL)
+      return;
+    size_ = 0;
+    for (size_t i = 0; i < num_buckets_; i++)
+      size_ += index_[i].size();
+  }
+
+  size_t size() const {
+    return size_;
+  }
+
+  size_t seq_idx(const KMerSeq &s) const {
+    size_t bucket = seq_bucket(s);
+
+    return bucket_starts_[bucket] + index_[bucket].lookup(s);
+  }
+
+  size_t raw_seq_idx(const KMerRawReference data) const {
+    size_t bucket = raw_seq_bucket(data);
+
+    return bucket_starts_[bucket] + index_[bucket].lookup(data);
+  }
+
+  template<class Writer>
+  void serialize(Writer &os) const {
+    os.write((char*)&num_buckets_, sizeof(num_buckets_));
+    for (size_t i = 0; i < num_buckets_; ++i)
+      index_[i].save(os);
+    os.write((char*)&bucket_starts_[0], (num_buckets_ + 1) * sizeof(bucket_starts_[0]));
+  }
+
+  template<class Reader>
+  void deserialize(Reader &is) {
+    clear();
+
+    is.read((char*)&num_buckets_, sizeof(num_buckets_));
+
+    index_ = new KMerDataIndex[num_buckets_];
+    for (size_t i = 0; i < num_buckets_; ++i)
+      index_[i].load(is);
+
+    bucket_starts_.resize(num_buckets_ + 1);
+    is.read((char*)&bucket_starts_[0], (num_buckets_ + 1) * sizeof(bucket_starts_[0]));
+    count_size();
+  }
+
+  void swap(KMerIndex &other) {
+    std::swap(index_, other.index_);
+    std::swap(num_buckets_, other.num_buckets_);
+    std::swap(size_, other.size_);
+    std::swap(bucket_starts_, other.bucket_starts_);
+  }
+
+ private:
+  KMerDataIndex *index_;
+
+  size_t num_buckets_;
+  std::vector<size_t> bucket_starts_;
+  size_t size_;
+
+  size_t seq_bucket(const KMerSeq &s) const {
+    return hash_function()(s) % num_buckets_;
+  }
+  size_t raw_seq_bucket(const KMerRawReference data) const {
+    return hash_function()(data) % num_buckets_;
+  }
+
+  friend class KMerIndexBuilder<__self>;
+};
+}
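The lookup in KMerIndex is a two-level scheme: a hash picks the bucket, the bucket's minimal perfect hash returns a dense in-bucket rank, and bucket_starts_ (the prefix sum over bucket sizes computed in BuildIndex) turns that rank into a global slot. A toy restatement of that arithmetic, with invented sizes:

    // With three buckets of sizes {10, 5, 7}, bucket_starts_ = {0, 10, 15, 22}.
    // A k-mer in bucket 1 with in-bucket rank 4 gets global slot 10 + 4 = 14;
    // slots of different buckets never collide and densely cover [0, 22).
    size_t global_slot(const std::vector<size_t> &bucket_starts,
                       size_t bucket, size_t in_bucket_rank) {
        return bucket_starts[bucket] + in_bucket_rank;  // same arithmetic as seq_idx()
    }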
diff -Nru spades-3.10.1+dfsg/src/common/utils/kmer_mph/kmer_index_traits.hpp spades-3.11.1+dfsg/src/common/utils/kmer_mph/kmer_index_traits.hpp
--- spades-3.10.1+dfsg/src/common/utils/kmer_mph/kmer_index_traits.hpp	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/kmer_mph/kmer_index_traits.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -0,0 +1,80 @@
+#pragma once
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "io/kmers/mmapped_reader.hpp"
+
+namespace utils {
+
+template<class Seq>
+struct kmer_index_traits {
+  typedef Seq SeqType;
+  typedef MMappedRecordArrayReader<typename Seq::DataType> RawKMerStorage;
+  typedef MMappedRecordArrayReader<typename Seq::DataType> FinalKMerStorage;
+  typedef typename RawKMerStorage::iterator raw_data_iterator;
+  typedef typename RawKMerStorage::const_iterator raw_data_const_iterator;
+  typedef typename RawKMerStorage::iterator::value_type KMerRawData;
+  typedef typename RawKMerStorage::iterator::reference KMerRawReference;
+  typedef typename RawKMerStorage::const_iterator::reference KMerRawConstReference;
+
+  struct raw_equal_to {
+    bool operator()(const Seq &lhs, const KMerRawReference rhs) {
+      return (adt::array_equal_to<typename Seq::DataType>()(lhs.data(), lhs.data_size(), rhs));
+    }
+  };
+
+  struct raw_create {
+    Seq operator()(unsigned K, const KMerRawReference kmer) {
+      return Seq(K, kmer.data());
+    }
+    Seq operator()(unsigned K, const KMerRawConstReference kmer) {
+      return Seq(K, kmer.data());
+    }
+  };
+
+  struct hash_function {
+    uint64_t operator()(const Seq &k, uint64_t seed = 0) const{
+      return typename Seq::hash()(k, (uint32_t)seed);
+    }
+    uint64_t operator()(const KMerRawReference k, uint64_t seed = 0) const {
+      return typename Seq::hash()(k.data(), k.size(), (uint32_t)seed);
+    }
+  };
+
+  template<class Writer>
+  static void raw_serialize(Writer &writer, RawKMerStorage *data) {
+    size_t sz = data->data_size(), elcnt = data->elcnt();
+    unsigned PageSize = getpagesize();
+    writer.write((char*)&sz, sizeof(sz));
+    writer.write((char*)&elcnt, sizeof(elcnt));
+    // Make sure data is aligned to the page boundary
+    size_t cpos = writer.tellp();
+    size_t pos = (cpos + PageSize - 1 + sizeof(size_t)) / PageSize * PageSize;
+    size_t off = pos - writer.tellp();
+    writer.write((char*)&off, sizeof(off));
+    writer.seekp(pos);
+    writer.write((char*)data->data(), data->data_size());
+  }
+
+  template<class Writer>
+  static void raw_serialize(Writer &writer, const std::unique_ptr<RawKMerStorage> &data) {
+    raw_serialize(writer, data.get());
+  }
+
+  template<class Reader>
+  static std::unique_ptr<RawKMerStorage> raw_deserialize(Reader &reader, const std::string &FileName) {
+    size_t sz, off, elcnt;
+    reader.read((char*)&sz, sizeof(sz));
+    reader.read((char*)&elcnt, sizeof(elcnt));
+    reader.read((char*)&off, sizeof(off));
+    off -= sizeof(off);
+    off += reader.tellg();
+
+    return std::unique_ptr<RawKMerStorage>(new RawKMerStorage(FileName, elcnt, false, off, sz));
+  }
+
+};
+}
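The offset juggling in raw_serialize/raw_deserialize exists so the k-mer payload always starts on a page boundary and can later be mmap()ed directly. A worked example, assuming a 4096-byte page (the numbers are invented):

    size_t page = 4096;
    size_t cpos = 70;   // stream position just before 'off' is written
    size_t pos  = (cpos + page - 1 + sizeof(size_t)) / page * page;  // = 4096
    size_t off  = pos - cpos;                                        // = 4026, stored in the stream
    // The '+ sizeof(size_t)' reserves room for the 'off' field itself before the
    // boundary. The reader, standing at cpos + sizeof(off) after reading it,
    // computes (off - sizeof(off)) + (cpos + sizeof(off)) = cpos + off = 4096,
    // i.e. exactly the page-aligned start of the payload.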
diff -Nru spades-3.10.1+dfsg/src/common/utils/kmer_mph/kmer_splitters.hpp spades-3.11.1+dfsg/src/common/utils/kmer_mph/kmer_splitters.hpp
--- spades-3.10.1+dfsg/src/common/utils/kmer_mph/kmer_splitters.hpp	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/kmer_mph/kmer_splitters.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -0,0 +1,394 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "io/reads/io_helper.hpp"
+#include "utils/filesystem/file_limit.hpp"
+
+namespace utils {
+
+template<class Seq>
+class KMerSplitter {
+public:
+  typedef typename Seq::hash hash_function;
+
+  KMerSplitter(const std::string &work_dir, unsigned K, uint32_t seed = 0)
+      : work_dir_(work_dir), K_(K), seed_(seed) {}
+
+  virtual ~KMerSplitter() {}
+
+  virtual fs::files_t Split(size_t num_files, unsigned nthreads) = 0;
+
+  size_t kmer_size() const {
+    return Seq::GetDataSize(K_) * sizeof(typename Seq::DataType);
+  }
+
+  unsigned K() const { return K_; }
+
+protected:
+  const std::string &work_dir_;
+  hash_function hash_;
+  unsigned K_;
+  uint32_t seed_;
+
+  DECL_LOGGER("K-mer Splitting");
+};
+
+template<class Seq>
+class KMerSortingSplitter : public KMerSplitter<Seq> {
+public:
+  KMerSortingSplitter(const std::string &work_dir, unsigned K, uint32_t seed = 0)
+      : KMerSplitter<Seq>(work_dir, K, seed), cell_size_(0), num_files_(0) {}
+
+protected:
+  using SeqKMerVector = adt::KMerVector<Seq>;
+  using KMerBuffer = std::vector<SeqKMerVector>;
+
+  std::vector<KMerBuffer> kmer_buffers_;
+  size_t cell_size_;
+  size_t num_files_;
+
+  fs::files_t PrepareBuffers(size_t num_files, unsigned nthreads, size_t reads_buffer_size) {
+    num_files_ = num_files;
+
+    // Determine the set of output files
+    fs::files_t out;
+    for (unsigned i = 0; i < num_files_; ++i)
+      out.push_back(this->GetRawKMersFname(i));
+
+    size_t file_limit = num_files_ + 2*nthreads;
+    size_t res = limit_file(file_limit);
+    if (res < file_limit) {
+      WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
+      WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
+    }
+
+    if (reads_buffer_size == 0) {
+      reads_buffer_size = 536870912ull;
+      size_t mem_limit = (size_t)((double)(utils::get_free_memory()) / (nthreads * 3));
+      INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
+      reads_buffer_size = std::min(reads_buffer_size, mem_limit);
+    }
+    cell_size_ = reads_buffer_size / (num_files_ * this->kmer_size());
+    // Set sane minimum cell size
+    if (cell_size_ < 16384)
+      cell_size_ = 16384;
+
+    INFO("Using cell size of " << cell_size_);
+    kmer_buffers_.resize(nthreads);
+    for (unsigned i = 0; i < nthreads; ++i) {
+      KMerBuffer &entry = kmer_buffers_[i];
+      entry.resize(num_files_, adt::KMerVector<Seq>(this->K_, (size_t) (1.1 * (double) cell_size_)));
+    }
+
+    return out;
+  }
+
+  bool push_back_internal(const Seq &seq, unsigned thread_id) {
+    KMerBuffer &entry = kmer_buffers_[thread_id];
+
+    size_t idx = this->GetFileNumForSeq(seq, (unsigned)num_files_);
+    entry[idx].push_back(seq);
+    return entry[idx].size() > cell_size_;
+  }
+
+  void DumpBuffers(const fs::files_t &ostreams) {
+    VERIFY(ostreams.size() == num_files_ && kmer_buffers_[0].size() == num_files_);
+
+# pragma omp parallel for
+    for (unsigned k = 0; k < num_files_; ++k) {
+      // Below k is thread id!
+
+      size_t sz = 0;
+      for (size_t i = 0; i < kmer_buffers_.size(); ++i)
+        sz += kmer_buffers_[i][k].size();
+
+      adt::KMerVector<Seq> SortBuffer(this->K_, sz);
+      for (auto & entry : kmer_buffers_) {
+        const auto &buffer = entry[k];
+        for (size_t j = 0; j < buffer.size(); ++j)
+          SortBuffer.push_back(buffer[j]);
+      }
+      libcxx::sort(SortBuffer.begin(), SortBuffer.end(), typename adt::KMerVector<Seq>::less2_fast());
+      auto it = std::unique(SortBuffer.begin(), SortBuffer.end(), typename adt::KMerVector<Seq>::equal_to());
+
+# pragma omp critical
+      {
+        size_t cnt = it - SortBuffer.begin();
+
+        // Write k-mers
+        FILE *f = fopen(ostreams[k].c_str(), "ab");
+        VERIFY_MSG(f, "Cannot open temporary file to write");
+        fwrite(SortBuffer.data(), SortBuffer.el_data_size(), cnt, f);
+        fclose(f);
+
+        // Write index
+        f = fopen((ostreams[k] + ".idx").c_str(), "ab");
+        VERIFY_MSG(f, "Cannot open temporary file to write");
+        fwrite(&cnt, sizeof(cnt), 1, f);
+        fclose(f);
+      }
+    }
+
+    for (auto & entry : kmer_buffers_)
+      for (auto & eentry : entry)
+        eentry.clear();
+  }
+
+  void ClearBuffers() {
+    for (auto & entry : kmer_buffers_)
+      for (auto & eentry : entry) {
+        eentry.clear();
+        eentry.shrink_to_fit();
+      }
+  }
+
+  std::string GetRawKMersFname(unsigned suffix) const {
+    return fs::append_path(this->work_dir_, "kmers.raw." + std::to_string(suffix));
+  }
+
+  unsigned GetFileNumForSeq(const Seq &s, unsigned total) const {
+    return (unsigned)(this->hash_(s, this->seed_) % total);
+  }
+
+};
+
+using RtSeqKMerSplitter = KMerSortingSplitter<RtSeq>;
+
+template<class KmerFilter>
+class DeBruijnKMerSplitter : public RtSeqKMerSplitter {
+ private:
+  KmerFilter kmer_filter_;
+ protected:
+  size_t read_buffer_size_;
+ protected:
+  bool FillBufferFromSequence(const Sequence &seq,
+                              unsigned thread_id) {
+    if (seq.size() < this->K_)
+      return false;
+
+    RtSeq kmer = seq.start<RtSeq>(this->K_) >> 'A';
+    bool stop = false;
+    for (size_t j = this->K_ - 1; j < seq.size(); ++j) {
+      kmer <<= seq[j];
+      if (!kmer_filter_.filter(kmer))
+        continue;
+
+      stop |= this->push_back_internal(kmer, thread_id);
+    }
+
+    return stop;
+  }
+
+  bool FillBufferFromSequence(const RtSeq &seq,
+                              unsigned thread_id) {
+    if (seq.size() < this->K_)
+      return false;
+
+    RtSeq kmer = seq.start(this->K_) >> 'A';
+    bool stop = false;
+    for (size_t j = this->K_ - 1; j < seq.size(); ++j) {
+      kmer <<= seq[j];
+      if (!kmer_filter_.filter(kmer))
+        continue;
+
+      stop |= this->push_back_internal(kmer, thread_id);
+    }
+
+    return stop;
+  }
+
+ public:
+  DeBruijnKMerSplitter(const std::string &work_dir,
+                       unsigned K, KmerFilter kmer_filter, size_t read_buffer_size = 0, uint32_t seed = 0)
+      : RtSeqKMerSplitter(work_dir, K, seed), kmer_filter_(kmer_filter), read_buffer_size_(read_buffer_size) {
+  }
+ protected:
+  DECL_LOGGER("DeBruijnKMerSplitter");
+};
+
+struct ReadStatistics {
+  size_t reads_;
+  size_t max_read_length_;
+  size_t bases_;
+};
+
+template<class Read, class KmerFilter>
+class DeBruijnReadKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
+  io::ReadStreamList<Read> &streams_;
+  io::SingleStream *contigs_;
+
+  template<class ReadStream>
+  ReadStatistics
+  FillBufferFromStream(ReadStream& stream, unsigned thread_id);
+
+  ReadStatistics rs_;
+
+ public:
+  DeBruijnReadKMerSplitter(const std::string &work_dir,
+                           unsigned K, uint32_t seed,
+                           io::ReadStreamList<Read>& streams,
+                           io::SingleStream* contigs_stream = 0,
+                           size_t read_buffer_size = 0)
+      : DeBruijnKMerSplitter<KmerFilter>(work_dir, K, KmerFilter(), read_buffer_size, seed),
+        streams_(streams), contigs_(contigs_stream), rs_({0 ,0 ,0}) {}
+
+  fs::files_t Split(size_t num_files, unsigned nthreads) override;
+
+  size_t read_length() const { return rs_.max_read_length_; }
+  ReadStatistics stats() const { return rs_; }
+};
+
+template<class Read, class KmerFilter>
+template<class ReadStream>
+ReadStatistics
+DeBruijnReadKMerSplitter<Read, KmerFilter>::FillBufferFromStream(ReadStream &stream,
+                                                                 unsigned thread_id) {
+  typename ReadStream::ReadT r;
+  size_t reads = 0, rl = 0, bases = 0;
+
+  while (!stream.eof()) {
+    stream >> r;
+    rl = std::max(rl, r.size());
+    reads += 1;
+    bases += r.size();
+
+    if (this->FillBufferFromSequence(r.sequence(), thread_id))
+      break;
+  }
+  return { reads, rl, bases };
+}
+
+template<class Read, class KmerFilter>
+fs::files_t DeBruijnReadKMerSplitter<Read, KmerFilter>::Split(size_t num_files, unsigned nthreads) {
+  fs::files_t out = this->PrepareBuffers(num_files, nthreads, this->read_buffer_size_);
+
+  size_t counter = 0, rl = 0, bases = 0, n = 15;
+  streams_.reset();
+  while (!streams_.eof()) {
+# pragma omp parallel for num_threads(nthreads) reduction(+ : counter) reduction(+ : bases) shared(rl)
+    for (unsigned i = 0; i < (unsigned)streams_.size(); ++i) {
+      ReadStatistics stats = FillBufferFromStream(streams_[i], i);
+      counter += stats.reads_;
+      bases += stats.bases_;
+
+      // There is no max reduction in C/C++ OpenMP... Only in FORTRAN :(
+# pragma omp flush(rl)
+      if (stats.max_read_length_ > rl)
+# pragma omp critical
+      {
+        rl = std::max(rl, stats.max_read_length_);
+      }
+    }
+
+    this->DumpBuffers(out);
+
+    if (counter >> n) {
+      INFO("Processed " << counter << " reads");
+      n += 1;
+    }
+  }
+
+  if (contigs_) {
+    INFO("Adding contigs from previous K");
+    unsigned cnt = 0;
+    contigs_->reset();
+    while (!contigs_->eof()) {
+      FillBufferFromStream(*contigs_, cnt);
+      this->DumpBuffers(out);
+      if (++cnt >= nthreads)
+        cnt = 0;
+    }
+  }
+
+  this->ClearBuffers();
+
+  INFO("Used " << counter << " reads. Maximum read length " << rl);
+  INFO("Average read length " << double(bases) / double(counter));
+  rs_ = { counter, rl, bases };
+
+  return out;
+}
+
+template<class KmerFilter>
+class DeBruijnKMerKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
+  typedef MMappedFileRecordArrayIterator<RtSeq::DataType> kmer_iterator;
+
+  unsigned K_source_;
+  std::vector<std::string> kmers_;
+  bool add_rc_;
+
+  size_t FillBufferFromKMers(kmer_iterator &kmer,
+                             unsigned thread_id);
+
+ public:
+  DeBruijnKMerKMerSplitter(const std::string &work_dir,
+                           unsigned K_target, unsigned K_source, bool add_rc, size_t read_buffer_size = 0)
+      : DeBruijnKMerSplitter<KmerFilter>(work_dir, K_target, KmerFilter(), read_buffer_size),
+        K_source_(K_source), add_rc_(add_rc) {}
+
+  void AddKMers(const std::string &file) {
+    kmers_.push_back(file);
+  }
+
+  fs::files_t Split(size_t num_files, unsigned nthreads) override;
+};
+
+template<class KmerFilter>
+inline size_t DeBruijnKMerKMerSplitter<KmerFilter>::FillBufferFromKMers(kmer_iterator &kmer,
+                                                                        unsigned thread_id) {
+  size_t seqs = 0;
+  for (; kmer.good(); ++kmer) {
+    RtSeq nucls(K_source_, *kmer);
+    seqs += 1;
+
+    bool stop = this->FillBufferFromSequence(nucls, thread_id);
+    if (add_rc_)
+      stop |= this->FillBufferFromSequence(!nucls, thread_id);
+
+    if (stop)
+      break;
+  }
+
+  return seqs;
+}
+
+template<class KmerFilter>
+fs::files_t DeBruijnKMerKMerSplitter<KmerFilter>::Split(size_t num_files, unsigned nthreads) {
+  unsigned nit = (unsigned) kmers_.size();
+
+  fs::files_t out = this->PrepareBuffers(num_files, nthreads, this->read_buffer_size_);
+
+  size_t counter = 0, n = 10;
+  std::vector<kmer_iterator> its;
+  its.reserve(nit);
+  for (auto it = kmers_.begin(), et = kmers_.end(); it != et; ++it)
+    its.emplace_back(*it, RtSeq::GetDataSize(K_source_));
+
+  while (std::any_of(its.begin(), its.end(),
+                     [](const kmer_iterator &it) { return it.good(); })) {
+# pragma omp parallel for num_threads(nthreads) reduction(+ : counter)
+    for (unsigned i = 0; i < nit; ++i)
+      counter += FillBufferFromKMers(its[i], i);
+
+    this->DumpBuffers(out);
+
+    if (counter >> n) {
+      INFO("Processed " << counter << " kmers");
+      n += 1;
+    }
+  }
+
+  INFO("Used " << counter << " kmers.");
+
+  this->ClearBuffers();
+
+  return out;
+}
+
+
+}
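All splitters above rely on one invariant of GetFileNumForSeq: the target file depends only on the k-mer's hash (plus the fixed seed), so every occurrence of the same k-mer — from any read, on any thread — lands in the same file, and each file can then be sorted and deduplicated independently. A minimal model of that routing, with std::hash standing in for Seq::hash purely for illustration:

    #include <functional>
    #include <string>

    // Stand-in for KMerSortingSplitter::GetFileNumForSeq (seed omitted).
    unsigned bucket_of(const std::string &kmer, unsigned num_files) {
        return (unsigned)(std::hash<std::string>()(kmer) % num_files);
    }
    // bucket_of("ACGT", 16) always yields the same file index, so duplicates
    // of "ACGT" meet in one bucket and a per-file sort/unique removes them.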
diff -Nru spades-3.10.1+dfsg/src/common/utils/levenshtein.hpp spades-3.11.1+dfsg/src/common/utils/levenshtein.hpp
--- spades-3.10.1+dfsg/src/common/utils/levenshtein.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/levenshtein.hpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,241 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "utils/simple_tools.hpp"
-
-/*
- * Little modified copy-paste from http://www.merriampark.com/ldcpp.htm
- */
-inline size_t edit_distance(const std::string &source, const std::string &target) {
-
-    // Step 1
-
-    const size_t n = source.length();
-    const size_t m = target.length();
-    if (n == 0) {
-        return m;
-    }
-    if (m == 0) {
-        return n;
-    }
-
-    // Good form to declare a TYPEDEF
-
-    typedef std::vector<std::vector<size_t> > Tmatrix;
-
-    Tmatrix matrix(n + 1);
-
-    // Size the vectors in the 2.nd dimension. Unfortunately C++ doesn't
-    // allow for allocation on declaration of 2.nd dimension of vec of vec
-
-    for (size_t i = 0; i <= n; i++) {
-        matrix[i].resize(m + 1);
-    }
-
-    // Step 2
-
-    for (size_t i = 0; i <= n; i++) {
-        matrix[i][0] = i;
-    }
-
-    for (size_t j = 0; j <= m; j++) {
-        matrix[0][j] = j;
-    }
-
-    // Step 3
-
-    for (size_t i = 1; i <= n; i++) {
-
-        const char s_i = source[i - 1];
-
-        // Step 4
-
-        for (size_t j = 1; j <= m; j++) {
-
-            const char t_j = target[j - 1];
-
-            // Step 5
-
-            size_t cost;
-            if (s_i == t_j) {
-                cost = 0;
-            }
-            else {
-                cost = 1;
-            }
-
-            // Step 6
-
-            const size_t above = matrix[i - 1][j];
-            const size_t left = matrix[i][j - 1];
-            const size_t diag = matrix[i - 1][j - 1];
-            size_t cell = std::min(above + 1, std::min(left + 1, diag + cost));
-
-            // Step 6A: Cover transposition, in addition to deletion,
-            // insertion and substitution. This step is taken from:
-            // Berghel, Hal ; Roach, David : "An Extension of Ukkonen's
-            // Enhanced Dynamic Programming ASM Algorithm"
-            // (http://www.acm.org/~hlb/publications/asm/asm.html)
-
-            if (i > 2 && j > 2) {
-                size_t trans = matrix[i - 2][j - 2] + 1;
-                if (source[i - 2] != t_j) trans++;
-                if (s_i != target[j - 2]) trans++;
-                if (cell > trans) cell = trans;
-            }
-
-            matrix[i][j] = cell;
-        }
-    }
-
-    // Step 7
-
-    return matrix[n][m];
-}
-
-inline std::pair<std::pair<size_t, size_t>, std::string> best_edit_distance_cigar(const std::string &source,
-                                                                                  const std::string &target) {
-
-    // Step 1
-
-    const size_t n = source.length();
-    const size_t m = target.length();
-//    if (n == 0) {
-//        return m;
-//    }
-//    if (m == 0) {
-//        return n;
-//    }
-
-    // Good form to declare a TYPEDEF
-
-    typedef std::vector<std::vector<int> > Tmatrix;
-
-    Tmatrix matrix(n + 1);
-
-    // Size the vectors in the 2.nd dimension. Unfortunately C++ doesn't
-    // allow for allocation on declaration of 2.nd dimension of vec of vec
-
-    for (size_t i = 0; i <= n; i++) {
-        matrix[i].resize(m + 1);
-    }
-
-    // Step 2
-
-    for (size_t i = 0; i <= n; i++) {
-        matrix[i][0] = (int) i;
-    }
-
-    for (size_t j = 0; j <= m; j++) {
-        matrix[0][j] = 0; //free inserts in front
-    }
-
-    // Step 3
-
-    for (size_t i = 1; i <= n; i++) {
-
-        const char s_i = source[i - 1];
-
-        // Step 4
-
-        for (size_t j = 1; j <= m; j++) {
-
-            const char t_j = target[j - 1];
-
-            // Step 5
-
-            int cost;
-            if (s_i == t_j) {
-                cost = 0;
-            }
-            else {
-                cost = 1;
-            }
-
-            // Step 6
-
-            const int above = matrix[i - 1][j];
-            const int left = matrix[i][j - 1];
-            const int diag = matrix[i - 1][j - 1];
-            int cell = std::min(above + 1, std::min(left + 1, diag + cost));
-
-            // Step 6A: Cover transposition, in addition to deletion,
-            // insertion and substitution. This step is taken from:
-            // Berghel, Hal ; Roach, David : "An Extension of Ukkonen's
-            // Enhanced Dynamic Programming ASM Algorithm"
-            // (http://www.acm.org/~hlb/publications/asm/asm.html)
-
-//            if (i>2 && j>2) {
-//                int trans=matrix[i-2][j-2]+1;
-//                if (source[i-2]!=t_j) trans++;
-//                if (s_i!=target[j-2]) trans++;
-//                if (cell>trans) cell=trans;
-//            }
-
-            matrix[i][j] = cell;
-        }
-    }
-
-    // Step 7
-    int min = matrix[n][m];
-    size_t min_m = m;
-
-    for (size_t j = 0; j <= m; j++) {
-        if (min > matrix[n][j]) {
-            min = matrix[n][j];
-            min_m = j;
-        }
-    }
-
-//    INFO("min = " << min << " min_m = " << min_m);
-
-    std::string res = "";
-    char last_operation = 0;
-    size_t cnt_last_operation = 0;
-
-    size_t cur_pos_i = n;
-    size_t cur_pos_j = min_m;
-    char cur_operation = 0;
-
-//    if (min > 0) {
-//        for (int i = 0; i <= n; i++) {
-//            INFO(ToString(matrix[i]));
-//        }
-//    }
-
-    while ((cur_pos_i > 0) && (cur_pos_j > 0)) {
-        if (matrix[cur_pos_i - 1][cur_pos_j] < matrix[cur_pos_i][cur_pos_j]) {
-            cur_operation = 'I';
-            cur_pos_i--;
-        }
-        else {
-            if (matrix[cur_pos_i][cur_pos_j - 1] < matrix[cur_pos_i][cur_pos_j]) {
-                cur_operation = 'D';
-                cur_pos_j--;
-            }
-            else {
-                cur_operation = 'M';
-                cur_pos_i--;
-                cur_pos_j--;
-            }
-        }
-        if (cur_operation != last_operation) {
-            if (last_operation != 0)
-                res = ToString(cnt_last_operation) + last_operation + res;
-            last_operation = cur_operation;
-            cnt_last_operation = 1;
-        }
-        else {
-            cnt_last_operation++;
-        }
-    }
-    res = ToString(cnt_last_operation) + last_operation + res;
-    return std::make_pair(std::make_pair(cur_pos_j, min_m), res);
-}
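For reference, the deleted helper computed the classic Levenshtein distance (with an optional transposition extension in Step 6A). A worked example:

    // edit_distance("kitten", "sitting") == 3:
    //   kitten -> sitten  (substitute 'k' with 's')
    //   sitten -> sittin  (substitute 'e' with 'i')
    //   sittin -> sitting (insert 'g')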
diff -Nru spades-3.10.1+dfsg/src/common/utils/logger/logger.hpp spades-3.11.1+dfsg/src/common/utils/logger/logger.hpp
--- spades-3.10.1+dfsg/src/common/utils/logger/logger.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/logger/logger.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -6,7 +6,7 @@
 //***************************************************************************
 
 #pragma once
-#include "utils/perfcounter.hpp"
+#include "utils/perf/perfcounter.hpp"
 
 #include
 #include
@@ -47,11 +47,8 @@
 /////////////////////////////////////////////////////
 struct writer {
-#ifdef SPADES_USE_JEMALLOC
   virtual void write_msg(double time_in_sec, size_t cmem, size_t max_rss, level l, const char* file, size_t line_num, const char* source, const char* msg) = 0;
-#else
-  virtual void write_msg(double time_in_sec, size_t max_rss, level l, const char* file, size_t line_num, const char* source, const char* msg) = 0;
-#endif
+
   virtual ~writer(){}
 };
@@ -97,7 +94,7 @@
 private:
   properties props_ ;
   std::vector<writer_ptr> writers_;
-  perf_counter timer_ ;
+  utils::perf_counter timer_ ;
 };
 
 std::shared_ptr<logger>& __logger();
diff -Nru spades-3.10.1+dfsg/src/common/utils/logger/logger_impl.cpp spades-3.11.1+dfsg/src/common/utils/logger/logger_impl.cpp
--- spades-3.10.1+dfsg/src/common/utils/logger/logger_impl.cpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/logger/logger_impl.cpp	2017-09-28 09:05:11.000000000 +0000
@@ -97,28 +97,27 @@
     return desired_level >= source_level;
 }
 
-#ifdef SPADES_USE_JEMALLOC
 void logger::log(level desired_level, const char* file, size_t line_num, const char* source, const char* msg) {
   double time = timer_.time();
+  size_t mem = -1ull;
+  size_t max_rss;
+
+#ifdef SPADES_USE_JEMALLOC
   const size_t *cmem = 0, *cmem_max = 0;
   size_t clen = sizeof(cmem);
 
   je_mallctl("stats.cactive", &cmem, &clen, NULL, 0);
   je_mallctl("stats.cactive_max", &cmem_max, &clen, NULL, 0);
-
-  for (auto it = writers_.begin(); it != writers_.end(); ++it)
-    (*it)->write_msg(time, (*cmem) / 1024, (*cmem_max) / 1024, desired_level, file, line_num, source, msg);
-}
+  mem = (*cmem) / 1024;
+  max_rss = (*cmem_max) / 1024;
 #else
-void logger::log(level desired_level, const char* file, size_t line_num, const char* source, const char* msg) {
-  double time = timer_.time();
-  size_t max_rss = get_max_rss();
+  max_rss = get_max_rss();
+#endif
 
   for (auto it = writers_.begin(); it != writers_.end(); ++it)
-    (*it)->write_msg(time, max_rss, desired_level, file, line_num, source, msg);
+    (*it)->write_msg(time, mem, max_rss, desired_level, file, line_num, source, msg);
 }
-#endif
 
 //
 void logger::add_writer(writer_ptr ptr)
diff -Nru spades-3.10.1+dfsg/src/common/utils/logger/log_writers.hpp spades-3.11.1+dfsg/src/common/utils/logger/log_writers.hpp
--- spades-3.10.1+dfsg/src/common/utils/logger/log_writers.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/logger/log_writers.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -7,7 +7,7 @@
 
 #pragma once
 
-#include "utils/path_helper.hpp"
+#include "utils/filesystem/path_helper.hpp"
 #include "logger.hpp"
 
 #include
@@ -15,29 +15,43 @@
 #include "config.hpp"
 
 #include <iostream>
+#include <mutex>
 
 namespace logging {
 
 struct console_writer : public writer {
-#ifdef SPADES_USE_JEMALLOC
     void write_msg(double time, size_t cmem, size_t max_rss, level l, const char *file, size_t line_num,
                    const char *source, const char *msg) {
-        std::cout << fmt::format("{:14s} {:>5s} / {:<5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}",
-                                 human_readable_time(time), human_readable_memory(cmem),
-                                 human_readable_memory(max_rss), logging::level_name(l),
-                                 source, path::filename(file), int(line_num), msg)
-                  << std::endl;
+        if (cmem != -1ull)
+            std::cout << fmt::format("{:14s} {:>5s} / {:<5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}",
+                                     utils::human_readable_time(time), utils::human_readable_memory(cmem),
+                                     utils::human_readable_memory(max_rss), logging::level_name(l),
+                                     source, fs::filename(file), int(line_num), msg)
+                      << std::endl;
+        else
+            std::cout << fmt::format("{:14s} {:^5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}",
+                                     utils::human_readable_time(time), utils::human_readable_memory(max_rss),
+                                     logging::level_name(l), source, fs::filename(file), int(line_num), msg)
+                      << std::endl;
+    }
+
+};
+
+class mutex_writer : public writer {
+    std::mutex writer_mutex_;
+    std::shared_ptr<writer> writer_;
+
+public:
+
+    mutex_writer(std::shared_ptr<writer> writer) : writer_(writer) {}
+
+    void write_msg(double time, size_t cmem, size_t max_rss, level l, const char *file, size_t line_num,
+                   const char *source, const char *msg) override {
+        std::lock_guard<std::mutex> guard(writer_mutex_);
+        writer_->write_msg(time, cmem, max_rss, l, file, line_num, source, msg);
     }
-#else
-    void write_msg(double time, size_t max_rss, level l, const char* file, size_t line_num,
-                   const char* source, const char* msg) {
-        std::cout << fmt::format("{:14s} {:^5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}",
-                                 human_readable_time(time), human_readable_memory(max_rss), logging::level_name(l),
-                                 source, path::filename(file), int(line_num), msg)
-                  << std::endl;
-    }
-#endif
 };
 
 } // logging
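The new mutex_writer is a decorator: it wraps any other writer and serializes write_msg calls, which matters now that logging happens from inside OpenMP parallel regions. A usage sketch; create_logger and attach_logger are assumed to be the surrounding logger API, so treat their exact spelling as illustrative:

    #include "utils/logger/log_writers.hpp"

    void attach_thread_safe_console_logger() {
        using namespace logging;
        logger *lg = create_logger("");                    // default properties
        lg->add_writer(std::make_shared<mutex_writer>(
            std::make_shared<console_writer>()));          // serialized console output
        attach_logger(lg);
    }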
diff -Nru spades-3.10.1+dfsg/src/common/utils/log.hpp spades-3.11.1+dfsg/src/common/utils/log.hpp
--- spades-3.10.1+dfsg/src/common/utils/log.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/log.hpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,33 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * Compile time log(n,base) function for use in templates
- *
- *  Created on: 02.03.2011
- *      Author: vyahhi
- */
-
-#ifndef LOG_HPP_
-#define LOG_HPP_
-
-template <size_t n, size_t base = 2>
-struct log_ {
-    const static size_t value = 1 + log_<n / base, base>::value;
-};
-
-template <size_t base>
-struct log_<1, base> {
-    const static size_t value = 0;
-};
-
-template <size_t base>
-struct log_<0, base> {
-    const static size_t value = 0;
-};
-
-#endif /* LOG_HPP_ */
diff -Nru spades-3.10.1+dfsg/src/common/utils/md5.h spades-3.11.1+dfsg/src/common/utils/md5.h
--- spades-3.10.1+dfsg/src/common/utils/md5.h	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/md5.h	2017-09-28 09:05:11.000000000 +0000
@@ -39,6 +39,8 @@
 #include
 #include
 
+namespace utils {
+
 #pragma region MD5 defines
 // Constants for MD5Transform routine.
 #define S11 7
@@ -390,4 +392,6 @@
 }
 } ;
 
+}
+
 #endif
diff -Nru spades-3.10.1+dfsg/src/common/utils/memory.hpp spades-3.11.1+dfsg/src/common/utils/memory.hpp
--- spades-3.10.1+dfsg/src/common/utils/memory.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/memory.hpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,69 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * memory.hpp
- *
- *  Created on: 05.03.2011
- *      Author: http://stackoverflow.com/questions/669438/how-to-get-memory-usage-at-run-time-in-c
- */
-
-#ifndef MEMORY_HPP_
-#define MEMORY_HPP_
-
-#include <unistd.h>
-#include <ios>
-#include <iostream>
-#include <fstream>
-#include <string>
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// process_mem_usage(unsigned long &, long &) - takes two longs by reference,
-// attempts to read the system-dependent data for a process' virtual memory
-// size and resident set size, and return the results in KB.
-//
-// On failure, returns 0, 0
-
-void process_mem_usage(unsigned long& vm_usage, long& resident_set)
-{
-    using std::ios_base;
-    using std::ifstream;
-    using std::string;
-
-    vm_usage = 0;
-    resident_set = 0;
-
-    // 'file' stat seems to give the most reliable results
-    //
-    ifstream stat_stream("/proc/self/stat",ios_base::in);
-
-    // dummy vars for leading entries in stat that we don't care about
-    //
-    string pid, comm, state, ppid, pgrp, session, tty_nr;
-    string tpgid, flags, minflt, cminflt, majflt, cmajflt;
-    string utime, stime, cutime, cstime, priority, nice;
-    string O, itrealvalue, starttime;
-
-    // the two fields we want
-    //
-    unsigned long vsize;
-    long rss;
-
-    stat_stream >> pid >> comm >> state >> ppid >> pgrp >> session >> tty_nr
-                >> tpgid >> flags >> minflt >> cminflt >> majflt >> cmajflt
-                >> utime >> stime >> cutime >> cstime >> priority >> nice
-                >> O >> itrealvalue >> starttime >> vsize >> rss; // don't care about the rest
-
-    stat_stream.close();
-
-    long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages
-    vm_usage = vsize / 1024;
-    resident_set = rss * page_size_kb;
-}
-
-#endif /* MEMORY_HPP_ */
diff -Nru spades-3.10.1+dfsg/src/common/utils/memory_limit.hpp spades-3.11.1+dfsg/src/common/utils/memory_limit.hpp
--- spades-3.10.1+dfsg/src/common/utils/memory_limit.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/memory_limit.hpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,97 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#if __DARWIN || __DARWIN_UNIX03
-#include <mach/task.h>
-#include <mach/mach.h>
-#else
-
-#include <sys/resource.h>
-
-#endif
-
-#include
-#include
-
-#include "config.hpp"
-
-#ifdef SPADES_USE_JEMALLOC
-
-# include <jemalloc/jemalloc.h>
-
-#endif
-
-inline void limit_memory(size_t limit) {
-    rlimit rl;
-    if (sizeof(rlim_t) < 8) {
-        INFO("Can't limit virtual memory because of 32-bit system");
-        return;
-    }
-
-    int res = getrlimit(RLIMIT_AS, &rl);
-    VERIFY_MSG(res == 0,
-               "getrlimit(2) call failed, errno = " << errno);
-
-    // We cannot go beyond hard limit and we might not have enough privileges to
-    // increase the hard limit
-    rl.rlim_cur = std::min(limit, rl.rlim_max);
-    res = setrlimit(RLIMIT_AS, &rl);
-    VERIFY_MSG(res == 0,
-               "setrlimit(2) call failed, errno = " << errno);
-    INFO("Memory limit set to " << (1.0 * (double) rl.rlim_cur / 1024 / 1024 / 1024) << " Gb");
-}
-
-inline size_t get_memory_limit() {
-    rlimit rl;
-    int res = getrlimit(RLIMIT_AS, &rl);
-    VERIFY_MSG(res == 0,
-               "getrlimit(2) call failed, errno = " << errno);
-
-    return rl.rlim_cur;
-}
-
-#if __DARWIN || __DARWIN_UNIX03
-inline size_t get_max_rss() {
-    struct task_basic_info t_info;
-    mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT;
-
-    if (KERN_SUCCESS !=
-        task_info(mach_task_self(),
-                  TASK_BASIC_INFO, (task_info_t)&t_info, &t_info_count))
-        return -1U;
-
-    return t_info.resident_size / 1024;
-}
-#else
-
-inline size_t get_max_rss() {
-    rusage ru;
-    getrusage(RUSAGE_SELF, &ru);
-
-    return ru.ru_maxrss;
-}
-
-#endif
-
-inline size_t get_used_memory() {
-#ifdef SPADES_USE_JEMALLOC
-    const size_t *cmem = 0;
-    size_t clen = sizeof(cmem);
-
-    je_mallctl("stats.cactive", &cmem, &clen, NULL, 0);
-    return *cmem;
-#else
-    get_max_rss();
-#endif
-}
-
-
-inline size_t get_free_memory() {
-    return get_memory_limit() - get_used_memory();
-}
diff -Nru spades-3.10.1+dfsg/src/common/utils/mph_index/base_hash.hpp spades-3.11.1+dfsg/src/common/utils/mph_index/base_hash.hpp
--- spades-3.10.1+dfsg/src/common/utils/mph_index/base_hash.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/mph_index/base_hash.hpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,293 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#include "common.hpp"
-
-namespace emphf {
-
-    inline uint64_t unaligned_load64(uint8_t const* from)
-    {
-        uint64_t tmp;
-        memcpy(reinterpret_cast<char*>(&tmp), from, 8);
-        // XXX(ot): reverse bytes in big-endian architectures
-        return tmp;
-    }
-
-
-    struct jenkins64_hasher {
-
-        typedef uint64_t seed_t;
-        typedef uint64_t hash_t;
-        typedef std::tuple<hash_t, hash_t, hash_t> hash_triple_t;
-
-        jenkins64_hasher()
-        {}
-
-        jenkins64_hasher(uint64_t seed)
-            : m_seed(seed)
-        {}
-
-        template <typename Rng>
-        static jenkins64_hasher generate(Rng& rng)
-        {
-            return jenkins64_hasher(rng());
-        }
-
-        // Adapted from http://www.burtleburtle.net/bob/c/lookup8.c
-        hash_triple_t operator()(byte_range_t s) const
-        {
-            using std::get;
-            hash_triple_t h(m_seed, m_seed, 0x9e3779b97f4a7c13ULL);
-
-            size_t len = (size_t)(s.second - s.first);
-            uint8_t const* cur = s.first;
-            uint8_t const* end = s.second;
-
-            while (end - cur >= 24) {
-                get<0>(h) += unaligned_load64(cur);
-                cur += 8;
-                get<1>(h) += unaligned_load64(cur);
-                cur += 8;
-                get<2>(h) += unaligned_load64(cur);
-                cur += 8;
-
-                mix(h);
-            }
-
-            get<2>(h) += len;
-
-            switch (end - cur) {
-            case 23: get<2>(h) += (uint64_t(cur[22]) << 56);
-            case 22: get<2>(h) += (uint64_t(cur[21]) << 48);
-            case 21: get<2>(h) += (uint64_t(cur[20]) << 40);
-            case 20: get<2>(h) += (uint64_t(cur[19]) << 32);
-            case 19: get<2>(h) += (uint64_t(cur[18]) << 24);
-            case 18: get<2>(h) += (uint64_t(cur[17]) << 16);
-            case 17: get<2>(h) += (uint64_t(cur[16]) << 8);
-            // the first byte of c is reserved for the length
-            case 16: get<1>(h) += (uint64_t(cur[15]) << 56);
-            case 15: get<1>(h) += (uint64_t(cur[14]) << 48);
-            case 14: get<1>(h) += (uint64_t(cur[13]) << 40);
-            case 13: get<1>(h) += (uint64_t(cur[12]) << 32);
-            case 12: get<1>(h) += (uint64_t(cur[11]) << 24);
-            case 11: get<1>(h) += (uint64_t(cur[10]) << 16);
-            case 10: get<1>(h) += (uint64_t(cur[ 9]) << 8);
-            case 9:  get<1>(h) += (uint64_t(cur[ 8]));
-            case 8:  get<0>(h) += (uint64_t(cur[ 7]) << 56);
-            case 7:  get<0>(h) += (uint64_t(cur[ 6]) << 48);
-            case 6:  get<0>(h) += (uint64_t(cur[ 5]) << 40);
-            case 5:  get<0>(h) += (uint64_t(cur[ 4]) << 32);
-            case 4:  get<0>(h) += (uint64_t(cur[ 3]) << 24);
-            case 3:  get<0>(h) += (uint64_t(cur[ 2]) << 16);
-            case 2:  get<0>(h) += (uint64_t(cur[ 1]) << 8);
-            case 1:  get<0>(h) += (uint64_t(cur[ 0]));
-            case 0: break; // nothing to add
-            default: assert(false);
-            }
-
-            mix(h);
-
-            return h;
-        }
-
-        // rehash a hash triple
-        hash_triple_t operator()(hash_triple_t h) const
-        {
-            std::get<0>(h) += m_seed;
-            std::get<1>(h) += m_seed;
-            std::get<2>(h) += 0x9e3779b97f4a7c13ULL;
-
-            mix(h);
-
-            return h;
-        }
-
-        void swap(jenkins64_hasher& other)
-        {
-            std::swap(m_seed, other.m_seed);
-        }
-
-        void save(std::ostream& os) const
-        {
-            os.write(reinterpret_cast<char const*>(&m_seed), sizeof(m_seed));
-        }
-
-        void load(std::istream& is)
-        {
-            is.read(reinterpret_cast<char*>(&m_seed), sizeof(m_seed));
-        }
-
-        seed_t seed() const
-        {
-            return m_seed;
-        }
-
-    protected:
-
-        static void mix(hash_triple_t& h)
-        {
-            uint64_t& a = std::get<0>(h);
-            uint64_t& b = std::get<1>(h);
-            uint64_t& c = std::get<2>(h);
-
-            a -= b; a -= c; a ^= (c >> 43);
-            b -= c; b -= a; b ^= (a << 9);
-            c -= a; c -= b; c ^= (b >> 8);
-            a -= b; a -= c; a ^= (c >> 38);
-            b -= c; b -= a; b ^= (a << 23);
-            c -= a; c -= b; c ^= (b >> 5);
-            a -= b; a -= c; a ^= (c >> 35);
-            b -= c; b -= a; b ^= (a << 49);
-            c -= a; c -= b; c ^= (b >> 11);
-            a -= b; a -= c; a ^= (c >> 12);
-            b -= c; b -= a; b ^= (a << 18);
-            c -= a; c -= b; c ^= (b >> 22);
-        }
-
-        seed_t m_seed;
-    };
-
-
-    // This is basically a wrapper to jenkins64_hasher that uses a
-    // 32-bit seed and returns 32-bit hashes by truncation
-    struct jenkins32_hasher {
-
-        typedef uint32_t seed_t;
-        typedef uint32_t hash_t;
-        typedef std::tuple<hash_t, hash_t, hash_t> hash_triple_t;
-
-        jenkins32_hasher()
-        {}
-
-        jenkins32_hasher(uint32_t seed)
-            : m_seed(seed)
-        {}
-
-        template <typename Rng>
-        static jenkins32_hasher generate(Rng& rng)
-        {
-            return jenkins32_hasher((uint32_t)rng());
-        }
-
-        hash_triple_t operator()(byte_range_t s) const
-        {
-            using std::get;
-            auto h64 = jenkins64_hasher(seed64())(s);
-            return hash_triple_t((uint32_t)get<0>(h64),
-                                 (uint32_t)get<1>(h64),
-                                 (uint32_t)get<2>(h64));
-        }
-
-        hash_triple_t operator()(hash_triple_t h) const
-        {
-            using std::get;
-            auto h64 = jenkins64_hasher::hash_triple_t(get<0>(h),
-                                                       get<1>(h),
-                                                       get<2>(h));
-            h64 = jenkins64_hasher(seed64())(h64);
-            return hash_triple_t((uint32_t)get<0>(h64),
-                                 (uint32_t)get<1>(h64),
-                                 (uint32_t)get<2>(h64));
-        }
-
-        void swap(jenkins32_hasher& other)
-        {
-            std::swap(m_seed, other.m_seed);
-        }
-
-        void save(std::ostream& os) const
-        {
-            os.write(reinterpret_cast<char const*>(&m_seed), sizeof(m_seed));
-        }
-
-        void load(std::istream& is)
-        {
-            is.read(reinterpret_cast<char*>(&m_seed), sizeof(m_seed));
-        }
-
-        seed_t seed() const
-        {
-            return m_seed;
-        }
-
-    protected:
-
-        uint64_t seed64() const
-        {
-            return (uint64_t(m_seed) << 32) | m_seed;
-        }
-
-        seed_t m_seed;
-
-    };
-
-
-    struct city_hasher {
-        typedef uint64_t seed_t;
-        typedef uint64_t hash_t;
-        typedef std::tuple<hash_t, hash_t, hash_t> hash_triple_t;
-
-        city_hasher()
-        {}
-
-        city_hasher(uint64_t seed)
-            : m_seed(seed)
-        {}
-
-        template <typename Rng>
-        static city_hasher generate(Rng& rng) {
-            return city_hasher(rng());
-        }
-
-        hash_triple_t operator()(byte_range_t s) const {
-            city_uint128 ch = CityHash128WithSeed((char*)s.first, s.second - s.first, {m_seed, 0x9e3779b97f4a7c13ULL});
-            hash_triple_t h(ch.first, 0x9e3779b97f4a7c13ULL, ch.second);
-            mix(h);
-
-            return h;
-        }
-
-        void swap(city_hasher& other) {
-            std::swap(m_seed, other.m_seed);
-        }
-
-        void save(std::ostream& os) const {
-            os.write(reinterpret_cast<char const*>(&m_seed), sizeof(m_seed));
-        }
-
-        void load(std::istream& is) {
-            is.read(reinterpret_cast<char*>(&m_seed), sizeof(m_seed));
-        }
-
-        seed_t seed() const {
-            return m_seed;
-        }
-
-    protected:
-        seed_t m_seed;
-
-        static void mix(hash_triple_t& h) {
-            uint64_t& a = std::get<0>(h);
-            uint64_t& b = std::get<1>(h);
-            uint64_t& c = std::get<2>(h);
-
-            a -= b; a -= c; a ^= (c >> 43);
-            b -= c; b -= a; b ^= (a << 9);
-            c -= a; c -= b; c ^= (b >> 8);
-            a -= b; a -= c; a ^= (c >> 38);
-            b -= c; b -= a; b ^= (a << 23);
-            c -= a; c -= b; c ^= (b >> 5);
-            a -= b; a -= c; a ^= (c >> 35);
-            b -= c; b -= a; b ^= (a << 49);
-            c -= a; c -= b; c ^= (b >> 11);
-            a -= b; a -= c; a ^= (c >> 12);
-            b -= c; b -= a; b ^= (a << 18);
-            c -= a; c -= b; c ^= (b >> 22);
-        }
-    };
-
-}
diff -Nru spades-3.10.1+dfsg/src/common/utils/mph_index/bitpair_vector.cpp spades-3.11.1+dfsg/src/common/utils/mph_index/bitpair_vector.cpp
--- spades-3.10.1+dfsg/src/common/utils/mph_index/bitpair_vector.cpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/mph_index/bitpair_vector.cpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,77 +0,0 @@
-//
-// Created by anton on 3/22/16.
-//
-
-#include "bitpair_vector.hpp"
-
-#include
-
-void emphf::bitpair_vector::resize(uint64_t n) {
-    // can only grow, for now
-    assert(n >= size());
-    m_size = n;
-    m_bits.resize((m_size + 31) / 32);
-}
-
-size_t emphf::bitpair_vector::size() const {
-    return m_size;
-}
-
-size_t emphf::bitpair_vector::mem_size() const {
-    return m_bits.size() * sizeof(m_bits[0]);
-}
-
-uint64_t emphf::bitpair_vector::operator[](uint64_t pos) const {
-    return (m_bits[pos / 32] >> ((pos % 32) * 2)) % 4;
-}
-
-void emphf::bitpair_vector::set(uint64_t pos, uint64_t val) {
-    assert(val < 4);
-    uint64_t word_pos = pos / 32;
-    uint64_t word_offset = (pos % 32) * 2;
-    m_bits[word_pos] &= ~(3ULL << word_offset);
-    m_bits[word_pos] |= val << word_offset;
-}
-
-uint64_t emphf::bitpair_vector::range_nonzeros(uint64_t begin, uint64_t end) const {
-    assert(begin <= end);
-    assert(end <= size());
-
-    uint64_t word_begin = begin / 32;
-    uint64_t offset_begin = (begin % 32) * 2;
-    uint64_t word_end = end / 32;
-    uint64_t offset_end = (end % 32) * 2;
-    uint64_t r = 0;
-
-    uint64_t word = (m_bits[word_begin] >> offset_begin) << offset_begin;
-    for (uint64_t w = word_begin; w < word_end; ++w) {
-        r += nonzero_pairs(word);
-        word = m_bits[w + 1];
-    }
-
-    uint64_t mask = (uint64_t(1) << offset_end) - 1;
-    r += nonzero_pairs(word & mask);
-
-    return r;
-}
-
-void emphf::bitpair_vector::swap(bitpair_vector& other) {
-    std::swap(m_size, other.m_size);
-    m_bits.swap(other.m_bits);
-}
-
-
-void emphf::bitpair_vector::save(std::ostream& os) const {
-    os.write(reinterpret_cast<char const*>(&m_size), sizeof(m_size));
-    os.write(reinterpret_cast<char const*>(m_bits.data()), (std::streamsize)(sizeof(m_bits[0]) * m_bits.size()));
-}
-
-void emphf::bitpair_vector::load(std::istream& is) {
-    is.read(reinterpret_cast<char*>(&m_size), sizeof(m_size));
-    m_bits.resize((m_size + 31) / 32);
-    is.read(reinterpret_cast<char*>(m_bits.data()), (std::streamsize)(sizeof(m_bits[0]) * m_bits.size()));
-}
-
-std::vector<uint64_t> const &emphf::bitpair_vector::data() const {
-    return m_bits;
-}
diff -Nru spades-3.10.1+dfsg/src/common/utils/mph_index/bitpair_vector.hpp spades-3.11.1+dfsg/src/common/utils/mph_index/bitpair_vector.hpp
--- spades-3.10.1+dfsg/src/common/utils/mph_index/bitpair_vector.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/mph_index/bitpair_vector.hpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,27 +0,0 @@
-#pragma once
-
-#include "common.hpp"
-#include <vector>
-
-namespace emphf {
-
-    class bitpair_vector {
-    public:
-        bitpair_vector(): m_size(0) {}
-        bitpair_vector(uint64_t n): m_size(0){resize(n);}
-        void resize(uint64_t n);
-        size_t size() const;
-        size_t mem_size() const;
-        uint64_t operator[](uint64_t pos) const;
-        void set(uint64_t pos, uint64_t val);
-        uint64_t range_nonzeros(uint64_t begin, uint64_t end) const;
-        void swap(bitpair_vector& other);
-        void save(std::ostream& os) const;
-        void load(std::istream& is);
-        std::vector<uint64_t> const & data() const;
-    protected:
-        std::vector<uint64_t> m_bits;
-        uint64_t m_size;
-    };
-
-}
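The removed bitpair_vector packed 32 two-bit values into each uint64_t word: position p lives in word p / 32 at bit offset (p % 32) * 2, which is exactly what operator[] and set() compute above. A self-contained restatement of that arithmetic:

    #include <cstdint>
    #include <vector>

    // E.g. p = 37 -> word 1, shift (37 % 32) * 2 = 10, value = (word >> 10) & 3.
    uint64_t get2(const std::vector<uint64_t> &bits, uint64_t pos) {
        return (bits[pos / 32] >> ((pos % 32) * 2)) & 3;  // '& 3' is the same as the '% 4' above
    }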
diff -Nru spades-3.10.1+dfsg/src/common/utils/mph_index/CMakeLists.txt spades-3.11.1+dfsg/src/common/utils/mph_index/CMakeLists.txt
--- spades-3.10.1+dfsg/src/common/utils/mph_index/CMakeLists.txt	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/mph_index/CMakeLists.txt	1970-01-01 00:00:00.000000000 +0000
@@ -1,13 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(mph_index CXX)
-
-add_library(mph_index STATIC bitpair_vector.cpp)
-
-target_link_libraries(mph_index cityhash)
-
diff -Nru spades-3.10.1+dfsg/src/common/utils/mph_index/common.hpp spades-3.11.1+dfsg/src/common/utils/mph_index/common.hpp
--- spades-3.10.1+dfsg/src/common/utils/mph_index/common.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/mph_index/common.hpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,66 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-#include
-
-#include "emphf_config.hpp"
-
-namespace emphf {
-
-    template <typename Iterator>
-    struct iter_range
-    {
-        iter_range(Iterator b, Iterator e)
-            : m_begin(b)
-            , m_end(e)
-        {}
-
-        Iterator begin() const
-        { return m_begin; }
-
-        Iterator end() const
-        { return m_end; }
-
-        Iterator m_begin, m_end;
-    };
-
-    typedef std::pair<uint8_t const*, uint8_t const*> byte_range_t;
-
-    struct identity_adaptor
-    {
-        byte_range_t operator()(byte_range_t s) const
-        {
-            return s;
-        }
-    };
-
-    template <typename Iterator>
-    iter_range<Iterator> range(Iterator begin, Iterator end)
-    {
-        return iter_range<Iterator>(begin, end);
-    }
-
-    inline uint64_t nonzero_pairs(uint64_t x)
-    {
-        static const uint64_t ones_step_4 = 0x1111111111111111ULL;
-        x = (x | (x >> 1)) & (0x5 * ones_step_4);
-
-#if EMPHF_USE_POPCOUNT
-        return (uint64_t)__builtin_popcountll(x);
-#else
-        static const uint64_t ones_step_8 = 0x0101010101010101ULL;
-        x = (x & 3 * ones_step_4) + ((x >> 2) & 3 * ones_step_4);
-        x = (x + (x >> 4)) & 0x0f * ones_step_8;
-        return (x * ones_step_8) >> 56;
-#endif
-    }
-
-    inline uint64_t msb(uint64_t x)
-    {
-        assert(x);
-        return 63 - __builtin_clzll(x);
-    }
-
-}
diff -Nru spades-3.10.1+dfsg/src/common/utils/mph_index/emphf_config.hpp spades-3.11.1+dfsg/src/common/utils/mph_index/emphf_config.hpp
--- spades-3.10.1+dfsg/src/common/utils/mph_index/emphf_config.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/mph_index/emphf_config.hpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,6 +0,0 @@
-#pragma once
-
-#define EMPHF_USE_POPCOUNT 1
-#ifndef EMPHF_USE_POPCOUNT
-# define EMPHF_USE_POPCOUNT 0
-#endif
diff -Nru spades-3.10.1+dfsg/src/common/utils/mph_index/hypergraph.hpp spades-3.11.1+dfsg/src/common/utils/mph_index/hypergraph.hpp
--- spades-3.10.1+dfsg/src/common/utils/mph_index/hypergraph.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/mph_index/hypergraph.hpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,137 +0,0 @@
-#pragma once
-
-#include
-
-namespace emphf {
-
-    template <typename NodeType>
-    struct hypergraph {
-
-        typedef NodeType node_t; // last value is used as sentinel
-
-        struct hyperedge {
-            // deliberately do not initialize, to avoid polluting the
-            // page cache when initializing large mmapped arrays
-            hyperedge()
-            {}
-
-            hyperedge(NodeType v0_, NodeType v1_, NodeType v2_)
-                : v0(v0_)
-                , v1(v1_)
-                , v2(v2_)
-            {}
-
-            friend inline
-            std::ostream& operator<<(std::ostream& os, hyperedge const& t)
-            {
-                os << "("
-                   << t.v0 << ", "
-                   << t.v1 << ", "
-                   << t.v2 << ")";
-                return os;
-            }
-
-            friend inline
-            bool operator<(hyperedge const& lhs, hyperedge const& rhs)
-            {
-                return
-                    std::make_tuple(lhs.v0, lhs.v1, lhs.v2) <
-                    std::make_tuple(rhs.v0, rhs.v1, rhs.v2);
-            }
-
-            friend inline
-            bool operator==(hyperedge const& lhs, hyperedge const& rhs)
-            {
-                return
-                    lhs.v0 == rhs.v0 &&
-                    lhs.v1 == rhs.v1 &&
-                    lhs.v2 == rhs.v2;
-            }
-
-            friend inline
-            bool operator!=(hyperedge const& lhs, hyperedge const& rhs)
-            {
-                return !(lhs == rhs);
-            }
-
-            NodeType v0, v1, v2;
-        };
-
-        static hyperedge sentinel()
-        {
-            return hyperedge(-node_t(1), -node_t(1), -node_t(1));
-        }
-
-        struct xored_adj_list {
-            xored_adj_list(node_t degree_= 0, node_t v1s_ = 0, node_t v2s_ = 0)
-                : degree(degree_)
-                , v1s(v1s_)
-                , v2s(v2s_)
-            {}
-
-            void add_edge(hyperedge const& edge)
-            {
-                degree += 1;
-                xor_edge(edge);
-            }
-
-            void delete_edge(hyperedge const& edge)
-            {
-                assert(degree >= 1);
-                degree -= 1;
-                xor_edge(edge);
-            }
-
-            hyperedge edge_from(node_t v0) const
-            {
-                assert(degree == 1);
-                return hyperedge(v0, v1s, v2s);
-            }
-
-            node_t degree;
-            node_t v1s;
-            node_t v2s;
-
-        private:
-
-            void xor_edge(hyperedge const& edge)
-            {
-                assert(edge.v1 < edge.v2);
-                v1s ^= edge.v1;
-                v2s ^= edge.v2;
-            }
-
-        };
-    };
-
-    // a brief note about hyperedge orientations: throughout the
-    // code we keep the invariant that for every hyperedge (v0,
-    // v1, v2) it holds v1 < v2. This leaves only three
-    // orientations, which we index with 0, 1, and 2 depending on
-    // whether v0 is the first, second, or third smallest node. We
-    // call the 0-orientation "canonical".
-    template <typename HyperEdge>
-    static unsigned orientation(HyperEdge const& t)
-    {
-        // although it should be v0 < v1 < v2, sometimes we
-        // compare sentinel edges
-        assert(t.v1 <= t.v2);
-        return (t.v0 > t.v1) + (t.v0 > t.v2);
-    }
-
-    template <typename HyperEdge>
-    static HyperEdge canonicalize_edge(HyperEdge t)
-    {
-        assert(t.v1 <= t.v2);
-        if (t.v0 > t.v2) {
-            std::swap(t.v0, t.v2);
-        }
-
-        if (t.v0 > t.v1) {
-            std::swap(t.v0, t.v1);
-        }
-
-        assert(orientation(t) == 0);
-        return t;
-    }
-}
diff -Nru spades-3.10.1+dfsg/src/common/utils/mph_index/hypergraph_sorter_seq.hpp spades-3.11.1+dfsg/src/common/utils/mph_index/hypergraph_sorter_seq.hpp
--- spades-3.10.1+dfsg/src/common/utils/mph_index/hypergraph_sorter_seq.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/mph_index/hypergraph_sorter_seq.hpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,130 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "common.hpp"
-#include "hypergraph.hpp"
-
-#include "utils/logger/logger.hpp"
-
-namespace emphf {
-
-    template <typename HypergraphType>
-    class hypergraph_sorter_seq {
-    public:
-        typedef HypergraphType hg;
-        typedef typename hg::node_t node_t;
-        typedef typename hg::hyperedge hyperedge;
-        typedef typename hg::xored_adj_list xored_adj_list;
-
-        hypergraph_sorter_seq()
-        {}
-
-        template <typename Range, typename EdgeGenerator>
-        bool try_generate_and_sort(Range const& input_range,
-                                   EdgeGenerator const& edge_gen,
-                                   size_t n,
-                                   size_t hash_domain,
-                                   bool verbose = true)
-        {
-            using std::get;
-            std::vector<xored_adj_list> adj_lists;
-
-            size_t m = hash_domain * 3;
-
-            // do all the allocations upfront
-            m_peeling_order.clear();
-            m_peeling_order.reserve(n);
-            adj_lists.resize(m);
-
-            // generate edges
-            if (verbose) {
-                //logger() << "Generating hyperedges and populating adjacency lists"
-                //         << std::endl;
-            }
-
-            for (auto const& val: input_range) {
-                auto edge = edge_gen(val);
-                // canonical by construction
-                assert(orientation(edge) == 0);
-
-                adj_lists[edge.v0].add_edge(edge);
-
-                std::swap(edge.v0, edge.v1);
-                adj_lists[edge.v0].add_edge(edge);
-
-                std::swap(edge.v0, edge.v2);
-                adj_lists[edge.v0].add_edge(edge);
-            }
-
-            // peel
-            if (verbose) {
-                // logger() << "Peeling" << std::endl;
-            }
-
-            auto visit = [&](node_t v0) {
- if (adj_lists[v0].degree == 1) { - auto edge = adj_lists[v0].edge_from(v0); - m_peeling_order.push_back(edge); - - edge = canonicalize_edge(edge); - adj_lists[edge.v0].delete_edge(edge); - - std::swap(edge.v0, edge.v1); - adj_lists[edge.v0].delete_edge(edge); - - std::swap(edge.v0, edge.v2); - adj_lists[edge.v0].delete_edge(edge); - } - }; - - size_t queue_position = 0; - for (node_t v0 = 0; v0 < m; ++v0) { - visit(v0); - - while (queue_position < m_peeling_order.size()) { - auto const& cur_edge = m_peeling_order[queue_position]; - - visit(cur_edge.v1); - visit(cur_edge.v2); - queue_position += 1; - } - } - - if (m_peeling_order.size() < n) { - if (verbose) { - // logger() << "Hypergraph is not peelable: " - // << (n - m_peeling_order.size()) << " edges remaining" - // << std::endl; - } - return false; - } - - assert(m_peeling_order.size() == n); - - return true; - } - - typedef typename std::vector::const_reverse_iterator - peeling_iterator; - - std::pair - get_peeling_order() const - { - return std::make_pair(m_peeling_order.crbegin(), - m_peeling_order.crend()); - } - - private: - - size_t m_hash_domain; - std::vector m_peeling_order; - }; -} diff -Nru spades-3.10.1+dfsg/src/common/utils/mph_index/kmer_index_builder.hpp spades-3.11.1+dfsg/src/common/utils/mph_index/kmer_index_builder.hpp --- spades-3.10.1+dfsg/src/common/utils/mph_index/kmer_index_builder.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/mph_index/kmer_index_builder.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,486 +0,0 @@ -#pragma once -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
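For context on the emphf machinery removed here: try_generate_and_sort() peels a random tripartite 3-hypergraph by repeatedly deleting edges incident to degree-1 vertices, and xored_adj_list makes that cheap by storing, per vertex, only a degree counter plus the XOR of the other two endpoints across all incident edges. Once the degree drops to 1, the two accumulators hold exactly the one remaining edge. A minimal standalone sketch of that trick (hypothetical names, not the emphf API):

    #include <cassert>
    #include <cstdint>

    struct Edge { uint64_t v0, v1, v2; };

    struct XoredAdj {
        uint64_t degree = 0, x1 = 0, x2 = 0;
        void add(const Edge &e)    { degree += 1; x1 ^= e.v1; x2 ^= e.v2; }
        void remove(const Edge &e) { degree -= 1; x1 ^= e.v1; x2 ^= e.v2; }
        Edge last_edge_from(uint64_t v0) const {   // valid only at degree 1
            assert(degree == 1);
            return Edge{v0, x1, x2};
        }
    };

    int main() {
        XoredAdj a;
        a.add(Edge{7, 10, 20});
        a.add(Edge{7, 11, 22});
        a.remove(Edge{7, 10, 20});        // peel the first edge away
        Edge e = a.last_edge_from(7);     // XORs collapse to the survivor
        assert(e.v1 == 11 && e.v2 == 22);
    }

Replaying the recorded peeling order backwards is what later lets mphf.hpp give every edge a private representative vertex.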
-//*************************************************************************** - -#include "kmer_index.hpp" - -#include "io/kmers/mmapped_reader.hpp" -#include "io/kmers/mmapped_writer.hpp" -#include "common/adt/pointer_iterator.hpp" -#include "common/adt/kmer_vector.hpp" - -#include "utils/openmp_wrapper.h" - -#include "utils/logger/logger.hpp" -#include "utils/path_helper.hpp" - -#include "utils/memory_limit.hpp" -#include "utils/file_limit.hpp" - -#include "adt/iterator_range.hpp" -#include "adt/loser_tree.hpp" - -#include "mphf.hpp" -#include "base_hash.hpp" -#include "hypergraph.hpp" -#include "hypergraph_sorter_seq.hpp" - -#include - -#include -#ifdef USE_GLIBCXX_PARALLEL -#include -#endif - -#include "config.hpp" - -#ifdef SPADES_USE_JEMALLOC -# include -#endif - -#include -#include -#include - -template -class KMerSplitter { - public: - typedef typename Seq::hash hash_function; - - KMerSplitter(const std::string &work_dir, unsigned K, uint32_t seed = 0) - : work_dir_(work_dir), K_(K), seed_(seed) {} - - virtual ~KMerSplitter() {} - - virtual path::files_t Split(size_t num_files) = 0; - - size_t kmer_size() const { - return Seq::GetDataSize(K_) * sizeof(typename Seq::DataType); - } - - unsigned K() const { return K_; } - - protected: - const std::string &work_dir_; - hash_function hash_; - unsigned K_; - uint32_t seed_; - - DECL_LOGGER("K-mer Splitting"); -}; - -template -class KMerSortingSplitter : public KMerSplitter { - public: - KMerSortingSplitter(const std::string &work_dir, unsigned K, uint32_t seed = 0) - : KMerSplitter(work_dir, K, seed), cell_size_(0), num_files_(0) {} - - protected: - using SeqKMerVector = KMerVector; - using KMerBuffer = std::vector; - - std::vector kmer_buffers_; - size_t cell_size_; - size_t num_files_; - - path::files_t PrepareBuffers(size_t num_files, unsigned nthreads, size_t reads_buffer_size) { - num_files_ = num_files; - - // Determine the set of output files - path::files_t out; - for (unsigned i = 0; i < num_files_; ++i) - out.push_back(this->GetRawKMersFname(i)); - - size_t file_limit = num_files_ + 2*nthreads; - size_t res = limit_file(file_limit); - if (res < file_limit) { - WARN("Failed to setup necessary limit for number of open files. 
The process might crash later on."); - WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit"); - } - - if (reads_buffer_size == 0) { - reads_buffer_size = 536870912ull; - size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3)); - INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb"); - reads_buffer_size = std::min(reads_buffer_size, mem_limit); - } - cell_size_ = reads_buffer_size / (num_files_ * this->kmer_size()); - // Set sane minimum cell size - if (cell_size_ < 16384) - cell_size_ = 16384; - - INFO("Using cell size of " << cell_size_); - kmer_buffers_.resize(nthreads); - for (unsigned i = 0; i < nthreads; ++i) { - KMerBuffer &entry = kmer_buffers_[i]; - entry.resize(num_files_, KMerVector(this->K_, (size_t) (1.1 * (double) cell_size_))); - } - - return out; - } - - bool push_back_internal(const Seq &seq, unsigned thread_id) { - KMerBuffer &entry = kmer_buffers_[thread_id]; - - size_t idx = this->GetFileNumForSeq(seq, (unsigned)num_files_); - entry[idx].push_back(seq); - return entry[idx].size() > cell_size_; - } - - void DumpBuffers(const path::files_t &ostreams) { - VERIFY(ostreams.size() == num_files_ && kmer_buffers_[0].size() == num_files_); - -# pragma omp parallel for - for (unsigned k = 0; k < num_files_; ++k) { - // Below k is thread id! - - size_t sz = 0; - for (size_t i = 0; i < kmer_buffers_.size(); ++i) - sz += kmer_buffers_[i][k].size(); - - KMerVector SortBuffer(this->K_, sz); - for (auto & entry : kmer_buffers_) { - const auto &buffer = entry[k]; - for (size_t j = 0; j < buffer.size(); ++j) - SortBuffer.push_back(buffer[j]); - } - libcxx::sort(SortBuffer.begin(), SortBuffer.end(), typename KMerVector::less2_fast()); - auto it = std::unique(SortBuffer.begin(), SortBuffer.end(), typename KMerVector::equal_to()); - -# pragma omp critical - { - size_t cnt = it - SortBuffer.begin(); - - // Write k-mers - FILE *f = fopen(ostreams[k].c_str(), "ab"); - VERIFY_MSG(f, "Cannot open temporary file to write"); - fwrite(SortBuffer.data(), SortBuffer.el_data_size(), cnt, f); - fclose(f); - - // Write index - f = fopen((ostreams[k] + ".idx").c_str(), "ab"); - VERIFY_MSG(f, "Cannot open temporary file to write"); - fwrite(&cnt, sizeof(cnt), 1, f); - fclose(f); - } - } - - for (auto & entry : kmer_buffers_) - for (auto & eentry : entry) - eentry.clear(); - } - - void ClearBuffers() { - for (auto & entry : kmer_buffers_) - for (auto & eentry : entry) { - eentry.clear(); - eentry.shrink_to_fit(); - } - } - - std::string GetRawKMersFname(unsigned suffix) const { - return path::append_path(this->work_dir_, "kmers.raw." 
+ std::to_string(suffix)); - } - - unsigned GetFileNumForSeq(const Seq &s, unsigned total) const { - return (unsigned)(this->hash_(s, this->seed_) % total); - } - -}; - -template > -class KMerCounter { - public: - typedef typename traits::raw_data_iterator iterator; - typedef typename traits::raw_data_const_iterator const_iterator; - typedef typename traits::RawKMerStorage RawKMerStorage; - typedef typename traits::FinalKMerStorage FinalKMerStorage; - - virtual size_t kmer_size() const = 0; - - virtual size_t Count(unsigned num_buckets, unsigned num_threads) = 0; - virtual size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) = 0; - virtual void MergeBuckets(unsigned num_buckets) = 0; - - virtual std::unique_ptr GetBucket(size_t idx, bool unlink = true) = 0; - virtual std::unique_ptr GetFinalKMers() = 0; - - virtual ~KMerCounter() {} - -protected: - DECL_LOGGER("K-mer Counting"); -}; - -template > -class KMerDiskCounter : public KMerCounter { - typedef KMerCounter __super; - typedef typename traits::RawKMerStorage BucketStorage; -public: - KMerDiskCounter(const std::string &work_dir, KMerSplitter &splitter) - : work_dir_(work_dir), splitter_(splitter) { - std::string prefix = path::append_path(work_dir, "kmers_XXXXXX"); - char *tempprefix = strcpy(new char[prefix.length() + 1], prefix.c_str()); - VERIFY_MSG(-1 != (fd_ = ::mkstemp(tempprefix)), "Cannot create temporary file"); - kmer_prefix_ = tempprefix; - delete[] tempprefix; - } - - ~KMerDiskCounter() { - ::close(fd_); - ::unlink(kmer_prefix_.c_str()); - } - - size_t kmer_size() const override { - return Seq::GetDataSize(splitter_.K()) * sizeof(typename Seq::DataType); - } - - std::unique_ptr GetBucket(size_t idx, bool unlink = true) override { - unsigned K = splitter_.K(); - return std::unique_ptr(new BucketStorage(GetMergedKMersFname((unsigned)idx), Seq::GetDataSize(K), unlink)); - } - - size_t Count(unsigned num_buckets, unsigned num_threads) override { - unsigned K = splitter_.K(); - - // Split k-mers into buckets. - path::files_t raw_kmers = splitter_.Split(num_buckets * num_threads); - - INFO("Starting k-mer counting."); - size_t kmers = 0; -# pragma omp parallel for shared(raw_kmers) num_threads(num_threads) schedule(dynamic) reduction(+:kmers) - for (unsigned iFile = 0; iFile < raw_kmers.size(); ++iFile) { - kmers += MergeKMers(raw_kmers[iFile], GetUniqueKMersFname(iFile), K); - } - INFO("K-mer counting done. There are " << kmers << " kmers in total. 
"); - - INFO("Merging temporary buckets."); - for (unsigned i = 0; i < num_buckets; ++i) { - std::string ofname = GetMergedKMersFname(i); - std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary); - for (unsigned j = 0; j < num_threads; ++j) { - BucketStorage ins(GetUniqueKMersFname(i + j * num_buckets), Seq::GetDataSize(K), /* unlink */ true); - ofs.write((const char*)ins.data(), ins.data_size()); - } - } - - return kmers; - } - - void MergeBuckets(unsigned num_buckets) override { - unsigned K = splitter_.K(); - - INFO("Merging final buckets."); - - MMappedRecordArrayWriter os(GetFinalKMersFname(), Seq::GetDataSize(K)); - std::string ofname = GetFinalKMersFname(); - std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary); - for (unsigned j = 0; j < num_buckets; ++j) { - auto bucket = GetBucket(j, /* unlink */ true); - ofs.write((const char*)bucket->data(), bucket->data_size()); - } - ofs.close(); - } - - size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) override { - size_t kmers = Count(num_buckets, num_threads); - if (merge) - MergeBuckets(num_buckets); - - return kmers; - } - - std::unique_ptr GetFinalKMers() override { - unsigned K = splitter_.K(); - return std::unique_ptr(new typename __super::FinalKMerStorage(GetFinalKMersFname(), Seq::GetDataSize(K), /* unlink */ true)); - } - - std::string GetMergedKMersFname(unsigned suffix) const { - return kmer_prefix_ + ".merged." + std::to_string(suffix); - } - - std::string GetFinalKMersFname() const { - return kmer_prefix_ + ".final"; - } - -private: - std::string work_dir_; - KMerSplitter &splitter_; - int fd_; - std::string kmer_prefix_; - - std::string GetUniqueKMersFname(unsigned suffix) const { - return kmer_prefix_ + ".unique." + std::to_string(suffix); - } - - size_t MergeKMers(const std::string &ifname, const std::string &ofname, - unsigned K) { - MMappedRecordArrayReader ins(ifname, Seq::GetDataSize(K), /* unlink */ true); - - std::string IdxFileName = ifname + ".idx"; - if (FILE *f = fopen(IdxFileName.c_str(), "rb")) { - fclose(f); - MMappedRecordReader index(ifname + ".idx", true, -1ULL); - - // INFO("Total runs: " << index.size()); - - // Prepare runs - std::vector> ranges; - auto beg = ins.begin(); - for (size_t sz : index) { - auto end = std::next(beg, sz); - ranges.push_back(adt::make_range(beg, end)); - VERIFY(std::is_sorted(beg, end, array_less())); - beg = end; - } - - // Construct tree on top entries of runs - adt::loser_tree> tree(ranges); - - if (tree.empty()) { - FILE *g = fopen(ofname.c_str(), "ab"); - VERIFY_MSG(g, "Cannot open temporary file to write"); - fclose(g); - return 0; - } - - // Write it down! 
- KMerVector buf(K, 1024*1024); - auto pval = tree.pop(); - size_t total = 0; - while (!tree.empty()) { - buf.clear(); - for (size_t cnt = 0; cnt < buf.capacity() && !tree.empty(); ) { - auto cval = tree.pop(); - if (!array_equal_to()(pval, cval)) { - buf.push_back(pval); - pval = cval; - cnt += 1; - } - } - total += buf.size(); - - FILE *g = fopen(ofname.c_str(), "ab"); - VERIFY_MSG(g, "Cannot open temporary file to write"); - fwrite(buf.data(), buf.el_data_size(), buf.size(), g); - fclose(g); - } - - // Handle very last value - { - FILE *g = fopen(ofname.c_str(), "ab"); - VERIFY_MSG(g, "Cannot open temporary file to write"); - fwrite(pval.data(), pval.data_size(), 1, g); - fclose(g); - total += 1; - } - - return total; - } else { - // Sort the stuff - libcxx::sort(ins.begin(), ins.end(), array_less()); - - // FIXME: Use something like parallel version of unique_copy but with explicit - // resizing. - auto it = std::unique(ins.begin(), ins.end(), array_equal_to()); - - MMappedRecordArrayWriter os(ofname, Seq::GetDataSize(K)); - os.resize(it - ins.begin()); - std::copy(ins.begin(), it, os.begin()); - - return it - ins.begin(); - } - } -}; - -template -class KMerIndexBuilder { - typedef typename Index::KMerSeq Seq; - typedef typename Index::kmer_index_traits kmer_index_traits; - - std::string work_dir_; - unsigned num_buckets_; - unsigned num_threads_; - - public: - KMerIndexBuilder(const std::string &workdir, - unsigned num_buckets, unsigned num_threads) - : work_dir_(workdir), num_buckets_(num_buckets), num_threads_(num_threads) {} - size_t BuildIndex(Index &out, KMerCounter &counter, - bool save_final = false); - - unsigned num_buckets() const { return num_buckets_; } - - private: - - DECL_LOGGER("K-mer Index Building"); -}; - -template -size_t KMerIndexBuilder::BuildIndex(Index &index, KMerCounter &counter, - bool save_final) { - index.clear(); - - INFO("Building kmer index "); - - // First, count the unique k-mers - size_t kmers = counter.Count(num_buckets_, num_threads_); - - index.num_buckets_ = num_buckets_; - index.bucket_starts_.resize(num_buckets_ + 1); - index.index_ = new typename KMerIndex::KMerDataIndex[num_buckets_]; - - INFO("Building perfect hash indices"); - - // Index building requires up to 40 bytes per k-mer. Limit number of threads depending on the memory limit. 
- unsigned num_threads = num_threads_; -# ifdef SPADES_USE_JEMALLOC - const size_t *cmem = 0; - size_t clen = sizeof(cmem); - - je_mallctl("stats.cactive", &cmem, &clen, NULL, 0); - size_t bucket_size = (36 * kmers + kmers * counter.kmer_size()) / num_buckets_; - num_threads = std::min((unsigned) ((get_memory_limit() - *cmem) / bucket_size), num_threads); - if (num_threads < 1) - num_threads = 1; - if (num_threads < num_threads_) - WARN("Number of threads was limited down to " << num_threads << " in order to fit the memory limits during the index construction"); -# endif - -# pragma omp parallel for shared(index) num_threads(num_threads) - for (unsigned iFile = 0; iFile < num_buckets_; ++iFile) { - typename KMerIndex::KMerDataIndex &data_index = index.index_[iFile]; - auto bucket = counter.GetBucket(iFile, !save_final); - size_t sz = bucket->end() - bucket->begin(); - index.bucket_starts_[iFile + 1] = sz; - typename kmer_index_traits::KMerRawReferenceAdaptor adaptor; - size_t max_nodes = (size_t(std::ceil(double(sz) * 1.23)) + 2) / 3 * 3; - if (max_nodes >= uint64_t(1) << 32) { - emphf::hypergraph_sorter_seq > sorter; - typename KMerIndex::KMerDataIndex(sorter, - sz, emphf::range(bucket->begin(), bucket->end()), - adaptor).swap(data_index); - } else { - emphf::hypergraph_sorter_seq > sorter; - typename KMerIndex::KMerDataIndex(sorter, - sz, emphf::range(bucket->begin(), bucket->end()), - adaptor).swap(data_index); - } - } - - // Finally, record the sizes of buckets. - for (unsigned iFile = 1; iFile < num_buckets_; ++iFile) - index.bucket_starts_[iFile] += index.bucket_starts_[iFile - 1]; - - if (save_final) - counter.MergeBuckets(num_buckets_); - - double bits_per_kmer = 8.0 * (double)index.mem_size() / (double)kmers; - INFO("Index built. Total " << index.mem_size() << " bytes occupied (" << bits_per_kmer << " bits per kmer)."); - index.count_size(); - return kmers; -} diff -Nru spades-3.10.1+dfsg/src/common/utils/mph_index/kmer_index.hpp spades-3.11.1+dfsg/src/common/utils/mph_index/kmer_index.hpp --- spades-3.10.1+dfsg/src/common/utils/mph_index/kmer_index.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/mph_index/kmer_index.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,129 +0,0 @@ -#pragma once -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
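The file ending above implemented external-memory k-mer counting: KMerSortingSplitter hashes k-mers into per-file buckets, each bucket is sorted and deduplicated, and KMerDiskCounter merges the sorted runs. Because the bucket is a function of the hash alone, duplicates always meet in the same bucket and can be removed locally. The same split/sort/unique scheme in a toy in-memory form (illustrative names, not the SPAdes API):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
        std::vector<uint64_t> kmers = {42, 7, 42, 19, 7, 7, 100};
        const size_t num_buckets = 4;
        std::vector<std::vector<uint64_t>> buckets(num_buckets);

        for (uint64_t k : kmers)                  // split by hash (here: k itself)
            buckets[k % num_buckets].push_back(k);

        size_t unique = 0;
        for (auto &b : buckets) {                 // per-bucket sort + dedup
            std::sort(b.begin(), b.end());
            b.erase(std::unique(b.begin(), b.end()), b.end());
            unique += b.size();
        }
        std::cout << unique << " unique k-mers\n";  // prints: 4 unique k-mers
    }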
-//*************************************************************************** - -#include "mphf.hpp" -#include "base_hash.hpp" - -#include "kmer_index_traits.hpp" - -#include -#include - -template -class KMerIndexBuilder; - -template -class KMerIndex { - public: - typedef traits kmer_index_traits; - typedef typename traits::SeqType KMerSeq; - typedef typename traits::hash_function hash_function; - typedef typename traits::KMerRawData KMerRawData; - typedef typename traits::KMerRawReference KMerRawReference; - typedef size_t IdxType; - - private: - using KMerDataIndex = emphf::mphf; - typedef KMerIndex __self; - - public: - KMerIndex(): index_(NULL), num_buckets_(0), size_(0) {} - - KMerIndex(const KMerIndex&) = delete; - KMerIndex& operator=(const KMerIndex&) = delete; - - ~KMerIndex() { clear(); } - - void clear() { - num_buckets_ = 0; - bucket_starts_.clear(); - - delete[] index_; - index_ = NULL; - } - - size_t mem_size() { - size_t sz = 0; - for (size_t i = 0; i < num_buckets_; ++i) - sz += index_[i].mem_size(); - - return sz; - } - - void count_size() { - if (index_ == NULL) - return; - size_ = 0; - for (size_t i = 0; i < num_buckets_; i++) - size_ += index_[i].size(); - } - - size_t size() const { - return size_; - } - - size_t seq_idx(const KMerSeq &s) const { - size_t bucket = seq_bucket(s); - - return bucket_starts_[bucket] + - index_[bucket].lookup(s, typename traits::KMerSeqAdaptor()); - } - - size_t raw_seq_idx(const KMerRawReference data) const { - size_t bucket = raw_seq_bucket(data); - - return bucket_starts_[bucket] + - index_[bucket].lookup(data, typename traits::KMerRawReferenceAdaptor()); - } - - template - void serialize(Writer &os) const { - os.write((char*)&num_buckets_, sizeof(num_buckets_)); - for (size_t i = 0; i < num_buckets_; ++i) - index_[i].save(os); - os.write((char*)&bucket_starts_[0], (num_buckets_ + 1) * sizeof(bucket_starts_[0])); - } - - template - void deserialize(Reader &is) { - clear(); - - is.read((char*)&num_buckets_, sizeof(num_buckets_)); - - index_ = new KMerDataIndex[num_buckets_]; - for (size_t i = 0; i < num_buckets_; ++i) - index_[i].load(is); - - bucket_starts_.resize(num_buckets_ + 1); - is.read((char*)&bucket_starts_[0], (num_buckets_ + 1) * sizeof(bucket_starts_[0])); - count_size(); - } - - void swap(KMerIndex &other) { - std::swap(index_, other.index_); - std::swap(num_buckets_, other.num_buckets_); - std::swap(size_, other.size_); - std::swap(bucket_starts_, other.bucket_starts_); - } - - private: - KMerDataIndex *index_; - - size_t num_buckets_; - std::vector bucket_starts_; - size_t size_; - - size_t seq_bucket(const KMerSeq &s) const { - return hash_function()(s) % num_buckets_; - } - size_t raw_seq_bucket(const KMerRawReference data) const { - return hash_function()(data) % num_buckets_; - } - - friend class KMerIndexBuilder<__self>; -}; diff -Nru spades-3.10.1+dfsg/src/common/utils/mph_index/kmer_index_traits.hpp spades-3.11.1+dfsg/src/common/utils/mph_index/kmer_index_traits.hpp --- spades-3.10.1+dfsg/src/common/utils/mph_index/kmer_index_traits.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/mph_index/kmer_index_traits.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,92 +0,0 @@ -#pragma once -//*************************************************************************** -//* Copyright (c) 2016 Saint Petersburg State University -//* All Rights Reserved -//* See file LICENSE for details. 
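KMerIndex, removed just above, is a two-level lookup: an ordinary hash picks one of num_buckets_ buckets, the bucket's minimal perfect hash returns a local offset, and the prefix-sum array bucket_starts_ turns that into a global index. The arithmetic in miniature, with the per-bucket MPHF faked by a std::map (hypothetical code, not the SPAdes API):

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <vector>

    int main() {
        const uint64_t num_buckets = 2;
        std::vector<size_t> bucket_starts = {0, 3, 5};   // prefix sums of bucket sizes
        std::vector<std::map<uint64_t, size_t>> mphf(num_buckets);
        mphf[0] = {{10, 0}, {12, 1}, {14, 2}};           // local offsets, bucket 0
        mphf[1] = {{11, 0}, {13, 1}};                    // local offsets, bucket 1

        uint64_t kmer = 13;
        size_t bucket = kmer % num_buckets;                       // seq_bucket()
        size_t idx = bucket_starts[bucket] + mphf[bucket][kmer];  // seq_idx()
        std::cout << "global index of 13 is " << idx << "\n";     // prints 4
    }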
-//*************************************************************************** - -#include "io/kmers/mmapped_reader.hpp" -#include "mphf.hpp" - -template -struct kmer_index_traits { - typedef Seq SeqType; - typedef MMappedRecordArrayReader RawKMerStorage; - typedef MMappedRecordArrayReader FinalKMerStorage; - typedef typename RawKMerStorage::iterator raw_data_iterator; - typedef typename RawKMerStorage::const_iterator raw_data_const_iterator; - typedef typename RawKMerStorage::iterator::value_type KMerRawData; - typedef typename RawKMerStorage::iterator::reference KMerRawReference; - typedef typename RawKMerStorage::const_iterator::reference KMerRawConstReference; - - struct raw_equal_to { - bool operator()(const Seq &lhs, const KMerRawReference rhs) { - return (array_equal_to()(lhs.data(), lhs.data_size(), rhs)); - } - }; - - struct raw_create { - Seq operator()(unsigned K, const KMerRawReference kmer) { - return Seq(K, kmer.data()); - } - Seq operator()(unsigned K, const KMerRawConstReference kmer) { - return Seq(K, kmer.data()); - } - }; - - struct hash_function { - uint64_t operator()(const Seq &k) const{ - return typename Seq::hash()(k); - } - uint64_t operator()(const KMerRawReference k) const { - return typename Seq::hash()(k.data(), k.size()); - } - }; - - struct KMerRawReferenceAdaptor { - emphf::byte_range_t operator()(const KMerRawReference k) const { - const uint8_t * data = (const uint8_t*)k.data(); - return std::make_pair(data, data + k.data_size()); - } - }; - - struct KMerSeqAdaptor { - emphf::byte_range_t operator()(const Seq &k) const { - const uint8_t * data = (const uint8_t*)k.data(); - return std::make_pair(data, data + k.data_size() * sizeof(typename Seq::DataType)); - } - }; - - template - static void raw_serialize(Writer &writer, RawKMerStorage *data) { - size_t sz = data->data_size(), elcnt = data->elcnt(); - unsigned PageSize = getpagesize(); - writer.write((char*)&sz, sizeof(sz)); - writer.write((char*)&elcnt, sizeof(elcnt)); - // Make sure data is aligned to the page boundary - size_t cpos = writer.tellp(); - size_t pos = (cpos + PageSize - 1 + sizeof(size_t)) / PageSize * PageSize; - size_t off = pos - writer.tellp(); - writer.write((char*)&off, sizeof(off)); - writer.seekp(pos); - writer.write((char*)data->data(), data->data_size()); - } - - template - static void raw_serialize(Writer &writer, const std::unique_ptr &data) { - raw_serialize(writer, data.get()); - } - - template - static std::unique_ptr raw_deserialize(Reader &reader, const std::string &FileName) { - size_t sz, off, elcnt; - reader.read((char*)&sz, sizeof(sz)); - reader.read((char*)&elcnt, sizeof(elcnt)); - reader.read((char*)&off, sizeof(off)); - off -= sizeof(off); - off += reader.tellg(); - - return std::unique_ptr(new RawKMerStorage(FileName, elcnt, false, off, sz)); - } - -}; diff -Nru spades-3.10.1+dfsg/src/common/utils/mph_index/mphf.hpp spades-3.11.1+dfsg/src/common/utils/mph_index/mphf.hpp --- spades-3.10.1+dfsg/src/common/utils/mph_index/mphf.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/mph_index/mphf.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,136 +0,0 @@ -#pragma once - -#include - -#include "bitpair_vector.hpp" -#include "ranked_bitpair_vector.hpp" - -#include "utils/logger/logger.hpp" - -namespace emphf { - - template - class mphf { - public: - mphf() - {} - - template - mphf(HypergraphSorter& sorter, size_t n, - Range const& input_range, Adaptor adaptor, - double gamma = 1.23) - : m_n(n) - , m_hash_domain(std::max((size_t(std::ceil(double(m_n) 
* gamma)) + 2) / 3, size_t(2))) - { - typedef typename HypergraphSorter::node_t node_t; - typedef typename HypergraphSorter::hyperedge hyperedge; - typedef decltype(*std::begin(input_range)) value_type; - - size_t nodes_domain = m_hash_domain * 3; - - if (nodes_domain >= std::numeric_limits::max()) { - throw std::invalid_argument("Too many nodes for node_t"); - } - - auto edge_gen = [&](value_type s) { - using std::get; - auto hashes = m_hasher(adaptor(s)); - return hyperedge((node_t)(get<0>(hashes) % m_hash_domain), - (node_t)(m_hash_domain + - (get<1>(hashes) % m_hash_domain)), - (node_t)(2 * m_hash_domain + - (get<2>(hashes) % m_hash_domain))); - }; - - std::mt19937_64 rng(37); // deterministic seed - - for (size_t trial = 0; ; ++trial) { - //logger() << "Hypergraph generation: trial " << trial << std::endl; - - m_hasher = BaseHasher::generate(rng); - if (sorter.try_generate_and_sort(input_range, edge_gen, - m_n, m_hash_domain)) break; - } - - auto peeling_order = sorter.get_peeling_order(); - bitpair_vector bv(nodes_domain); - - //logger() << "Assigning values" << std::endl; - - for (auto edge = peeling_order.first; - edge != peeling_order.second; - ++edge) { - - uint64_t target = orientation(*edge); - uint64_t assigned = bv[edge->v1] + bv[edge->v2]; - - // "assigned values" must be nonzeros to be ranked, so - // if the result is 0 we assign 3 - bv.set(edge->v0, ((target - assigned + 9) % 3) ?: 3); - } - - m_bv.build(std::move(bv)); - } - - uint64_t size() const - { - return m_n; - } - - size_t mem_size() const { - return m_bv.mem_size(); - } - - BaseHasher const& base_hasher() const - { - return m_hasher; - } - - template - uint64_t lookup(const T &val, Adaptor adaptor) - { - using std::get; - auto hashes = m_hasher(adaptor(val)); - uint64_t nodes[3] = {get<0>(hashes) % m_hash_domain, - m_hash_domain + (get<1>(hashes) % m_hash_domain), - 2 * m_hash_domain + (get<2>(hashes) % m_hash_domain)}; - - uint64_t hidx = (m_bv[nodes[0]] + m_bv[nodes[1]] + m_bv[nodes[2]]) % 3; - return m_bv.rank(nodes[hidx]); - } - - void swap(mphf& other) - { - std::swap(m_n, other.m_n); - std::swap(m_hash_domain, other.m_hash_domain); - m_hasher.swap(other.m_hasher); - m_bv.swap(other.m_bv); - } - - void save(std::ostream& os) const - { - os.write(reinterpret_cast(&m_n), sizeof(m_n)); - os.write(reinterpret_cast(&m_hash_domain), - sizeof(m_hash_domain)); - m_hasher.save(os); - m_bv.save(os); - } - - void load(std::istream& is) - { - is.read(reinterpret_cast(&m_n), sizeof(m_n)); - is.read(reinterpret_cast(&m_hash_domain), - sizeof(m_hash_domain)); - m_hasher.load(is); - m_bv.load(is); - } - - - private: - - uint64_t m_n; - uint64_t m_hash_domain; - BaseHasher m_hasher; - ranked_bitpair_vector m_bv; - }; -} diff -Nru spades-3.10.1+dfsg/src/common/utils/mph_index/ranked_bitpair_vector.hpp spades-3.11.1+dfsg/src/common/utils/mph_index/ranked_bitpair_vector.hpp --- spades-3.10.1+dfsg/src/common/utils/mph_index/ranked_bitpair_vector.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/mph_index/ranked_bitpair_vector.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,91 +0,0 @@ -#pragma once - -#include - -#include "common.hpp" -#include "bitpair_vector.hpp" - -namespace emphf { - - class ranked_bitpair_vector { - public: - - ranked_bitpair_vector() - {} - - void build(bitpair_vector&& bv) - { - m_bv.swap(bv); - - uint64_t cur_rank = 0; - auto const& words = m_bv.data(); - for (size_t i = 0; i < words.size(); ++i) { - if (((i * 32) % pairs_per_block) == 0) { - m_block_ranks.push_back(cur_rank); 
-                }
-                cur_rank += nonzero_pairs(words[i]);
-            }
-        }
-
-        size_t size() const
-        {
-            return m_bv.size();
-        }
-
-        size_t mem_size() const {
-            return m_bv.mem_size() + m_block_ranks.size() * sizeof(m_block_ranks[0]);
-        }
-
-        uint64_t operator[](uint64_t pos) const
-        {
-            return m_bv[pos];
-        }
-
-        uint64_t rank(uint64_t pos) const
-        {
-            uint64_t word_idx = pos / 32;
-            uint64_t word_offset = pos % 32;
-            uint64_t block = pos / pairs_per_block;
-            uint64_t r = m_block_ranks[block];
-
-            for (uint64_t w = block * pairs_per_block / 32; w < word_idx; ++w) {
-                r += nonzero_pairs(m_bv.data()[w]);
-            }
-
-            uint64_t mask = (uint64_t(1) << (word_offset * 2)) - 1;
-            r += nonzero_pairs(m_bv.data()[word_idx] & mask);
-
-            return r;
-        }
-
-        void swap(ranked_bitpair_vector& other)
-        {
-            m_bv.swap(other.m_bv);
-            m_block_ranks.swap(other.m_block_ranks);
-        }
-
-        void save(std::ostream& os) const
-        {
-            m_bv.save(os);
-            assert(m_block_ranks.size() ==
-                   (m_bv.size() + pairs_per_block - 1) / pairs_per_block);
-            os.write(reinterpret_cast<const char*>(m_block_ranks.data()),
-                     (std::streamsize)(sizeof(m_block_ranks[0]) * m_block_ranks.size()));
-        }
-
-        void load(std::istream& is)
-        {
-            m_bv.load(is);
-            m_block_ranks.resize((m_bv.size() + pairs_per_block - 1) / pairs_per_block);
-            is.read(reinterpret_cast<char*>(m_block_ranks.data()),
-                    (std::streamsize)(sizeof(m_block_ranks[0]) * m_block_ranks.size()));
-        }
-
-    protected:
-
-        static const uint64_t pairs_per_block = 512;
-        bitpair_vector m_bv;
-        std::vector<uint64_t> m_block_ranks;
-    };
-
-}
diff -Nru spades-3.10.1+dfsg/src/common/utils/openmp_wrapper.h spades-3.11.1+dfsg/src/common/utils/openmp_wrapper.h
--- spades-3.10.1+dfsg/src/common/utils/openmp_wrapper.h 2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/openmp_wrapper.h 1970-01-01 00:00:00.000000000 +0000
@@ -1,26 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __OMP_WRAPPER_H__
-#define __OMP_WRAPPER_H__
-
-#ifdef _OPENMP
-# include <omp.h>
-#else
-/* Provide single-threaded stubs */
-# define omp_set_num_threads(x) ((void)(x))
-# define omp_get_max_threads() 1
-# define omp_get_thread_num() 0
-# define omp_get_num_threads() 1
-# define omp_lock_t size_t
-# define omp_init_lock(x) ((void)(x))
-# define omp_destroy_lock(x) ((void)(x))
-# define omp_set_lock(x) ((void)(x))
-# define omp_unset_lock(x) ((void)(x))
-#endif
-
-#endif /* __OMP_WRAPPER_H__ */
diff -Nru spades-3.10.1+dfsg/src/common/utils/parallel/openmp_wrapper.h spades-3.11.1+dfsg/src/common/utils/parallel/openmp_wrapper.h
--- spades-3.10.1+dfsg/src/common/utils/parallel/openmp_wrapper.h 1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/parallel/openmp_wrapper.h 2017-09-28 09:05:11.000000000 +0000
@@ -0,0 +1,26 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __OMP_WRAPPER_H__
+#define __OMP_WRAPPER_H__
+
+#ifdef _OPENMP
+# include <omp.h>
+#else
+/* Provide single-threaded stubs */
+# define omp_set_num_threads(x) ((void)(x))
+# define omp_get_max_threads() 1
+# define omp_get_thread_num() 0
+# define omp_get_num_threads() 1
+# define omp_lock_t size_t
+# define omp_init_lock(x) ((void)(x))
+# define omp_destroy_lock(x) ((void)(x))
+# define omp_set_lock(x) ((void)(x))
+# define omp_unset_lock(x) ((void)(x))
+#endif
+
+#endif /* __OMP_WRAPPER_H__ */
diff -Nru spades-3.10.1+dfsg/src/common/utils/parallel/parallel_wrapper.hpp spades-3.11.1+dfsg/src/common/utils/parallel/parallel_wrapper.hpp
--- spades-3.10.1+dfsg/src/common/utils/parallel/parallel_wrapper.hpp 1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/parallel/parallel_wrapper.hpp 2017-09-28 09:05:11.000000000 +0000
@@ -0,0 +1,43 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * parallel_wrapper.hpp
+ *
+ *  Created on: Dec 14, 2013
+ *      Author: anton
+ */
+
+#pragma once
+#ifdef USE_GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+#else
+#include <algorithm>
+#endif
+
+namespace parallel {
+#ifdef USE_GLIBCXX_PARALLEL
+    template <class RandomAccessIterator>
+    void sort (RandomAccessIterator first, RandomAccessIterator last) {
+        __gnu_parallel::sort(first, last);
+    }
+    template <class RandomAccessIterator, class Compare>
+    void sort (RandomAccessIterator first, RandomAccessIterator last, Compare comp) {
+        __gnu_parallel::sort(first, last, comp);
+    }
+#else
+    template <class RandomAccessIterator>
+    void sort (RandomAccessIterator first, RandomAccessIterator last) {
+        std::sort(first, last);
+    }
+    template <class RandomAccessIterator, class Compare>
+    void sort (RandomAccessIterator first, RandomAccessIterator last, Compare comp) {
+        std::sort(first, last, comp);
+    }
+#endif
+}
+
diff -Nru spades-3.10.1+dfsg/src/common/utils/parallel_wrapper.hpp spades-3.11.1+dfsg/src/common/utils/parallel_wrapper.hpp
--- spades-3.10.1+dfsg/src/common/utils/parallel_wrapper.hpp 2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/parallel_wrapper.hpp 1970-01-01 00:00:00.000000000 +0000
@@ -1,43 -0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
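The relocated wrapper above keeps OpenMP-aware call sites compiling when _OPENMP is undefined by supplying single-threaded stub macros. A usage sketch, assuming this tree's include layout:

    #include "utils/parallel/openmp_wrapper.h"
    #include <cstdio>

    int main() {
        // Built with -fopenmp this can report several threads;
        // built without OpenMP, the stub macro yields 1.
        std::printf("max threads: %d\n", omp_get_max_threads());
    }

parallel_wrapper.hpp follows the same degrade-gracefully pattern: parallel::sort forwards to __gnu_parallel::sort only when USE_GLIBCXX_PARALLEL is defined and falls back to std::sort otherwise.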
-//*************************************************************************** - -/* - * parallel_wrapper.hpp - * - * Created on: Dec 14, 2013 - * Author: anton - */ - -#pragma once -#ifdef USE_GLIBCXX_PARALLEL -#include -#else -#include -#endif - -namespace parallel { -#ifdef USE_GLIBCXX_PARALLEL - template - void sort (RandomAccessIterator first, RandomAccessIterator last) { - __gnu_parallel::sort(first, last); - } - template - void sort (RandomAccessIterator first, RandomAccessIterator last, Compare comp) { - __gnu_parallel::sort(first, last, comp); - } -#else - template - void sort (RandomAccessIterator first, RandomAccessIterator last) { - std::sort(first, last); - } - template - void sort (RandomAccessIterator first, RandomAccessIterator last, Compare comp) { - std::sort(first, last, comp); - } -#endif -} - diff -Nru spades-3.10.1+dfsg/src/common/utils/path_helper.cpp spades-3.11.1+dfsg/src/common/utils/path_helper.cpp --- spades-3.10.1+dfsg/src/common/utils/path_helper.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/path_helper.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,249 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#include "utils/path_helper.hpp" - -#include -#include -#include -#include - -#include -#include - -#include -#include - -namespace path { - -bool make_dir(std::string const& folder) { - return mkdir(folder.c_str(), 0755) == 0; -} - -std::string make_temp_dir(std::string const& prefix, - std::string const& suffix) { - std::string name = append_path(prefix, suffix + "_XXXXXX"); - char* actual; - if ((actual = ::mkdtemp(strcpy(new char[name.length() + 1], name.c_str()))) - == NULL) - throw std::runtime_error("Cannot create temporary dir " + name); - - std::string result(actual); - if (result == name) - throw std::runtime_error("Cannot create temporary dir " + name); - - delete[] actual; - - return result; -} - -void remove_dir(std::string const& folder) { - DIR *dp; - if ((dp = opendir(folder.c_str())) == NULL) - throw std::runtime_error("can not open folder " + folder); - - struct dirent *dirp; - while ((dirp = readdir(dp)) != NULL) { - std::string full_path = folder + "/" + dirp->d_name; - - if (dirp->d_type == DT_DIR) { - if (std::string(".") != dirp->d_name - && std::string("..") != dirp->d_name) { - remove_dir(full_path); - } - } else - remove(full_path.c_str()); - } - - closedir(dp); - remove(folder.c_str()); -} - -bool is_regular_file(std::string const& path) { - struct stat st; - return (stat(path.c_str(), &st) == 0) && (S_ISREG(st.st_mode)); -} - -std::string append_path(std::string const& prefix, std::string const& suffix) { - std::string delimiter = ""; - - if (!boost::ends_with(prefix, "/") && !boost::starts_with(suffix, "/") - && !prefix.empty()) { - delimiter = "/"; - } - - return prefix + delimiter + suffix; -} - -std::string current_dir() { - char* cwd = getcwd(NULL, 0); - std::string result = cwd; - - free(cwd); - return result; -} - -void make_full_path(std::string& path) { - if (!boost::starts_with(path, "/")) // relative path - path = append_path(current_dir(), path); -} - -std::string filename(std::string const& path) { - size_t pos = path.find_last_of('/'); - return pos != std::string::npos ? 
path.substr(pos + 1) : path; -} - -std::string basename(std::string const& path) { - size_t slash = path.find_last_of('/'); - size_t after_slash = slash == std::string::npos ? 0 : slash + 1; - - size_t dot = path.find_last_of('.'); - if (dot < after_slash) - dot = std::string::npos; - - return path.substr(after_slash, dot - after_slash); -} - -std::string extension(std::string const& path) { - size_t slash = path.find_last_of('/'); - size_t after_slash = slash == std::string::npos ? 0 : slash + 1; - size_t dot = path.find_last_of('.'); - - if (dot < after_slash || dot == std::string::npos || dot + 1 == path.size()) - return std::string(); - - return path.substr(dot); -} - -std::string parent_path(std::string const& path) { - std::string cpath(path); - - make_full_path(cpath); - size_t slash_pos = cpath.find_last_of('/'); - - return (slash_pos == 0 ? std::string("/") : cpath.substr(0, slash_pos)); -} - -bool check_existence(std::string const& path) { - struct stat st_buf; - return stat(path.c_str(), &st_buf) == 0 - && (S_ISREG(st_buf.st_mode) || S_ISDIR(st_buf.st_mode)); // exists and (file or dir) -} - -void remove_if_exists(std::string const& path) { - if (check_existence(path)) { - if (is_regular_file(path)) // file - remove(path.c_str()); - else // dir - remove_dir(path); - } -} - -//TODO do we need to screen anything but whitespaces? -std::string screen_whitespaces(std::string const &path) { - std::string to_search = " "; - std::string res = ""; - for (size_t i = 0; i < path.size(); i++) { - if ((i == 0) || (path[i] != ' ') || (path[i - 1] == '\\')) { - res += path[i]; - } else { - res +='\\'; - res +=' '; - } - } -// res += "'"; - return res; -} - -//todo reduce code duplication!!! -bool FileExists(std::string const &filename) { - struct stat st_buf; - return stat(filename.c_str(), &st_buf) == 0 && S_ISREG(st_buf.st_mode); -} - -void CheckFileExistenceFATAL(std::string const &filename) { - if (!FileExists(filename)) FATAL_ERROR("File " << filename << " doesn't exist or can't be read!"); -} - -void make_dirs(std::string const &path) { - VERIFY(!path.empty()); - - size_t slash_pos = 0; - while ((slash_pos = path.find_first_of('/', slash_pos + 1)) != std::string::npos) { - make_dir(path.substr(0, slash_pos)); - } - if (path[path.size() - 1] != '/') { - make_dir(path); - } -} - -// doesn't support symlinks -std::string resolve(std::string const& path) { - typedef boost::char_delimiters_separator separator_t; - typedef boost::tokenizer tokenizer_t; - - tokenizer_t tok(path, separator_t(false, "", "/")); - - std::string result = "/"; - for (auto it = tok.begin(); it != tok.end(); ++it) { - if (*it == "..") - result = parent_path(result); - - else if (*it == ".") - ; // Ignore - - else - // Just cat other path entries - result = append_path(result, *it); - } - - return result; -} - -std::string make_relative_path(std::string p, std::string base) { - p = resolve(p); - base = resolve(base); - - std::string pp = parent_path(p); - - typedef boost::char_delimiters_separator separator_t; - typedef boost::tokenizer tokenizer_t; - - tokenizer_t pp_tok(pp, separator_t(false, "", "/")); - tokenizer_t base_tok(base, separator_t(false, "", "/")); - - auto i = pp_tok.begin(); - auto j = base_tok.begin(); - - while (i != pp_tok.end() && j != base_tok.end() && *i == *j) { - ++i; - ++j; - } - - std::string result; - for (; j != base_tok.end(); ++j) - result = append_path("..", result); - - for (; i != pp_tok.end(); ++i) - result = append_path(result, *i); - - return append_path(result, filename(p)); -} - 
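resolve() and make_relative_path() above normalize "." and ".." purely lexically (the comment notes symlinks are unsupported). std::filesystem has offered the same lexical semantics since C++17, which makes for a compact cross-check of the intended behavior:

    #include <filesystem>
    #include <iostream>

    int main() {
        namespace fs = std::filesystem;
        fs::path p = "/data/runs/../runs/./sample1";
        std::cout << p.lexically_normal() << "\n";   // "/data/runs/sample1"
        std::cout << fs::path("/data/runs/sample1")
                         .lexically_relative("/data/out")
                  << "\n";                           // "../runs/sample1"
    }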
-std::string MakeLaunchTimeDirName() { - time_t rawtime; - struct tm * timeinfo; - char buffer[80]; - - time(&rawtime); - timeinfo = localtime(&rawtime); - - strftime(buffer, 80, "%m.%d_%H.%M.%S", timeinfo); - return std::string(buffer); -} - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/path_helper.hpp spades-3.11.1+dfsg/src/common/utils/path_helper.hpp --- spades-3.10.1+dfsg/src/common/utils/path_helper.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/path_helper.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,74 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#pragma once - -#include -#include -#include -#include - -#include -#include -#include "utils/logger/logger.hpp" -#include "utils/verify.hpp" - -namespace path { -//todo review and make names consistent! - -typedef std::vector files_t; - -bool make_dir(std::string const &folder); - -std::string make_temp_dir(std::string const &prefix, std::string const &suffix); - -void remove_dir(std::string const &folder); - -bool is_regular_file(std::string const &path); - -std::string append_path(std::string const &prefix, std::string const &suffix); - -std::string current_dir(); - -//todo why non-cons argument?! -void make_full_path(std::string &path); - -std::string filename(std::string const &path); - -std::string basename(std::string const &path); - -std::string extension(std::string const &path); - -std::string parent_path(std::string const &path); - -bool check_existence(std::string const &path); - -void remove_if_exists(std::string const &path); - -std::string screen_whitespaces(std::string const &path); - -/** -* Checks if file exists. -* Analogs: http://www.techbytes.ca/techbyte103.html , http://www.gamedev.net/topic/211918-determining-if-a-file-exists-c/ -*/ -bool FileExists(std::string const &filename); - -/** -* Exit(1) if file doesn't exists, writes FATAL log message. -*/ -void CheckFileExistenceFATAL(std::string const &filename); - -void make_dirs(std::string const &path); - -// doesn't support symlinks -std::string resolve(std::string const &path); - -std::string make_relative_path(std::string p, std::string base = current_dir()); - -std::string MakeLaunchTimeDirName(); - -} diff -Nru spades-3.10.1+dfsg/src/common/utils/perf/memory.hpp spades-3.11.1+dfsg/src/common/utils/perf/memory.hpp --- spades-3.10.1+dfsg/src/common/utils/perf/memory.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/perf/memory.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -0,0 +1,70 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +/* + * memory.hpp + * + * Created on: 05.03.2011 + * Author: http://stackoverflow.com/questions/669438/how-to-get-memory-usage-at-run-time-in-c + */ + +#ifndef MEMORY_HPP_ +#define MEMORY_HPP_ + +#include +#include +#include +#include +#include + +namespace utils { +////////////////////////////////////////////////////////////////////////////// +// +// process_mem_usage(unsigned long &, long &) - takes two longs by reference, +// attempts to read the system-dependent data for a process' virtual memory +// size and resident set size, and return the results in KB. +// +// On failure, returns 0, 0 + +void process_mem_usage(unsigned long &vm_usage, long &resident_set) { + using std::ios_base; + using std::ifstream; + using std::string; + + vm_usage = 0; + resident_set = 0; + + // 'file' stat seems to give the most reliable results + // + ifstream stat_stream("/proc/self/stat", ios_base::in); + + // dummy vars for leading entries in stat that we don't care about + // + string pid, comm, state, ppid, pgrp, session, tty_nr; + string tpgid, flags, minflt, cminflt, majflt, cmajflt; + string utime, stime, cutime, cstime, priority, nice; + string O, itrealvalue, starttime; + + // the two fields we want + // + unsigned long vsize; + long rss; + + stat_stream >> pid >> comm >> state >> ppid >> pgrp >> session >> tty_nr + >> tpgid >> flags >> minflt >> cminflt >> majflt >> cmajflt + >> utime >> stime >> cutime >> cstime >> priority >> nice + >> O >> itrealvalue >> starttime >> vsize >> rss; // don't care about the rest + + stat_stream.close(); + + long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages + vm_usage = vsize / 1024; + resident_set = rss * page_size_kb; +} + +} +#endif /* MEMORY_HPP_ */ diff -Nru spades-3.10.1+dfsg/src/common/utils/perf/memory_limit.hpp spades-3.11.1+dfsg/src/common/utils/perf/memory_limit.hpp --- spades-3.10.1+dfsg/src/common/utils/perf/memory_limit.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/perf/memory_limit.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -0,0 +1,102 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
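A usage sketch for the process_mem_usage() helper added above. It parses /proc/self/stat, so it reports meaningful numbers on Linux only; both results are in KB:

    #include "utils/perf/memory.hpp"
    #include <iostream>

    int main() {
        unsigned long vm_kb;
        long rss_kb;
        utils::process_mem_usage(vm_kb, rss_kb);
        std::cout << "VM: " << vm_kb << " KB, RSS: " << rss_kb << " KB\n";
    }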
+//*************************************************************************** + +#pragma once + +#if __DARWIN || __DARWIN_UNIX03 +#include +#include +#else + +#include + +#endif + +#include +#include + +#include "config.hpp" + +#ifdef SPADES_USE_JEMALLOC + +# include + +#endif + +#include + +namespace utils { + +inline void limit_memory(size_t limit) { + rlimit rl; + if (sizeof(rlim_t) < 8) { + INFO("Can't limit virtual memory because of 32-bit system"); + return; + } + + int res = getrlimit(RLIMIT_AS, &rl); + VERIFY_MSG(res == 0, + "getrlimit(2) call failed, errno = " << errno); + + // We cannot go beyond hard limit and we might not have enough privileges to + // increase the hard limit + rl.rlim_cur = std::min(limit, rl.rlim_max); + res = setrlimit(RLIMIT_AS, &rl); + VERIFY_MSG(res == 0, + "setrlimit(2) call failed, errno = " << errno); + INFO("Memory limit set to " << (1.0 * (double) rl.rlim_cur / 1024 / 1024 / 1024) << " Gb"); +} + +inline size_t get_memory_limit() { + rlimit rl; + int res = getrlimit(RLIMIT_AS, &rl); + VERIFY_MSG(res == 0, + "getrlimit(2) call failed, errno = " << errno); + + return rl.rlim_cur; +} + +#if __DARWIN || __DARWIN_UNIX03 +inline size_t get_max_rss() { + struct task_basic_info t_info; + mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT; + + if (KERN_SUCCESS != + task_info(mach_task_self(), + TASK_BASIC_INFO, (task_info_t)&t_info, &t_info_count)) + return -1U; + + return t_info.resident_size / 1024; +} +#else + +inline size_t get_max_rss() { + rusage ru; + getrusage(RUSAGE_SELF, &ru); + + return ru.ru_maxrss; +} + +#endif + +inline size_t get_used_memory() { +#ifdef SPADES_USE_JEMALLOC + const size_t *cmem = 0; + size_t clen = sizeof(cmem); + + je_mallctl("stats.cactive", &cmem, &clen, NULL, 0); + return *cmem; +#else + get_max_rss(); +#endif +} + +inline size_t get_free_memory() { + return get_memory_limit() - get_used_memory(); +} + +} diff -Nru spades-3.10.1+dfsg/src/common/utils/perf/perfcounter.hpp spades-3.11.1+dfsg/src/common/utils/perf/perfcounter.hpp --- spades-3.10.1+dfsg/src/common/utils/perf/perfcounter.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/perf/perfcounter.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -0,0 +1,113 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
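One caveat in the file above: the non-jemalloc branch of get_used_memory() computes get_max_rss() but never returns it, so only the SPADES_USE_JEMALLOC build yields a defined result there. A usage sketch for the limit helpers (limit_memory() and get_memory_limit() work in bytes via RLIMIT_AS; get_max_rss() is in KB):

    #include "utils/perf/memory_limit.hpp"
    #include <iostream>

    int main() {
        utils::limit_memory(8ULL << 30);   // cap the address space at 8 GB
        std::cout << "address-space limit: " << utils::get_memory_limit() << " bytes\n"
                  << "peak RSS so far: " << utils::get_max_rss() << " KB\n";
    }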
+//*************************************************************************** + +#pragma once +#include +#include +#include + +namespace utils { +struct perf_counter { + perf_counter() { + reset(); + } + + double time() const { + struct timeval now; + gettimeofday(&now, NULL); + + return (double) (now.tv_sec - time_.tv_sec) + (double) (now.tv_usec - time_.tv_usec) * 1e-6; + } + + double time_ms() const { + return time() * 1e3; + } + + void reset() { + gettimeofday(&time_, NULL); + } + +private: + struct timeval time_; +}; + + +inline std::string human_readable_time(double time_in_sec) { +// assert(time_in_sec > 0); + + size_t msec = size_t(time_in_sec * 1000) % 1000; + size_t sec = size_t(time_in_sec); + size_t hours = sec / 3600; + size_t mins = (sec / 60) % 60; + sec %= 60; + + return fmt::format("{:3d}:{:02d}:{:02d}.{:03d}", hours, mins, sec, msec); +} + +inline std::string human_readable_memory(size_t max_rss) { + if (max_rss < 1024 * 1024) { + return fmt::format("{:d}M", (max_rss / 1024)); + } else { + return fmt::format("{:d}G", (max_rss / (1024 * 1024))); + } +} + +struct avg_perf_counter { + avg_perf_counter(/*const string& name*/)// : name_(name) + { + reset(); + } + +// ~avg_perf_counter() { +// cout << "Time in counter " << name_ << ": " << human_readable_time(time()) << endl; +// } + + int start(int ret = 0) { + p_cnt_.reset(); + return ret; + } + + int stop(int ret = 0) { + counter_++; + whole_time_ += p_cnt_.time(); + return ret; + } + + double time() const { + return whole_time_; + } + + size_t counts() { + return counter_; + } + + double time_ms() const { + return time() * 1e3; + } + + double avg_time() const { + return counter_ > 0 ? whole_time_ / (double) counter_ : 0.; + } + + double avg_time_ms() const { + return avg_time() * 1e3; + } + + void reset() { + p_cnt_.reset(); + whole_time_ = 0; + counter_ = 0; + } + +private: + const std::string name_; + perf_counter p_cnt_; + double whole_time_; + size_t counter_; + +}; +} diff -Nru spades-3.10.1+dfsg/src/common/utils/perfcounter.hpp spades-3.11.1+dfsg/src/common/utils/perfcounter.hpp --- spades-3.10.1+dfsg/src/common/utils/perfcounter.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/perfcounter.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,123 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
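A usage sketch for the relocated timers above: perf_counter measures a single interval via gettimeofday(), while avg_perf_counter averages over repeated start()/stop() pairs:

    #include "utils/perf/perfcounter.hpp"
    #include <iostream>

    int main() {
        utils::perf_counter pc;
        volatile double sink = 0;
        for (int i = 0; i < 1000000; ++i) sink += i * 0.5;
        std::cout << "elapsed: " << utils::human_readable_time(pc.time()) << "\n";

        utils::avg_perf_counter avg;
        for (int run = 0; run < 10; ++run) {
            avg.start();
            for (int i = 0; i < 100000; ++i) sink += i;
            avg.stop();
        }
        std::cout << "avg per run: " << avg.avg_time_ms() << " ms\n";
    }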
-//*************************************************************************** - -#pragma once -#include -#include -#include - -struct perf_counter -{ - perf_counter() - { - reset(); - } - - double time() const - { - struct timeval now; - gettimeofday(&now, NULL); - - return (double)(now.tv_sec - time_.tv_sec) + (double)(now.tv_usec - time_.tv_usec) * 1e-6; - } - - double time_ms() const - { - return time() * 1e3; - } - - void reset() - { - gettimeofday(&time_, NULL); - } - -private: - struct timeval time_; -}; - - -inline std::string human_readable_time(double time_in_sec) -{ -// assert(time_in_sec > 0); - - size_t msec = size_t(time_in_sec * 1000) % 1000; - size_t sec = size_t(time_in_sec); - size_t hours = sec / 3600; - size_t mins = (sec / 60) % 60; - sec %= 60; - - return fmt::format("{:3d}:{:02d}:{:02d}.{:03d}", hours, mins, sec, msec); -} - -inline std::string human_readable_memory(size_t max_rss) { - if (max_rss < 1024 * 1024) { - return fmt::format("{:d}M", (max_rss / 1024)); - } else { - return fmt::format("{:d}G", (max_rss / (1024 * 1024))); - } -} - -struct avg_perf_counter -{ - avg_perf_counter(/*const string& name*/)// : name_(name) - { - reset(); - } - -// ~avg_perf_counter() { -// cout << "Time in counter " << name_ << ": " << human_readable_time(time()) << endl; -// } - - int start(int ret = 0) - { - p_cnt_.reset(); - return ret; - } - - int stop(int ret = 0) - { - counter_++; - whole_time_ += p_cnt_.time(); - return ret; - } - double time() const - { - return whole_time_; - } - size_t counts() - { - return counter_; - } - double time_ms() const - { - return time() * 1e3; - } - - double avg_time() const - { - return counter_ > 0 ? whole_time_/(double)counter_ : 0.; - } - - double avg_time_ms() const - { - return avg_time() * 1e3; - } - - void reset() - { - p_cnt_.reset(); - whole_time_ = 0; - counter_ = 0; - } - -private: - const std::string name_; - perf_counter p_cnt_; - double whole_time_; - size_t counter_; - -}; diff -Nru spades-3.10.1+dfsg/src/common/utils/ph_map/key_with_hash.hpp spades-3.11.1+dfsg/src/common/utils/ph_map/key_with_hash.hpp --- spades-3.10.1+dfsg/src/common/utils/ph_map/key_with_hash.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/ph_map/key_with_hash.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -0,0 +1,229 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#pragma once + +#include "storing_traits.hpp" + +namespace utils { + +template +class SimpleKeyWithHash { +public: + typedef Key KeyType; +private: + typedef typename HashFunction::IdxType IdxType; + const HashFunction &hash_; + Key key_; + mutable IdxType idx_; //lazy computation + mutable bool ready_; + + void CountIdx() const { + ready_ = true; + idx_ = hash_.seq_idx(key_); + } + + void SetKey(const Key &key) { + ready_ = false; + key_ = key; + } +public: + + SimpleKeyWithHash(Key key, const HashFunction &hash) : hash_(hash), key_(key), idx_(0), ready_(false) { + } + + Key key() const { + return key_; + } + + IdxType idx() const { + if(!ready_) { + CountIdx(); + } + return idx_; + } + + SimpleKeyWithHash &operator=(const SimpleKeyWithHash &that) { + VERIFY(&this->hash_ == &that.hash_); + this->key_= that.key_; + this->idx_ = that.idx_; + this->ready_ = that.ready_; + return *this; + } + + bool operator==(const SimpleKeyWithHash &that) const { + VERIFY(&this->hash_ == &that.hash_); + if (this->ready_ && that.ready_) + return this->idx_ == that.idx_ && this->is_minimal_ == that.is_minimal_; + return this->key_ == that.key_; + } + + bool operator!=(const SimpleKeyWithHash &that) const { + VERIFY(&this->hash_ == &that.hash_); + return this->key_ != that.key_; + } + + SimpleKeyWithHash operator!() const { + return SimpleKeyWithHash(!key_, hash_); + } + + SimpleKeyWithHash operator<<(char nucl) const { + return SimpleKeyWithHash(key_ << nucl, hash_); + } + + SimpleKeyWithHash operator>>(char nucl) const { + return SimpleKeyWithHash(key_ >> nucl, hash_); + } + + void operator<<=(char nucl) { + SetKey(key_ << nucl); + } + + void operator>>=(char nucl) { + SetKey(key_ >> nucl); + } + + char operator[](size_t i) const { + return key_[i]; + } + + bool is_minimal() const { + return true; + } +}; + +template +stream &operator<<(stream &s, const SimpleKeyWithHash &kwh) { + return s << "SKWH[" << kwh.key() << ", " << kwh.idx() << "]"; +} + +//Would it make sense to also store inverted kmer for not minimal kwh? 
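SimpleKeyWithHash above defers the perfect-hash lookup: idx_ is filled on first use and ready_ is cleared whenever the key mutates, so shifting nucleotides through a k-mer never pays for hashes nobody reads. (Note that its operator== also touches an is_minimal_ member that SimpleKeyWithHash never declares; being a member of a class template, that body only fails to compile if it is ever instantiated.) The caching pattern in miniature (hypothetical names):

    #include <iostream>

    struct LazyHashed {
        long key;
        mutable long idx = 0;
        mutable bool ready = false;

        long index() const {                 // compute once, then reuse
            if (!ready) { idx = key * 2654435761L; ready = true; }
            return idx;
        }
        void shift(long k) { key += k; ready = false; }  // mutation invalidates
    };

    int main() {
        LazyHashed h{42};
        std::cout << h.index() << "\n";   // computed here
        h.shift(1);                       // cached value dropped
        std::cout << h.index() << "\n";   // recomputed for the new key
    }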
+template +class InvertableKeyWithHash { +private: + typedef typename HashFunction::IdxType IdxType; + + const HashFunction &hash_; + Key key_; + mutable IdxType idx_; //lazy computation + mutable bool is_minimal_; + mutable bool ready_; + + void CountIdx() const { + ready_ = true; + is_minimal_ = key_.IsMinimal(); + if(is_minimal_) + idx_ = hash_.seq_idx(key_); + else{ + idx_ = hash_.seq_idx(!key_); + } + } + + InvertableKeyWithHash(Key key, const HashFunction &hash, bool is_minimal, + size_t idx, bool ready) + : hash_(hash), key_(key), idx_(idx), + is_minimal_(is_minimal), ready_(ready) { + } + public: + + InvertableKeyWithHash(Key key, const HashFunction &hash) + : hash_(hash), key_(key), idx_(0), is_minimal_(false), ready_(false) {} + + const Key &key() const { + return key_; + } + + IdxType idx() const { + if (!ready_) + CountIdx(); + + return idx_; + } + + bool is_minimal() const { + if(!ready_) { + return key_.IsMinimal(); + } + return is_minimal_; + } + + bool ready() const { + return ready_; + } + + InvertableKeyWithHash &operator=(const InvertableKeyWithHash &that) { + VERIFY(&this->hash_ == &that.hash_); + this->key_= that.key_; + this->idx_ = that.idx_; + this->ready_ = that.ready_; + this->is_minimal_ = that.is_minimal_; + return *this; + } + + bool operator==(const InvertableKeyWithHash &that) const { + VERIFY(&this->hash_ == &that.hash_); + return this->key_ == that.key_; + } + + bool operator!=(const InvertableKeyWithHash &that) const { + VERIFY(&this->hash_ == &that.hash_); + return this->key_ != that.key_; + } + + InvertableKeyWithHash operator!() const { + if(!ready_) + return InvertableKeyWithHash(!key_, hash_); + return InvertableKeyWithHash(!key_, hash_, !is_minimal_, idx_, ready_); + } + + InvertableKeyWithHash operator<<(char nucl) const { + return InvertableKeyWithHash(key_ << nucl, hash_); + } + + InvertableKeyWithHash operator>>(char nucl) const { + return InvertableKeyWithHash(key_ >> nucl, hash_); + } + + void operator<<=(char nucl) { + key_ <<= nucl; + ready_ = false; + } + + void operator>>=(char nucl) { + key_ >>= nucl; + ready_ = false; + } + + char operator[](size_t i) const { + return key_[i]; + } +}; + +template +stream &operator<<(stream &s, const InvertableKeyWithHash &kwh) { + s << "IKWH[" << kwh.key(); + if(kwh.ready()) { + return s << ", " << kwh.is_minimal() << ", " << kwh.idx() << "]"; + } else { + return s << ", not ready]"; + } +} + +template +struct StoringTraits; + +template +struct StoringTraits { + typedef SimpleKeyWithHash KeyWithHash; +}; + +template +struct StoringTraits { + typedef InvertableKeyWithHash KeyWithHash; +}; + +} diff -Nru spades-3.10.1+dfsg/src/common/utils/ph_map/perfect_hash_map_builder.hpp spades-3.11.1+dfsg/src/common/utils/ph_map/perfect_hash_map_builder.hpp --- spades-3.10.1+dfsg/src/common/utils/ph_map/perfect_hash_map_builder.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/common/utils/ph_map/perfect_hash_map_builder.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -0,0 +1,93 @@ +#pragma once +//*************************************************************************** +//* Copyright (c) 2016 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. 
diff -Nru spades-3.10.1+dfsg/src/common/utils/ph_map/perfect_hash_map_builder.hpp spades-3.11.1+dfsg/src/common/utils/ph_map/perfect_hash_map_builder.hpp
--- spades-3.10.1+dfsg/src/common/utils/ph_map/perfect_hash_map_builder.hpp	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/ph_map/perfect_hash_map_builder.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -0,0 +1,93 @@
+#pragma once
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "utils/kmer_mph/kmer_index_builder.hpp"
+#include "utils/kmer_mph/kmer_splitters.hpp"
+
+#include "perfect_hash_map.hpp"
+
+namespace utils {
+
+struct PerfectHashMapBuilder {
+    template<class K, class V, class traits, class StoringType, class Counter>
+    void BuildIndex(PerfectHashMap<K, V, traits, StoringType> &index,
+                    Counter& counter, size_t bucket_num,
+                    size_t thread_num, bool save_final = true) const {
+        using KMerIndex = typename PerfectHashMap<K, V, traits, StoringType>::KMerIndexT;
+
+        KMerIndexBuilder<KMerIndex> builder(index.workdir(),
+                                            (unsigned) bucket_num,
+                                            (unsigned) thread_num);
+        size_t sz = builder.BuildIndex(*index.index_ptr_, counter, save_final);
+        index.resize(sz);
+    }
+};
+
+struct KeyStoringIndexBuilder {
+    template<class K, class V, class traits, class StoringType, class Counter>
+    void BuildIndex(KeyStoringMap<K, V, traits, StoringType> &index,
+                    Counter& counter, size_t bucket_num,
+                    size_t thread_num, bool save_final = true) const {
+        phm_builder_.BuildIndex(index, counter, bucket_num, thread_num, save_final);
+        VERIFY(!index.kmers_.get());
+        index.kmers_ = counter.GetFinalKMers();
+        VERIFY(index.kmers_.get());
+        index.SortUniqueKMers();
+    }
+
+  private:
+    PerfectHashMapBuilder phm_builder_;
+};
+
+struct KeyIteratingIndexBuilder {
+    template<class K, class V, class traits, class StoringType, class Counter>
+    void BuildIndex(KeyIteratingMap<K, V, traits, StoringType> &index,
+                    Counter& counter, size_t bucket_num,
+                    size_t thread_num, bool save_final = true) const {
+        phm_builder_.BuildIndex(index, counter, bucket_num, thread_num, save_final);
+        index.KMersFilename_ = counter.GetFinalKMersFname();
+    }
+
+  private:
+    PerfectHashMapBuilder phm_builder_;
+};
+
+template<class K, class V, class traits, class StoringType, class Counter>
+void BuildIndex(KeyIteratingMap<K, V, traits, StoringType> &index,
+                Counter& counter, size_t bucket_num,
+                size_t thread_num, bool save_final = true) {
+    KeyIteratingIndexBuilder().BuildIndex(index, counter, bucket_num, thread_num, save_final);
+}
+
+template<class K, class V, class traits, class StoringType, class Counter>
+void BuildIndex(KeyStoringMap<K, V, traits, StoringType> &index,
+                Counter& counter, size_t bucket_num,
+                size_t thread_num, bool save_final = true) {
+    KeyStoringIndexBuilder().BuildIndex(index, counter, bucket_num, thread_num, save_final);
+}
+
+template<class K, class V, class traits, class StoringType, class Counter>
+void BuildIndex(PerfectHashMap<K, V, traits, StoringType> &index,
+                Counter& counter, size_t bucket_num,
+                size_t thread_num, bool save_final = true) {
+    PerfectHashMapBuilder().BuildIndex(index, counter, bucket_num, thread_num, save_final);
+}
+
+template<class Index, class Streams>
+size_t BuildIndexFromStream(Index &index,
+                            Streams &streams,
+                            io::SingleStream* contigs_stream = 0) {
+    DeBruijnReadKMerSplitter<typename Streams::ReadT,
+                             StoringTypeFilter<typename Index::storing_type>>
+            splitter(index.workdir(), index.k(), 0, streams, contigs_stream);
+    KMerDiskCounter<RtSeq> counter(index.workdir(), splitter);
+    BuildIndex(index, counter, 16, streams.size());
+    return 0;
+}
+
+}
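The free BuildIndex() overloads above dispatch on the static type of the map, so each map flavour automatically gets the builder that knows how to finalize it. A self-contained sketch of that overload-dispatch shape (all names here are illustrative stand-ins, not the SPAdes builders):

    #include <iostream>

    struct PlainMap {};
    struct StoringMap : PlainMap {};

    struct PlainBuilder   { void Build(PlainMap&)   const { std::cout << "plain\n"; } };
    struct StoringBuilder { void Build(StoringMap&) const { std::cout << "storing\n"; } };

    // Least specific overload: anything convertible to PlainMap&.
    void BuildIndex(PlainMap &m)   { PlainBuilder().Build(m); }
    // More specific overload: preferred by overload resolution for StoringMap.
    void BuildIndex(StoringMap &m) { StoringBuilder().Build(m); }

    int main() {
        StoringMap m;
        BuildIndex(m); // selects the StoringMap builder, prints "storing"
        return 0;
    }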
diff -Nru spades-3.10.1+dfsg/src/common/utils/ph_map/perfect_hash_map.hpp spades-3.11.1+dfsg/src/common/utils/ph_map/perfect_hash_map.hpp
--- spades-3.10.1+dfsg/src/common/utils/ph_map/perfect_hash_map.hpp	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/ph_map/perfect_hash_map.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -0,0 +1,339 @@
+#pragma once
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "utils/parallel/openmp_wrapper.h"
+#include "utils/filesystem/path_helper.hpp"
+#include "io/kmers/kmer_iterator.hpp"
+
+#include "utils/kmer_mph/kmer_index.hpp"
+
+#include "key_with_hash.hpp"
+#include "values.hpp"
+#include "storing_traits.hpp"
+
+#include <vector>
+#include <memory>
+#include <string>
+
+namespace utils {
+
+template<class K, class traits>
+class IndexWrapper {
+    static const size_t InvalidIdx = size_t(-1);
+public:
+    typedef size_t IdxType;
+    typedef K KeyType;
+    typedef traits traits_t;
+protected:
+    typedef KMerIndex<traits> KMerIndexT;
+    //these fields are protected only for reduction of storage in edge indices BinWrite
+    std::shared_ptr<KMerIndexT> index_ptr_;
+private:
+    std::string workdir_;
+    unsigned k_;
+
+protected:
+    size_t raw_seq_idx(const typename KMerIndexT::KMerRawReference s) const {
+        return index_ptr_->raw_seq_idx(s);
+    }
+
+    bool valid(const size_t idx) const {
+        return idx != InvalidIdx && idx < index_ptr_->size();
+    }
+public:
+    IndexWrapper(size_t k, const std::string &workdir)
+            : index_ptr_(std::make_shared<KMerIndexT>())
+            , k_((unsigned) k) {
+        //fixme string literal
+        workdir_ = fs::make_temp_dir(workdir, "kmeridx");
+    }
+
+    IndexWrapper(size_t k, const std::string &workdir, std::shared_ptr<KMerIndexT> index_ptr)
+            : IndexWrapper(k, workdir) {
+        index_ptr_ = index_ptr;
+    }
+
+    ~IndexWrapper() {
+        fs::remove_dir(workdir_);
+    }
+
+    void clear() {
+        index_ptr_->clear();
+    }
+
+    unsigned k() const { return k_; }
+
+public:
+    template<class Writer>
+    void BinWrite(Writer &writer) const {
+        index_ptr_->serialize(writer);
+    }
+
+    template<class Reader>
+    void BinRead(Reader &reader, const std::string &) {
+        clear();
+        index_ptr_->deserialize(reader);
+    }
+
+    const std::string &workdir() const {
+        return workdir_;
+    }
+};
+
+template<class K, class V, class traits = kmer_index_traits<K>, class StoringType = SimpleStoring>
+class PerfectHashMap : public ValueArray<V>, public IndexWrapper<K, traits> {
+public:
+    typedef size_t IdxType;
+    typedef K KeyType;
+    typedef ValueArray<V> ValueBase;
+    typedef IndexWrapper<K, traits> KeyBase;
+    using KeyBase::index_ptr_;
+    typedef typename KeyBase::KMerIndexT KMerIndexT;
+    typedef typename StoringTraits<K, KMerIndexT, StoringType>::KeyWithHash KeyWithHash;
+
+    KeyWithHash ConstructKWH(const KeyType &key) const {
+        return KeyWithHash(key, *index_ptr_);
+    }
+
+    bool valid(const KeyWithHash &kwh) const {
+        return KeyBase::valid(kwh.idx());
+    }
+
+    PerfectHashMap(size_t k, const std::string &workdir) : KeyBase(k, workdir) {
+    }
+
+    PerfectHashMap(size_t k, const std::string &workdir, std::shared_ptr<KMerIndexT> index_ptr)
+            : KeyBase(k, workdir, index_ptr) {
+        ValueBase::resize(index_ptr_->size());
+    }
+
+    ~PerfectHashMap() {
+    }
+
+    void clear() {
+        KeyBase::clear();
+        ValueBase::clear();
+    }
+
+    const V get_value(const KeyWithHash &kwh) const {
+        return StoringType::get_value(*this, kwh);
+    }
+
+    template<typename F>
+    const V get_value(const KeyWithHash &kwh, const F& inverter) const {
+        return StoringType::get_value(*this, kwh, inverter);
+    }
+
+    //Think twice or ask AntonB if you want to use it!
+    V &get_raw_value_reference(const KeyWithHash &kwh) {
+        return ValueBase::operator[](kwh.idx());
+    }
+
+    const V &get_raw_value_reference(const KeyWithHash &kwh) const {
+        return ValueBase::operator[](kwh.idx());
+    }
+
+    void put_value(const KeyWithHash &kwh, const V &value) {
+        StoringType::set_value(*this, kwh, value);
+    }
+
+    template<typename F>
+    void put_value(const KeyWithHash &kwh, const V &value, const F& inverter) {
+        StoringType::set_value(*this, kwh, value, inverter);
+    }
+
+    template<class Writer>
+    void BinWrite(Writer &writer) const {
+        KeyBase::BinWrite(writer);
+        ValueBase::BinWrite(writer);
+    }
+
+    template<class Reader>
+    void BinRead(Reader &reader, const std::string &tmp) {
+        KeyBase::BinRead(reader, tmp);
+        ValueBase::BinRead(reader, tmp);
+    }
+
+    friend struct PerfectHashMapBuilder;
+};
+
+
+template<class K, class V, class traits = kmer_index_traits<K>, class StoringType = SimpleStoring>
+class KeyStoringMap : public PerfectHashMap<K, V, traits, StoringType> {
+private:
+    typedef PerfectHashMap<K, V, traits, StoringType> base;
+
+public:
+    typedef traits traits_t;
+    typedef K KMer;
+    typedef typename base::IdxType KMerIdx;
+    typedef typename traits::FinalKMerStorage::iterator kmer_iterator;
+    typedef typename traits::FinalKMerStorage::const_iterator const_kmer_iterator;
+    typedef typename base::KeyWithHash KeyWithHash;
+    using base::ConstructKWH;
+
+private:
+    std::unique_ptr<typename traits::FinalKMerStorage> kmers_;
+
+    void SortUniqueKMers() const {
+        size_t swaps = 0;
+        INFO("Arranging kmers in hash map order");
+        for (auto I = kmers_->begin(), E = kmers_->end(); I != E; ++I) {
+            size_t cidx = I - kmers_->begin();
+            size_t kidx = this->raw_seq_idx(*I);
+            while (cidx != kidx) {
+                auto J = kmers_->begin() + kidx;
+                using std::swap;
+                swap(*I, *J);
+                swaps += 1;
+                kidx = this->raw_seq_idx(*I);
+            }
+        }
+        INFO("Done. Total swaps: " << swaps);
+    }
+
+protected:
+    template<class Writer>
+    void BinWriteKmers(Writer &writer) const {
+        traits::raw_serialize(writer, this->kmers_);
+    }
+
+    template<class Reader>
+    void BinReadKmers(Reader &reader, const std::string &FileName) {
+        this->kmers_ = traits_t::raw_deserialize(reader, FileName);
+    }
+
+public:
+    template<class Writer>
+    void BinWrite(Writer &writer) const {
+        base::BinWrite(writer);
+        BinWriteKmers(writer);
+    }
+
+    template<class Reader>
+    void BinRead(Reader &reader, const std::string &FileName) {
+        base::BinRead(reader, FileName);
+        BinReadKmers(reader, FileName);
+    }
+
+    KeyStoringMap(size_t k, const std::string &workdir)
+            : base(k, workdir), kmers_(nullptr) {}
+
+    ~KeyStoringMap() {}
+
+    KMer true_kmer(KeyWithHash kwh) const {
+        VERIFY(this->valid(kwh));
+
+        auto it = this->kmers_->begin() + kwh.idx();
+        return (typename traits_t::raw_create()(this->k(), *it));
+    }
+
+    void clear() {
+        base::clear();
+        kmers_ = nullptr;
+    }
+
+    kmer_iterator kmer_begin() {
+        return kmers_->begin();
+    }
+    const_kmer_iterator kmer_begin() const {
+        return kmers_->cbegin();
+    }
+
+    kmer_iterator kmer_end() {
+        return kmers_->end();
+    }
+    const_kmer_iterator kmer_end() const {
+        return kmers_->cend();
+    }
+
+    bool valid(const KeyWithHash &kwh) const {
+        if (!base::valid(kwh))
+            return false;
+
+        auto it = this->kmers_->begin() + kwh.idx();
+        if (!kwh.is_minimal())
+            return (typename traits_t::raw_equal_to()(!kwh.key(), *it));
+        else
+            return (typename traits_t::raw_equal_to()(kwh.key(), *it));
+    }
+
+    /**
+     * Number of edges going out of the param edge's end
+     */
+    unsigned NextEdgeCount(const KeyWithHash &kwh) const {
+        unsigned res = 0;
+        for (char c = 0; c < 4; ++c)
+            if (valid(kwh << c))
+                res += 1;
+
+        return res;
+    }
+
+    KeyWithHash NextEdge(const KeyWithHash &kwh) const { // returns any next edge
+        for (char c = 0; c < 4; ++c) {
+            if (valid(kwh << c))
+                //hack for this code to work with long seqs! (otherwise return s is totally fine)
+                return ConstructKWH(true_kmer(kwh));//s;
+        }
+
+        VERIFY_MSG(false, "Couldn't find requested edge!");
+        return ConstructKWH(KMer(this->k()));
+        // no next edges (we should request one here).
+    }
+
+    /**
+     * Number of edges coming into param edge's end
+     */
+    unsigned RivalEdgeCount(const KeyWithHash &kwh) const {
+        KeyWithHash next = kwh << 'A';
+        unsigned res = 0;
+        for (char c = 0; c < 4; ++c)
+            if (valid(next >> c))
+                res += 1;
+
+        return res;
+    }
+
+    friend struct KeyStoringIndexBuilder;
+};
+
+template<class K, class V, class traits = kmer_index_traits<K>, class StoringType = SimpleStoring>
+class KeyIteratingMap : public PerfectHashMap<K, V, traits, StoringType> {
+    typedef PerfectHashMap<K, V, traits, StoringType> base;
+
+    std::string KMersFilename_;
+
+public:
+    typedef StoringType storing_type;
+    typedef typename base::traits_t traits_t;
+    typedef typename base::KeyType KMer;
+    typedef typename base::IdxType KMerIdx;
+    using base::ConstructKWH;
+
+public:
+
+    KeyIteratingMap(size_t k, const std::string &workdir)
+            : base(k, workdir), KMersFilename_("") {}
+
+    ~KeyIteratingMap() {}
+
+    typedef MMappedFileRecordArrayIterator<typename KMer::DataType> kmer_iterator;
+
+    kmer_iterator kmer_begin() const {
+        return kmer_iterator(this->KMersFilename_, KMer::GetDataSize(base::k()));
+    }
+
+    std::vector<kmer_iterator> kmer_begin(size_t parts) const {
+        return io::make_kmer_iterator<KMer>(this->KMersFilename_, base::k(), parts);
+    }
+
+    friend struct KeyIteratingIndexBuilder;
+};
+
+}
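SortUniqueKMers() in KeyStoringMap above rearranges the k-mer array in place: each element is repeatedly swapped toward the slot its hash assigns it, following the permutation cycle until the current slot is fixed. A standalone sketch of that cycle-following rearrangement, with a toy target() in place of raw_seq_idx():

    #include <cstddef>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
        std::vector<int> data = {30, 10, 40, 20};
        // Toy "hash": value v belongs at slot v / 10 - 1.
        auto target = [](int v) { return (size_t) (v / 10 - 1); };

        for (size_t cidx = 0; cidx < data.size(); ++cidx) {
            size_t kidx = target(data[cidx]);
            while (cidx != kidx) {              // follow the cycle until slot cidx holds its own element
                std::swap(data[cidx], data[kidx]);
                kidx = target(data[cidx]);
            }
        }
        for (int v : data) std::cout << v << ' ';  // prints: 10 20 30 40
        return 0;
    }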
diff -Nru spades-3.10.1+dfsg/src/common/utils/ph_map/storing_traits.hpp spades-3.11.1+dfsg/src/common/utils/ph_map/storing_traits.hpp
--- spades-3.10.1+dfsg/src/common/utils/ph_map/storing_traits.hpp	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/ph_map/storing_traits.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -0,0 +1,100 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+/*
+ * storing_traits.hpp
+ *
+ *  Created on: Nov 7, 2013
+ *      Author: anton
+ */
+
+#include "values.hpp"
+
+namespace utils {
+
+struct SimpleStoring {
+    template<class K, class V>
+    static V get_value(const ValueArray<V> &values, const K& key) {
+        return values[key.idx()];
+    }
+
+    template<class K, class V>
+    static void set_value(ValueArray<V> &values, const K& key, const V& value) {
+        values[key.idx()] = value;
+    }
+
+    static bool IsInvertable() {
+        return false;
+    }
+};
+
+struct InvertableStoring {
+    template<typename V>
+    struct default_inverter {
+        template<typename K>
+        V operator()(const V& v, const K& k) const {
+            return v.conjugate(k);
+        }
+    };
+
+    template<typename V>
+    struct trivial_inverter {
+        template<typename K>
+        V operator()(const V& v, const K& /*k*/) const {
+            return v;
+        }
+    };
+
+    template<typename K, typename V, typename F = default_inverter<V>>
+    static V get_value(const ValueArray<V> &values, const K& key,
+                       const F& inverter = F()) {
+        if (key.is_minimal())
+            return values[key.idx()];
+        else
+            return inverter(values[key.idx()], key);
+    }
+
+    template<typename K, typename V, typename F = default_inverter<V>>
+    static void set_value(ValueArray<V>& values, const K& key, const V& value,
+                          const F& inverter = F()) {
+        VERIFY(key.idx() < values.size());
+        if (key.is_minimal()) {
+            values[key.idx()] = value;
+        } else {
+            values[key.idx()] = inverter(value, key);
+        }
+    }
+
+    static bool IsInvertable() {
+        return true;
+    }
+};
+
+typedef InvertableStoring DefaultStoring;
+
+template<class StoringType>
+struct StoringTypeFilter {
+};
+
+template<>
+struct StoringTypeFilter<SimpleStoring> {
+    template<class Kmer>
+    bool filter(const Kmer &/*kmer*/) const {
+        return true;
+    }
+};
+
+template<>
+struct StoringTypeFilter<InvertableStoring> {
+    template<class Kmer>
+    bool filter(const Kmer &kmer) const {
+        return kmer.IsMinimal();
+    }
+};
+
+}
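InvertableStoring above keeps a value only under the canonical (minimal) key of a k-mer/reverse-complement pair; the non-minimal view is derived through an inverter on every access. A self-contained sketch of that invert-on-access idea; CountPair and invert() are illustrative stand-ins for the value type and the v.conjugate(k) call:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Toy value type: per-strand counts for one canonical k-mer.
    struct CountPair { unsigned fwd, rc; };

    // Stand-in for the inverter: mirrors the value for the reverse-complement view.
    CountPair invert(const CountPair &v) { return {v.rc, v.fwd}; }

    int main() {
        std::vector<CountPair> values(4);        // one slot per canonical k-mer
        size_t idx = 2;                          // both strands share this slot
        values[idx] = {5, 1};                    // write via the canonical view
        CountPair as_rc = invert(values[idx]);   // non-minimal view inverts on read
        assert(as_rc.fwd == 1 && as_rc.rc == 5);
        return 0;
    }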
diff -Nru spades-3.10.1+dfsg/src/common/utils/ph_map/values.hpp spades-3.11.1+dfsg/src/common/utils/ph_map/values.hpp
--- spades-3.10.1+dfsg/src/common/utils/ph_map/values.hpp	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/ph_map/values.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -0,0 +1,97 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+/*
+ * values.hpp
+ *
+ *  Created on: Nov 7, 2013
+ *      Author: anton
+ */
+
+#include <vector>
+
+namespace utils {
+
+template<class V>
+class ValueArray {
+    static const size_t InvalidIdx = SIZE_MAX;
+public:
+    typedef size_t IdxType;
+    typedef V ValueType;
+
+protected:
+    typedef std::vector<V> StorageT;
+    StorageT data_;
+
+    void resize(size_t size) {
+        data_.resize(size);
+    }
+
+public:
+    typedef typename StorageT::iterator value_iterator;
+    typedef typename StorageT::const_iterator const_value_iterator;
+
+    ValueArray() {
+    }
+
+    ~ValueArray() {
+    }
+
+    void clear() {
+        data_.clear();
+        StorageT().swap(data_);
+    }
+
+    const V &operator[](size_t idx) const {
+        return data_[idx];
+    }
+
+    V &operator[](size_t idx) {
+        return data_[idx];
+    }
+
+public:
+    size_t size() const {
+        return data_.size();
+    }
+
+    value_iterator value_begin() {
+        return data_.begin();
+    }
+    const_value_iterator value_begin() const {
+        return data_.begin();
+    }
+    const_value_iterator value_cbegin() const {
+        return data_.cbegin();
+    }
+    value_iterator value_end() {
+        return data_.end();
+    }
+    const_value_iterator value_end() const {
+        return data_.end();
+    }
+    const_value_iterator value_cend() const {
+        return data_.cend();
+    }
+
+    template<class Writer>
+    void BinWrite(Writer &writer) const {
+        size_t sz = data_.size();
+        writer.write((char*) &sz, sizeof(sz));
+        writer.write((char*) &data_[0], sz * sizeof(data_[0]));
+    }
+
+    template<class Reader>
+    void BinRead(Reader &reader, const std::string &) {
+        clear();
+        size_t sz = 0;
+        reader.read((char*) &sz, sizeof(sz));
+        data_.resize(sz);
+        reader.read((char*) &data_[0], sz * sizeof(data_[0]));
+    }
+};
+
+}
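ValueArray::BinWrite/BinRead above serialize the storage as a length prefix followed by the raw element bytes, which is only safe for trivially copyable value types. A round-trip sketch of the same layout, using a plain std::vector and a std::stringstream standing in for the Writer/Reader:

    #include <cassert>
    #include <cstdint>
    #include <sstream>
    #include <vector>

    int main() {
        std::vector<uint32_t> out = {7, 42, 1000}, in;
        std::stringstream buf;

        // Write: length prefix, then packed element bytes.
        size_t sz = out.size();
        buf.write((char*) &sz, sizeof(sz));
        buf.write((char*) out.data(), sz * sizeof(out[0]));

        // Read: recover the length, resize, then read the bytes back.
        buf.read((char*) &sz, sizeof(sz));
        in.resize(sz);
        buf.read((char*) in.data(), sz * sizeof(in[0]));

        assert(in == out);
        return 0;
    }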
diff -Nru spades-3.10.1+dfsg/src/common/utils/range.hpp spades-3.11.1+dfsg/src/common/utils/range.hpp
--- spades-3.10.1+dfsg/src/common/utils/range.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/range.hpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,92 +0,0 @@
-#pragma once
-
-#include "utils/verify.hpp"
-
-namespace omnigraph {
-
-struct Range {
-private:
-    bool inside(size_t left, size_t right, size_t point) const {
-        return left <= point && point <= right;
-    }
-
-public:
-    //inclusive
-    size_t start_pos;
-    //exclusive
-    size_t end_pos;
-
-    size_t size() const {
-        VERIFY(end_pos >= start_pos);
-        return end_pos - start_pos;
-    }
-
-    void shift(int shift) {
-        VERIFY(shift > 0 || size_t(-shift) <= start_pos);
-        start_pos += shift;
-        end_pos += shift;
-    }
-
-    Range(): start_pos(0), end_pos(0) {
-        VERIFY(end_pos >= start_pos);
-    }
-
-    Range(size_t start_pos, size_t end_pos)
-            : start_pos(start_pos),
-              end_pos(end_pos) {
-        VERIFY(end_pos >= start_pos);
-    }
-
-    bool operator<(const Range &other) const {
-        if (start_pos != other.start_pos)
-            return start_pos < other.start_pos;
-        return end_pos < other.end_pos;
-    }
-
-    bool contains(const Range& that) const {
-        return start_pos <= that.start_pos && end_pos >= that.end_pos;
-    }
-
-    Range Merge(const Range &other) const {
-        return Range(this->start_pos, other.end_pos);
-    }
-
-    Range Invert(size_t base_length) const {
-        VERIFY(base_length >= end_pos);
-        return Range(base_length - end_pos, base_length - start_pos);
-    }
-
-    Range& operator=(const Range& other) {
-        start_pos = other.start_pos;
-        end_pos = other.end_pos;
-        return *this;
-    }
-
-    bool empty() const {
-        return start_pos == end_pos;
-    }
-
-    bool Intersect(const Range &other) const {
-        return inside(start_pos, end_pos, other.start_pos) || inside(start_pos, end_pos, other.end_pos) ||
-                inside(other.start_pos, other.end_pos, start_pos);
-    }
-
-    bool IntersectLeftOf(const Range &other) const {
-        return inside(start_pos, end_pos, other.start_pos) && inside(other.start_pos, other.end_pos, end_pos);
-    }
-
-    bool operator==(const Range &that) const {
-        return start_pos == that.start_pos && end_pos == that.end_pos;
-    }
-
-    bool operator!=(const Range &that) const {
-        return !(*this == that);
-    }
-};
-
-inline std::ostream& operator<<(std::ostream& os, const Range& range) {
-    os << "[" << (range.start_pos + 1) << " - " << range.end_pos << "]";
-    return os;
-}
-
-}
diff -Nru spades-3.10.1+dfsg/src/common/utils/segfault_handler.hpp spades-3.11.1+dfsg/src/common/utils/segfault_handler.hpp
--- spades-3.10.1+dfsg/src/common/utils/segfault_handler.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/segfault_handler.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -11,8 +11,11 @@
 #include "utils/stacktrace.hpp"
 #include "boost/noncopyable.hpp"
 
+#include <functional>
 #include <signal.h>
 
+namespace utils {
+
 struct segfault_handler : boost::noncopyable {
     typedef std::function<void(int)> callback_t;
 
@@ -56,3 +59,5 @@
   private:
     seg_handler_t old_func_;
 };
+
+}
diff -Nru spades-3.10.1+dfsg/src/common/utils/simple_tools.hpp spades-3.11.1+dfsg/src/common/utils/simple_tools.hpp
--- spades-3.10.1+dfsg/src/common/utils/simple_tools.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/simple_tools.hpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,189 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * simple_tools.hpp
- *
- *  Created on: 27.05.2011
- *      Author: vyahhi
- */
-
-#ifndef SIMPLE_TOOLS_HPP_
-#define SIMPLE_TOOLS_HPP_
-
-#include <string>
-#include <sstream>
-#include <iostream>
-
-#include "utils/verify.hpp"
-#include "io/reads/ireader.hpp"
-#include "utils/path_helper.hpp"
-#include <memory>
-#include <string>
-#include <set>
-#include <vector>
-
-/**
- * Converts anything to string (using ostringstream).
- */
-template<typename T>
-std::string ToString(const T& t) {
-    std::ostringstream ss;
-    ss << t;
-    return ss.str();
-}
-
-template<typename T>
-std::string ToString(const T& t, size_t length) {
-    std::ostringstream ss;
-    ss << t;
-    std::string result = ss.str();
-    while (result.size() < length)
-        result = "0" + result;
-    return result;
-}
-
-template<typename T>
-std::string ToString(std::vector<T>& t) {
-    std::ostringstream ss;
-    ss << "Size " << t.size() << ": [";
-    for (auto it = t.begin(); it != t.end(); ++it)
-        ss << *it << ", ";
-    ss << "]";
-    return ss.str();
-}
-
-template<typename T>
-std::string ToString(std::set<T>& t) {
-    std::ostringstream ss;
-    ss << "Size " << t.size() << ": [";
-    for (auto it = t.begin(); it != t.end(); ++it)
-        ss << *it << ", ";
-    ss << "]";
-    return ss.str();
-}
-
-template<typename T1, typename T2>
-inline const std::pair<T2, T1> ReversePair(std::pair<T1, T2> ep) {
-    return std::pair<T2, T1>(ep.second, ep.first);
-}
-
-template<class ContainerT1, class ContainerT2>
-void push_back_all(ContainerT1& target, const ContainerT2& to_insert) {
-    target.insert(target.end(), to_insert.begin(), to_insert.end());
-}
-
-template<class ContainerT1, class ContainerT2>
-void insert_all(ContainerT1& target, const ContainerT2& to_insert) {
-    target.insert(to_insert.begin(), to_insert.end());
-}
-
-template<class MapT>
-std::set<typename MapT::key_type> key_set(const MapT& m) {
-    std::set<typename MapT::key_type> answer;
-    for (auto it = m.begin(); it != m.end(); ++it) {
-        answer.insert(it->first);
-    }
-    return answer;
-}
-
-template<class MapT>
-std::set<typename MapT::mapped_type> value_set(const MapT& m) {
-    std::set<typename MapT::mapped_type> answer;
-    for (auto it = m.begin(); it != m.end(); ++it) {
-        answer.insert(it->second);
-    }
-    return answer;
-}
-
-template<class MapT>
-const typename MapT::mapped_type& get(const MapT& from, const typename MapT::key_type& key) {
-    auto it = from.find(key);
-    VERIFY(it != from.end());
-    return it->second;
-}
-
-template<class MapT>
-typename MapT::mapped_type& get(MapT& from, const typename MapT::key_type& key) {
-    auto it = from.find(key);
-    VERIFY(it != from.end());
-    return it->second;
-}
-
-template<class MMapT>
-const std::vector<typename MMapT::mapped_type> get_all(const MMapT& from, const typename MMapT::key_type& key) {
-    std::vector<typename MMapT::mapped_type> answer;
-    for (auto it = from.lower_bound(key); it != from.upper_bound(key); ++it) {
-        answer.push_back(it->second);
-    }
-    return answer;
-}
-
-class TmpFolderFixture
-{
-    std::string tmp_folder_;
-
-public:
-    TmpFolderFixture(std::string tmp_folder = "tmp") :
-        tmp_folder_(tmp_folder)
-    {
-        path::make_dirs(tmp_folder_);
-    }
-
-    ~TmpFolderFixture()
-    {
-        path::remove_dir(tmp_folder_);
-    }
-};
-
-namespace std
-{
-template<typename T1, typename T2>
-std::ostream& operator<< (std::ostream& os, std::pair<T1, T2> const& pair)
-{
-    return os << "(" << pair.first << ", " << pair.second << ")";
-}
-//}
-
-//namespace omnigraph
-//{
-template<typename T>
-std::ostream& operator<< (std::ostream& os, const std::vector<T>& v)
-{
-    os << "[";
-    std::string delim = "";
-    for (auto it = v.begin(); it != v.end(); ++it) {
-        os << delim << *it;
-        delim = ", ";
-    }
-//    std::copy(v.begin(), v.end(), std::ostream_iterator<T>(os, ", "));
-    os << "]";
-    return os;
-}
-
-template<typename T>
-std::ostream& operator<< (std::ostream& os, const std::set<T>& set)
-{
-    os << "{";
-    bool delim = false;
-    for (const auto& i : set) {
-        if (delim) os << ", ";
-        os << i;
-        delim = true;
-    }
-    os << "}";
-    return os;
-}
-
-}
-
-template<typename Base, typename T>
-inline bool instanceof(const T *ptr) {
-    return dynamic_cast<const Base*>(ptr) != nullptr;
-}
-
-#endif /* SIMPLE_TOOLS_HPP_ */
diff -Nru spades-3.10.1+dfsg/src/common/utils/stacktrace.hpp spades-3.11.1+dfsg/src/common/utils/stacktrace.hpp
--- spades-3.10.1+dfsg/src/common/utils/stacktrace.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/stacktrace.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -16,16 +16,17 @@
 #include <execinfo.h>
 #include <iostream>
 
-inline void print_stacktrace()
-{
+namespace utils {
+
+inline void print_stacktrace() {
     std::cout << "=== Stack Trace ===" << std::endl;
 
     const size_t max_stack_size = 1000;
 
-    void* stack_pointers[max_stack_size];
+    void *stack_pointers[max_stack_size];
     int count = backtrace(stack_pointers, max_stack_size);
 
-    char** func_names = backtrace_symbols(stack_pointers, count);
+    char **func_names = backtrace_symbols(stack_pointers, count);
 
     // Print the stack trace
     for (int i = 0; i < count; ++i)
@@ -34,3 +35,5 @@
     // Free the string pointers
     free(func_names);
 }
+
+}
diff -Nru spades-3.10.1+dfsg/src/common/utils/standard_base.hpp spades-3.11.1+dfsg/src/common/utils/standard_base.hpp
--- spades-3.10.1+dfsg/src/common/utils/standard_base.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/standard_base.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -87,16 +87,16 @@
 #include "utils/stacktrace.hpp"
 
 // path manipulation instead of boost filesystem
-#include "utils/path_helper.hpp"
-using path::make_dir;
-using path::remove_dir;
+#include "filesystem/path_helper.hpp"
+using fs::make_dir;
+using fs::remove_dir;
 
 #ifndef NDEBUG
 namespace boost {
 inline void assertion_failed(char const * expr, char const * function,
                              char const * file, long line) {
     std::cerr << "Aborted by assert: " << std::endl;
-    print_stacktrace();
+    utils::print_stacktrace();
 #if __DARWIN_UNIX03
     __assert_rtn (expr, file, (int)line, function);
 #elif __DARWIN
@@ -110,7 +110,7 @@
                              char const * function, char const * file, long line) {
     std::cerr << "Aborted by assert: " << msg << std::endl;
-    print_stacktrace();
+    utils::print_stacktrace();
 #if __DARWIN_UNIX03
     __assert_rtn (expr, file, (int)line, function);
 #elif __DARWIN
diff -Nru spades-3.10.1+dfsg/src/common/utils/stl_utils.hpp spades-3.11.1+dfsg/src/common/utils/stl_utils.hpp
--- spades-3.10.1+dfsg/src/common/utils/stl_utils.hpp	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/stl_utils.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -0,0 +1,141 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <string>
+#include <sstream>
+#include <iostream>
+
+#include "utils/verify.hpp"
+#include "io/reads/ireader.hpp"
+#include "filesystem/path_helper.hpp"
+#include <functional>
+#include <set>
+#include <vector>
+#include <map>
+
+namespace utils {
+
+template<class Container>
+std::string ContainerToString(const Container &c) {
+    std::ostringstream ss;
+    ss << "Size " << c.size() << ": [";
+    for (const auto &el : c)
+        ss << el << ", ";
+    ss << "]";
+    return ss.str();
+}
+
+template<typename T1, typename T2>
+inline const std::pair<T2, T1> ReversePair(std::pair<T1, T2> ep) {
+    return std::pair<T2, T1>(ep.second, ep.first);
+}
+
+template<class ContainerT1, class ContainerT2>
+void push_back_all(ContainerT1 &target, const ContainerT2 &to_insert) {
+    target.insert(target.end(), to_insert.begin(), to_insert.end());
+}
+
+template<class ContainerT1, class ContainerT2>
+void insert_all(ContainerT1 &target, const ContainerT2 &to_insert) {
+    target.insert(to_insert.begin(), to_insert.end());
+}
+
+template<class MapT>
+std::set<typename MapT::key_type> key_set(const MapT &m) {
+    std::set<typename MapT::key_type> answer;
+    for (auto it = m.begin(); it != m.end(); ++it) {
+        answer.insert(it->first);
+    }
+    return answer;
+}
+
+template<class MapT>
+std::set<typename MapT::mapped_type> value_set(const MapT &m) {
+    std::set<typename MapT::mapped_type> answer;
+    for (auto it = m.begin(); it != m.end(); ++it) {
+        answer.insert(it->second);
+    }
+    return answer;
+}
+
+template<class MapT>
+const typename MapT::mapped_type &get(const MapT &from, const typename MapT::key_type &key) {
+    auto it = from.find(key);
+    VERIFY(it != from.end());
+    return it->second;
+}
+
+template<class MapT>
+typename MapT::mapped_type &get(MapT &from, const typename MapT::key_type &key) {
+    auto it = from.find(key);
+    VERIFY(it != from.end());
+    return it->second;
+}
+
+template<class MMapT>
+const std::vector<typename MMapT::mapped_type> get_all(const MMapT &from, const typename MMapT::key_type &key) {
+    std::vector<typename MMapT::mapped_type> answer;
+    for (auto it = from.lower_bound(key); it != from.upper_bound(key); ++it) {
+        answer.push_back(it->second);
+    }
+    return answer;
+}
+
+template<class Container, class F = std::function<std::string (typename Container::value_type)>>
+std::string join(const Container &c,
+                 const std::string &delim = ", ",
+                 F str_f = [] (typename Container::value_type t) { return std::to_string(t); }) {
+    std::stringstream ss;
+    std::string d = "";
+    for (const auto &item : c) {
+        ss << d << str_f(item);
+        d = delim;
+    }
+    return ss.str();
+}
+
+}
+
+namespace std {
+template<typename T1, typename T2>
+std::ostream &operator<<(std::ostream &os, std::pair<T1, T2> const &pair) {
+    return os << "(" << pair.first << ", " << pair.second << ")";
+}
+//}
+
+//namespace omnigraph
+//{
+template<typename T>
+std::ostream &operator<<(std::ostream &os, const std::vector<T> &v) {
+    os << "[";
+    std::string delim = "";
+    for (auto it = v.begin(); it != v.end(); ++it) {
+        os << delim << *it;
+        delim = ", ";
+    }
+//    std::copy(v.begin(), v.end(), std::ostream_iterator<T>(os, ", "));
+    os << "]";
+    return os;
+}
+
+template<typename T>
+std::ostream &operator<<(std::ostream &os, const std::set<T> &set) {
+    os << "{";
+    bool delim = false;
+    for (const auto &i : set) {
+        if (delim) os << ", ";
+        os << i;
+        delim = true;
+    }
+    os << "}";
+    return os;
+}
+
+}
+
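utils::join() above folds a container into a delimited string through a customizable stringifier; per the reconstruction above, the default is std::to_string on each element (the exact default template parameter is an assumption). A usage sketch, assuming stl_utils.hpp is on the include path:

    #include <iostream>
    #include <string>
    #include <vector>
    #include "utils/stl_utils.hpp"  // hypothetical include path

    int main() {
        std::vector<int> v = {1, 2, 3};
        // Default delimiter and stringifier.
        std::cout << utils::join(v) << std::endl;  // prints: 1, 2, 3
        // Custom delimiter and stringifier.
        std::cout << utils::join(v, "-", [](int x) { return std::string(x, '*'); })
                  << std::endl;                    // prints: *-**-***
        return 0;
    }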
diff -Nru spades-3.10.1+dfsg/src/common/utils/verify.hpp spades-3.11.1+dfsg/src/common/utils/verify.hpp
--- spades-3.10.1+dfsg/src/common/utils/verify.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/utils/verify.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -7,7 +7,6 @@
 #pragma once
 
 #include "utils/stacktrace.hpp"
-#include "boost/current_function.hpp"
 #include <sstream>
 #include <iostream>
 #include <cassert>
@@ -15,15 +14,15 @@
 #define VERIFY(expr) \
     do { \
         if(!(expr))\
-            print_stacktrace();\
+            utils::print_stacktrace();\
        assert(expr); \
     } while(0);
 
 #define VERIFY_MSG(expr, msg) \
     if (!(expr)) { \
         std::stringstream ss; \
-        print_stacktrace();\
-        ss << "Verification of expression '" << #expr << "' failed in function '" <<  BOOST_CURRENT_FUNCTION << \
+        utils::print_stacktrace();\
+        ss << "Verification of expression '" << #expr << "' failed in function '" <<  __PRETTY_FUNCTION__ << \
         "'. In file '" << __FILE__ << "' on line " << __LINE__ << ". Message '" << msg << "'." ; \
         std::cout << ss.str() << std::endl; \
         std::cerr << ss.str() << std::endl; \
diff -Nru spades-3.10.1+dfsg/src/common/visualization/graph_labeler.hpp spades-3.11.1+dfsg/src/common/visualization/graph_labeler.hpp
--- spades-3.10.1+dfsg/src/common/visualization/graph_labeler.hpp	2017-02-28 14:55:35.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/visualization/graph_labeler.hpp	2017-09-28 09:05:08.000000000 +0000
@@ -7,9 +7,9 @@
 
 #pragma once
 
-#include "utils/simple_tools.hpp"
+#include "utils/stl_utils.hpp"
 #include "utils/standard_base.hpp"
-#include "common/assembly_graph/handlers/edges_position_handler.hpp"
+#include "assembly_graph/handlers/edges_position_handler.hpp"
 
 namespace visualization {
 
@@ -163,7 +163,7 @@
     LengthGraphLabeler(const Graph &g) : base(g) {}
 
     /*virtual*/ std::string label(EdgeId e) const {
-        return ToString(this->graph().length(e));
+        return std::to_string(this->graph().length(e));
     }
 };
 
@@ -178,7 +178,7 @@
 
     std::string label(EdgeId e) const {
         double coverage = this->graph().coverage(e);
-        return " {Cov:" + ToString(coverage) + "}";
+        return " {Cov:" + std::to_string(coverage) + "}";
     }
 };
 
@@ -287,7 +287,7 @@
     }
 
     virtual std::string label(VertexId vertexId) const {
-        return ToString(vertexId.int_id());
+        return std::to_string(vertexId.int_id());
     }
 
     virtual std::string label(EdgeId edgeId) const {
@@ -296,7 +296,7 @@
             ret_label += "Positions:\\n" + edges_positions_.str(edgeId);
         size_t len = g_.length(edgeId);
         double cov = g_.coverage(edgeId);
-        ret_label += "Len(cov): " + ToString(len) + "(" + ToString(cov) + ")";
+        ret_label += "Len(cov): " + std::to_string(len) + "(" + std::to_string(cov) + ")";
         return ret_label;
     }
 
diff -Nru spades-3.10.1+dfsg/src/common/visualization/graph_print_utils.hpp spades-3.11.1+dfsg/src/common/visualization/graph_print_utils.hpp
--- spades-3.10.1+dfsg/src/common/visualization/graph_print_utils.hpp	2017-02-28 14:55:35.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/visualization/graph_print_utils.hpp	2017-09-28 09:05:08.000000000 +0000
@@ -247,9 +247,9 @@
     string constructTableEntry(SingleVertex v/*, const string &label, const string &href*/) {
         stringstream ss;
         ss << "<tr>";
-        ss << constructPortCell(ToString(v.id_) + "_in", v.href_, v.fill_color_);
+        ss << constructPortCell(std::to_string(v.id_) + "_in", v.href_, v.fill_color_);
         ss << constructLabelCell(v.label_, v.href_, v.fill_color_);
-        ss << constructPortCell(ToString(v.id_) + "_out", v.href_, v.fill_color_);
+        ss << constructPortCell(std::to_string(v.id_) + "_out", v.href_, v.fill_color_);
         ss << "</tr>\n";
         return ss.str();
     }
@@ -257,9 +257,9 @@
     string constructReverceTableEntry(SingleVertex v/*, const string &label, const string &href*/) {
         stringstream ss;
         ss << "<tr>";
-        ss << constructPortCell(ToString(v.id_) + "_out", v.href_, v.fill_color_);
+        ss << constructPortCell(std::to_string(v.id_) + "_out", v.href_, v.fill_color_);
         ss << constructLabelCell(v.label_, v.href_, v.fill_color_);
-        ss << constructPortCell(ToString(v.id_) + "_in", v.href_, v.fill_color_);
+        ss << constructPortCell(std::to_string(v.id_) + "_in", v.href_, v.fill_color_);
         ss << "</tr>\n";
         return ss.str();
     }
diff -Nru spades-3.10.1+dfsg/src/common/visualization/position_filler.hpp spades-3.11.1+dfsg/src/common/visualization/position_filler.hpp
--- spades-3.10.1+dfsg/src/common/visualization/position_filler.hpp	2017-02-28 14:55:35.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/visualization/position_filler.hpp	2017-09-28 09:05:08.000000000 +0000
@@ -7,7 +7,7 @@
 
 #pragma once
 
-#include "common/modules/alignment/sequence_mapper.hpp"
+#include "modules/alignment/sequence_mapper.hpp"
 #include "assembly_graph/handlers/edges_position_handler.hpp"
 #include "io/reads/wrapper_collection.hpp"
 #include "io/reads/io_helper.hpp"
@@ -19,44 +19,49 @@
 template<class Graph>
 class PosFiller {
     typedef typename Graph::EdgeId EdgeId;
-    typedef std::shared_ptr<SequenceMapper<Graph>> MapperPtr;
+    typedef std::shared_ptr<debruijn_graph::SequenceMapper<Graph>> MapperPtr;
     const Graph &g_;
     MapperPtr mapper_;
     omnigraph::EdgesPositionHandler<Graph> &edge_pos_;
 
 public:
+    typedef omnigraph::MappingPath<EdgeId> MappingPath;
+
     PosFiller(const Graph &g, MapperPtr mapper,
               omnigraph::EdgesPositionHandler<Graph> &edge_pos) :
             g_(g), mapper_(mapper), edge_pos_(edge_pos) {
     }
 
-    void Process(const Sequence &s, string name) const {
-        //todo stupid conversion!
-        return Process(io::SingleRead(name, s.str()));
+    MappingPath Process(const string &s, const string &name) const {
+        return Process(io::SingleRead(name, s));
+    }
+
+    MappingPath Process(const Sequence &s, const string &name) const {
+        return Process(s.str(), name);
     }
 
-    void Process(const io::SingleRead &read) const {
-        omnigraph::MappingPath<EdgeId> path = mapper_->MapRead(read);
+    MappingPath Process(const io::SingleRead &read) const {
+        MappingPath path = mapper_->MapRead(read);
         const string name = read.name();
         int cur_pos = 0;
-        TRACE("Contig " << name << " mapped on " << path.size()
-              << " fragments.");
+        TRACE("Contig " << name << " mapped on " << path.size() << " fragments.");
         for (size_t i = 0; i < path.size(); i++) {
             EdgeId ei = path[i].first;
             omnigraph::MappingRange mr = path[i].second;
             int len = (int) (mr.mapped_range.end_pos - mr.mapped_range.start_pos);
-            if (i > 0) if (path[i - 1].first != ei) if (g_.EdgeStart(ei) != g_.EdgeEnd(path[i - 1].first)) {
-                TRACE(
-                        "Contig " << name
-                        << " mapped on not adjacent edge. Position in contig is "
-                        << path[i - 1].second.initial_range.start_pos
-                           + 1
-                        << "--"
-                        << path[i - 1].second.initial_range.end_pos
-                        << " and "
-                        << mr.initial_range.start_pos + 1
-                        << "--" << mr.initial_range.end_pos);
+            if (i > 0 &&
+                path[i - 1].first != ei &&
+                g_.EdgeStart(ei) != g_.EdgeEnd(path[i - 1].first)) {
+                TRACE("Contig " << name
+                      << " mapped on not adjacent edge. Position in contig is "
+                      << path[i - 1].second.initial_range.start_pos
+                         + 1
+                      << "--"
+                      << path[i - 1].second.initial_range.end_pos
+                      << " and "
+                      << mr.initial_range.start_pos + 1
+                      << "--" << mr.initial_range.end_pos);
             }
             edge_pos_.AddEdgePosition(ei, name, mr.initial_range.start_pos,
                                       mr.initial_range.end_pos,
@@ -64,6 +69,7 @@
                                       mr.mapped_range.end_pos);
             cur_pos += len;
         }
+        return path;
     }
 
     void Process(io::SingleStream &stream) const {
@@ -79,7 +85,7 @@
 };
 
 template<class gp_t>
-void FillPos(gp_t &gp, const string &contig_file, string prefix, bool with_rc = false) {
+void FillPos(gp_t &gp, const string &contig_file, string prefix, bool with_rc) {
     PosFiller<typename gp_t::graph_t> pos_filler(gp.g, debruijn_graph::MapperInstance(gp), gp.edge_pos);
     auto irs = std::make_shared<io::PrefixAddingReaderWrapper>(io::EasyStream(contig_file, with_rc, false), prefix);
@@ -87,7 +93,7 @@
 }
 
 template<class gp_t>
-void FillPos(gp_t &gp, const Sequence &s, string name) {
+void FillPos(gp_t &gp, const string &s, string name) {
     PosFiller<typename gp_t::graph_t> pos_filler(gp.g, debruijn_graph::MapperInstance(gp), gp.edge_pos);
     pos_filler.Process(s, name);
 }
diff -Nru spades-3.10.1+dfsg/src/common/visualization/visualization_utils.hpp spades-3.11.1+dfsg/src/common/visualization/visualization_utils.hpp
--- spades-3.10.1+dfsg/src/common/visualization/visualization_utils.hpp	2017-02-28 14:55:35.000000000 +0000
+++ spades-3.11.1+dfsg/src/common/visualization/visualization_utils.hpp	2017-09-28 09:05:08.000000000 +0000
@@ -40,7 +40,7 @@
                                 size_t sources) {
     vector<typename Graph::EdgeId> short_edges;
     std::string pics_folder_ =
-            output_dir + ToString(min_length) + "_" + ToString(sinks) + "_" + ToString(sources) + "_" +
+            output_dir + std::to_string(min_length) + "_" + std::to_string(sinks) + "_" + std::to_string(sources) + "_" +
             "pics_polymorphic/";
     make_dir(pics_folder_);
     INFO("Writing pics with components consisting of short edges to " + pics_folder_);
@@ -69,14 +69,14 @@
         graph_labeler::CoverageGraphLabeler<Graph> labeler2(component.g());
         graph_labeler::CompositeLabeler<Graph> compositeLabeler(labeler, labeler2);
         WriteComponentSinksSources(component,
                                    -                                   pics_folder_ + ToString(g.int_id(*component.vertices().begin()))
+                                   pics_folder_ + std::to_string(g.int_id(*component.vertices().begin()))
                                    + ".dot", visualization::graph_colorer::DefaultColorer(g),
                                    compositeLabeler);
-        INFO("Component is written to " + ToString(g.int_id(*component.vertices().begin())) + ".dot");
+        INFO("Component is written to " + std::to_string(g.int_id(*component.vertices().begin())) + ".dot");
 
//        PrintComponent(component,
//                       pics_folder_ + "ShortComponents/"
-//                       + ToString(gp.g.int_id(component.vertices_[0]))
+//                       + std::to_string(gp.g.int_id(component.vertices_[0]))
//                       + ".dot");
     }
 }
@@ -197,7 +197,7 @@
         shared_ptr<graph_colorer::GraphColorer<Graph>> resulting_colorer = make_shared<graph_colorer::CompositeGraphColorer<Graph>>(
                 colorer_, edge_colorer);
-        string fn = output_folder_ + "/edge_" + ToString(g_.int_id(e)) + add_label + ".dot";
+        string fn = output_folder_ + "/edge_" + std::to_string(g_.int_id(e)) + add_label + ".dot";
         visualization::visualization_utils::WriteComponent(omnigraph::EdgeNeighborhood(g_, e, 50, 250), fn, resulting_colorer,
                                                            labeler_);
     }
diff -Nru spades-3.10.1+dfsg/src/projects/cap/assembly_compare.hpp spades-3.11.1+dfsg/src/projects/cap/assembly_compare.hpp
--- spades-3.10.1+dfsg/src/projects/cap/assembly_compare.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/cap/assembly_compare.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -9,7 +9,7 @@
 
 #include "pipeline/graph_pack.hpp"
 #include "pipeline/graphio.hpp"
-#include "utils/simple_tools.hpp"
+#include "utils/stl_utils.hpp"
 #include "modules/simplification/cleaner.hpp"
#include "io/reads/splitting_wrapper.hpp" #include "io/reads/multifile_reader.hpp" diff -Nru spades-3.10.1+dfsg/src/projects/cap/assembly_problem_detection.hpp spades-3.11.1+dfsg/src/projects/cap/assembly_problem_detection.hpp --- spades-3.10.1+dfsg/src/projects/cap/assembly_problem_detection.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cap/assembly_problem_detection.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -189,8 +189,8 @@ // // vector IncidentEdges(VertexId v) { // vector ans; -// push_back_all(ans, gp_.g.IncomingEdges(v)); -// push_back_all(ans, gp_.g.OutgoingEdges(v)); +// utils::push_back_all(ans, gp_.g.IncomingEdges(v)); +// utils::push_back_all(ans, gp_.g.OutgoingEdges(v)); // return ans; // } // @@ -407,7 +407,7 @@ // CompositeLabeler labeler(basic_labeler, pos_labeler); // GraphComponent component = omnigraph::EdgeNeighborhood(g_, e); // auto colorer = coloring_.ConstructColorer(component); -// visualization::visualization_utils::WriteComponent(component, folder + ToString(g_.int_id(e)) + "_loc.dot", colorer, labeler); +// visualization::visualization_utils::WriteComponent(component, folder + std::to_string(g_.int_id(e)) + "_loc.dot", colorer, labeler); // } // //// bool CheckEdges(const vector& edges) { diff -Nru spades-3.10.1+dfsg/src/projects/cap/cap_commands.hpp spades-3.11.1+dfsg/src/projects/cap/cap_commands.hpp --- spades-3.10.1+dfsg/src/projects/cap/cap_commands.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cap/cap_commands.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -11,7 +11,7 @@ #include "cap_environment_manager.hpp" #include "mosaic.hpp" #include "io/reads/sequence_reader.hpp" -#include "utils/path_helper.hpp" +#include "utils/path/path_helper.hpp" namespace online_visualization { @@ -508,10 +508,10 @@ std::string file_from = args[1], file_to = args[2]; - path::make_full_path(file_from); - path::make_full_path(file_to); + file_from = fs::make_full_path(file_from); + file_to = fs::make_full_path(file_to); - std::string dir = path::parent_path(file_to); + std::string dir = fs::parent_path(file_to); cap::utils::MakeDirPath(dir); BlockPrinter::ConvertBlocksToGRIMM(file_from, file_to); diff -Nru spades-3.10.1+dfsg/src/projects/cap/cap_environment_manager.hpp spades-3.11.1+dfsg/src/projects/cap/cap_environment_manager.hpp --- spades-3.10.1+dfsg/src/projects/cap/cap_environment_manager.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cap/cap_environment_manager.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -119,7 +119,7 @@ template void SaveCurrentStreams(const gp_t &/* gp */, const std::string &dir) const { for (size_t i = 0; i < env_->genomes_.size(); ++i) { - std::string output_filename = dir + path::filename(env_->init_genomes_paths_[i]); + std::string output_filename = dir + fs::filename(env_->init_genomes_paths_[i]); if (!output_filename.empty()) { Contig contig; io::osequencestream out_stream(output_filename); diff -Nru spades-3.10.1+dfsg/src/projects/cap/cap_kmer_index.hpp spades-3.11.1+dfsg/src/projects/cap/cap_kmer_index.hpp --- spades-3.10.1+dfsg/src/projects/cap/cap_kmer_index.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cap/cap_kmer_index.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -10,8 +10,8 @@ #include "compare_standard.hpp" #include "longseq.hpp" #include "polynomial_hash.hpp" -#include "common/adt/kmer_map.hpp" -#include "utils/indices/edge_position_index.hpp" +#include "adt/kmer_map.hpp" +#include "assembly_graph/index/edge_position_index.hpp" 
#include "io/reads/sequence_reader.hpp" #include "utils/mph_index/base_hash.hpp" diff -Nru spades-3.10.1+dfsg/src/projects/cap/compare_standard.hpp spades-3.11.1+dfsg/src/projects/cap/compare_standard.hpp --- spades-3.10.1+dfsg/src/projects/cap/compare_standard.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cap/compare_standard.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -14,9 +14,9 @@ // utils #include "utils/cpp_utils.hpp" -#include "utils/path_helper.hpp" +#include "utils/path/path_helper.hpp" -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" // longseq #include "longseq.hpp" diff -Nru spades-3.10.1+dfsg/src/projects/cap/comparison_utils.hpp spades-3.11.1+dfsg/src/projects/cap/comparison_utils.hpp --- spades-3.10.1+dfsg/src/projects/cap/comparison_utils.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cap/comparison_utils.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -8,7 +8,7 @@ #pragma once #include "pipeline/graphio.hpp" -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" #include "assembly_graph/core/graph.hpp" #include "coordinates_handler.hpp" #include "math/xmath.h" @@ -70,7 +70,7 @@ */ inline Sequence ReadGenome(const string& filename) { - path::CheckFileExistenceFATAL(filename); + fs::CheckFileExistenceFATAL(filename); io::FileReadStream genome_stream(filename); return ReadSequence(genome_stream); } @@ -84,7 +84,7 @@ inline vector MakeReads(const vector& ss) { vector ans; for (size_t i = 0; i < ss.size(); ++i) { - ans.push_back(io::SingleRead("read_" + ToString(i), ss[i].str())); + ans.push_back(io::SingleRead("read_" + std::to_string(i), ss[i].str())); } return ans; } @@ -109,7 +109,7 @@ } inline vector ReadContigs(const string& filename) { - path::CheckFileExistenceFATAL(filename); + fs::CheckFileExistenceFATAL(filename); io::FileReadStream genome_stream(filename); return AllSequences(genome_stream); } diff -Nru spades-3.10.1+dfsg/src/projects/cap/deprecated/tools_deprecated.cpp spades-3.11.1+dfsg/src/projects/cap/deprecated/tools_deprecated.cpp --- spades-3.10.1+dfsg/src/projects/cap/deprecated/tools_deprecated.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cap/deprecated/tools_deprecated.cpp 2017-09-28 09:05:11.000000000 +0000 @@ -52,7 +52,7 @@ // string ref = "/home/snurk/Dropbox/lab/mrsa/USA300_FPR3757.fasta"; // // string ref = "assembly_comp/gingi_diff_mask/tdc60.fasta"; -// string output_folder = "assembly_comp/s60_usa300_" + ToString(K) + "/"; +// string output_folder = "assembly_comp/s60_usa300_" + std::to_string(K) + "/"; // remove_dir(output_folder); // make_dir(output_folder); diff -Nru spades-3.10.1+dfsg/src/projects/cap/diff_masking.hpp spades-3.11.1+dfsg/src/projects/cap/diff_masking.hpp --- spades-3.10.1+dfsg/src/projects/cap/diff_masking.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cap/diff_masking.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -273,7 +273,7 @@ size_t current_k = k_values.back(); k_values.pop_back(); - string root = out_root + ToString(current_k) + "/"; + string root = out_root + std::to_string(current_k) + "/"; if (utils::NeedToUseLongSeq(current_k)) { omp_set_num_threads(1); diff -Nru spades-3.10.1+dfsg/src/projects/cap/gene_analysis.hpp spades-3.11.1+dfsg/src/projects/cap/gene_analysis.hpp --- spades-3.10.1+dfsg/src/projects/cap/gene_analysis.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cap/gene_analysis.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -8,7 +8,7 @@ #pragma once 
#include "utils/standard_base.hpp" -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" #include "comparison_utils.hpp" #include "boost/tokenizer.hpp" #include "coloring.hpp" @@ -183,7 +183,7 @@ size_t id = 0; for (string name : genome_names) { string filename = genomes_folder + name; - path::CheckFileExistenceFATAL(filename); + fs::CheckFileExistenceFATAL(filename); genomes.insert( make_pair( id, @@ -199,7 +199,7 @@ void LoadGenomes(const string& file_with_genomes, const string& genomes_folder) { - path::CheckFileExistenceFATAL(file_with_genomes); + fs::CheckFileExistenceFATAL(file_with_genomes); ifstream stream(file_with_genomes); set genome_names; string name; @@ -230,7 +230,7 @@ } set LoadGeneIDs(const string& file_with_ids) { - path::CheckFileExistenceFATAL(file_with_ids); + fs::CheckFileExistenceFATAL(file_with_ids); ifstream stream(file_with_ids); set gene_ids; int id; @@ -252,7 +252,7 @@ void LoadGeneInfo(const string& filename, set gene_ids) { using boost::tokenizer; using boost::escaped_list_separator; - path::CheckFileExistenceFATAL(filename); + fs::CheckFileExistenceFATAL(filename); ifstream stream(filename); string line; while (!stream.eof()) { @@ -334,7 +334,7 @@ // const ColorHandler& coloring) { // for (auto it = gene_collection.genes.begin(); // it != gene_collection.genes.end(); ++it) { -//// make_dir(folder + ToString(it->first)); +//// make_dir(folder + std::to_string(it->first)); // const GenePositions& gene_poss = it->second.gene_positions; // // //todo improve later @@ -345,7 +345,7 @@ // total_gene_sequence = total_gene_sequence + genome.Subseq(pos.first.start_pos, pos.first.end_pos); // } // } -// WriteComponentsAlongSequence(gp, folder + ToString(it->first) + "/", +// WriteComponentsAlongSequence(gp, folder + std::to_string(it->first) + "/", // 100000, 50, total_gene_sequence, coloring); // } //} diff -Nru spades-3.10.1+dfsg/src/projects/cap/genome_correction.hpp spades-3.11.1+dfsg/src/projects/cap/genome_correction.hpp --- spades-3.10.1+dfsg/src/projects/cap/genome_correction.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cap/genome_correction.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -12,7 +12,7 @@ #include #include #include -#include "common/adt/bag.hpp" +#include "adt/bag.hpp" namespace cap { @@ -378,14 +378,14 @@ void GenPicAlongPath(const vector path, size_t cnt) { utils::MakeDirPath("ref_correction"); WriteComponentsAlongPath(g_, visualization::graph_labeler::StrGraphLabeler(g_), - "ref_correction/" + ToString(cnt) + ".dot", 100000, 10, + "ref_correction/" + std::to_string(cnt) + ".dot", 100000, 10, TrivialMappingPath(g_, path), *ConstructColorer(coloring_)); } void GenPicAroundEdge(EdgeId e, size_t cnt) { utils::MakeDirPath("ref_correction"); GraphComponent component = omnigraph::EdgeNeighborhood(g_, e, 10, 100000); - visualization::visualization_utils::WriteComponent(g_, "ref_correction/" + ToString(cnt) + ".dot", component, coloring_.GetInstance(), + visualization::visualization_utils::WriteComponent(g_, "ref_correction/" + std::to_string(cnt) + ".dot", component, coloring_.GetInstance(), visualization::graph_labeler::StrGraphLabeler(g_)); } @@ -428,7 +428,7 @@ // // CompositeLabeler labeler(basic_labeler, pos_labeler); // -// string alt_path_folder = folder_ + ToString(g_.int_id(e)) + "/"; +// string alt_path_folder = folder_ + std::to_string(g_.int_id(e)) + "/"; // make_dir(alt_path_folder); // WriteComponentsAlongPath(g_, labeler, alt_path_folder + "path.dot", /*split_length*/ // 1000, /*vertex_number*/15, 
diff -Nru spades-3.10.1+dfsg/src/projects/cap/longseq.hpp spades-3.11.1+dfsg/src/projects/cap/longseq.hpp
--- spades-3.10.1+dfsg/src/projects/cap/longseq.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/cap/longseq.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -10,9 +10,9 @@
 #include <cstdlib>
 #include <cstdint>
 #include "polynomial_hash.hpp"
-#include "utils/log.hpp"
+#include "math/log.hpp"
 #include "sequence/sequence.hpp"
-#include "utils/openmp_wrapper.h"
+#include "utils/parallel/openmp_wrapper.h"
 
 namespace cap {
diff -Nru spades-3.10.1+dfsg/src/projects/cap/main.cpp spades-3.11.1+dfsg/src/projects/cap/main.cpp
--- spades-3.10.1+dfsg/src/projects/cap/main.cpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/cap/main.cpp	2017-09-28 09:05:11.000000000 +0000
@@ -12,11 +12,11 @@
 #include "utils/segfault_handler.hpp"
 #include "utils/stacktrace.hpp"
 #include "pipeline/config_struct.hpp"
-#include "utils/simple_tools.hpp"
+#include "utils/stl_utils.hpp"
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <unistd.h>
-#include "utils/memory_limit.hpp"
+#include "utils/mem/memory_limit.hpp"
 #include "io/dataset_support/read_converter.hpp"
 
 #include "cap_online_visualizer.hpp"
diff -Nru spades-3.10.1+dfsg/src/projects/cap/mosaic.hpp spades-3.11.1+dfsg/src/projects/cap/mosaic.hpp
--- spades-3.10.1+dfsg/src/projects/cap/mosaic.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/cap/mosaic.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -9,7 +9,7 @@
 #include "io/reads/rc_reader_wrapper.hpp"
 #include "io/reads/sequence_reader.hpp"
 #include "diff_masking.hpp"
-#include "common/adt/bag.hpp"
+#include "adt/bag.hpp"
 #include "io/reads/vector_reader.hpp"
 #include "visualization/graph_colorer.hpp"
 
@@ -911,7 +911,7 @@
 vector<string> mosaic_names(size_t n) {
     vector<string> ans;
     for (size_t i = 0; i < n; ++i) {
-        ans.push_back("mosaic_" + ToString(i));
+        ans.push_back("mosaic_" + std::to_string(i));
     }
     return ans;
 }
@@ -968,8 +968,8 @@
                          50 /*numeric_limits<size_t>::max()*/);
 
-    path::remove_if_exists("mosaic_pics");
-    path::make_dir("mosaic_pics");
+    fs::remove_if_exists("mosaic_pics");
+    fs::make_dir("mosaic_pics");
     INFO("Writing components");
     visualization::visualization_utils::WriteComponents(gp.g, "mosaic_pics/", splitter,
                                                         visualization::graph_colorer::DefaultColorer(gp.g), labeler);
diff -Nru spades-3.10.1+dfsg/src/projects/cap/repeat_masking.hpp spades-3.11.1+dfsg/src/projects/cap/repeat_masking.hpp
--- spades-3.10.1+dfsg/src/projects/cap/repeat_masking.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/cap/repeat_masking.hpp	2017-09-28 09:05:11.000000000 +0000
@@ -9,7 +9,7 @@
 
 #include "sequence/nucl.hpp"
 #include "io/reads/modifying_reader_wrapper.hpp"
-#include "common/adt/bag.hpp"
+#include "adt/bag.hpp"
 #include
 #include
 #include
@@ -280,7 +280,7 @@
//        size_t iter = 0;
//        bool no_repeats = false;
//        while (iter <= max_iter_count) {
-//            string out_dir = input_dir + ToString(iter) + "/";
+//            string out_dir = input_dir + std::to_string(iter) + "/";
//            make_dir(out_dir);
//            no_repeats = MaskRepeatsIteration(input_dir, suffixes, out_dir);
//            if (no_repeats) {
@@ -292,7 +292,7 @@
//            string out_dir = input_dir + "masked/";
//            make_dir(out_dir);
//            ModifyAndSave(make_shared(),
-//                          OpenStreams(input_dir + "/" + ToString(iter) + "/", suffixes,
+//                          OpenStreams(input_dir + "/" + std::to_string(iter) + "/", suffixes,
//                                      out_dir));
//        } else {
//            WARN("Failed to mask repeats in " << max_iter_count << " iterations");
@@ -310,7 +310,7 @@
     while (iter <= max_iter_count) {
INFO("------------------------"); INFO("Iteration " << iter); - string out_dir = work_dir + ToString(iter) + "/"; + string out_dir = work_dir + std::to_string(iter) + "/"; make_dir(out_dir); no_repeats = MaskRepeatsIteration(k, input_dir, suffixes, out_dir, rand_nucl); if (no_repeats) { diff -Nru spades-3.10.1+dfsg/src/projects/cap/simple_inversion_finder.hpp spades-3.11.1+dfsg/src/projects/cap/simple_inversion_finder.hpp --- spades-3.10.1+dfsg/src/projects/cap/simple_inversion_finder.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cap/simple_inversion_finder.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -145,9 +145,9 @@ DEBUG("cycle found: " << v_list_str.str()); const std::string edge_pic_name = base_pic_file_name_ + "_" + - ToString(num_cycles_found_) + ".dot"; + std::to_string(num_cycles_found_) + ".dot"; const std::string path_pic_name = base_pic_file_name_ + "_path_" + - ToString(num_cycles_found_) + ".dot"; + std::to_string(num_cycles_found_) + ".dot"; /* PrintColoredGraphAroundEdge(g_, coloring_, edge, gp_.edge_pos, diff -Nru spades-3.10.1+dfsg/src/projects/cap/stats.hpp spades-3.11.1+dfsg/src/projects/cap/stats.hpp --- spades-3.10.1+dfsg/src/projects/cap/stats.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cap/stats.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -11,7 +11,7 @@ #include "assembly_graph/components/graph_component.hpp" #include "assembly_graph/components/splitters.hpp" #include "utils.hpp" -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" #include "comparison_utils.hpp" #include "assembly_graph/graph_support/basic_graph_stats.hpp" #include "coloring.hpp" @@ -493,8 +493,8 @@ size_t MaxRedBlueIncLength(VertexId v) { vector edges; - push_back_all(edges, g_.IncomingEdges(v)); - push_back_all(edges, g_.OutgoingEdges(v)); + utils::push_back_all(edges, g_.IncomingEdges(v)); + utils::push_back_all(edges, g_.OutgoingEdges(v)); return MaxRedBlueLength(edges); } @@ -533,7 +533,7 @@ GraphComponent component = omnigraph::EdgeNeighborhood(g_, e); visualization::visualization_utils::WriteComponent( component, - folder + prefix + ToString(g_.int_id(v)) + "_loc.dot", + folder + prefix + std::to_string(g_.int_id(v)) + "_loc.dot", coloring_.ConstructColorer(component), labeler); } @@ -586,8 +586,8 @@ ReportBreakpoint( breakpoints[i], folder, - ToString(i) + "_" - + ToString(comp.MaxRedBlueIncLength(breakpoints[i])) + std::to_string(i) + "_" + + std::to_string(comp.MaxRedBlueIncLength(breakpoints[i])) + "_"); } } @@ -700,7 +700,7 @@ visualization::graph_labeler::CompositeLabeler labeler(basic_labeler, pos_labeler); - string alt_path_folder = folder_ + ToString(g_.int_id(e)) + "/"; + string alt_path_folder = folder_ + std::to_string(g_.int_id(e)) + "/"; make_dir(alt_path_folder); WriteComponentsAlongPath(g_, labeler, alt_path_folder + "path.dot", /*split_length*/ 1000, /*vertex_number*/15, TrivialMappingPath(g_, genome_path), @@ -984,16 +984,16 @@ // for (auto it = genome_path_.begin(); it != genome_path_.end(); ++it) { // if (answer.find(*it) == answer.end()) { // curr++; -// answer[*it] = ToString(curr); -// answer[g_.conjugate(*it)] = ToString(-curr); +// answer[*it] = std::to_string(curr); +// answer[g_.conjugate(*it)] = std::to_string(-curr); // } // } // curr = 1000000; // for (auto it = g_.SmartEdgeBegin(); !it.IsEnd(); ++it) { // if (answer.find(*it) == answer.end()) { // curr++; -// answer[*it] = ToString(curr); -// answer[g_.conjugate(*it)] = ToString(-curr); +// answer[*it] = std::to_string(curr); +// 
+//                answer[g_.conjugate(*it)] = std::to_string(-curr);
//            }
//        }
//        curr = 1000000;
//        for (auto it = g_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
//            if (answer.find(*it) == answer.end()) {
//                curr++;
-//                answer[*it] = ToString(curr);
-//                answer[g_.conjugate(*it)] = ToString(-curr);
+//                answer[*it] = std::to_string(curr);
+//                answer[g_.conjugate(*it)] = std::to_string(-curr);
//            }
//        }
//        return answer;
@@ -1496,7 +1496,7 @@
             if (location.first) {
                 locality = !locality;
             }
-            ReportLocality(locality, output_dir_ + ToString(i) + ".dot");
+            ReportLocality(locality, output_dir_ + std::to_string(i) + ".dot");
         }
     }
 };
 }
diff -Nru spades-3.10.1+dfsg/src/projects/cclean/adapter_index.cpp spades-3.11.1+dfsg/src/projects/cclean/adapter_index.cpp
--- spades-3.10.1+dfsg/src/projects/cclean/adapter_index.cpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/cclean/adapter_index.cpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,50 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "adapter_index.hpp"
-#include "io/read_processor.hpp"
-#include "valid_kmer_generator.hpp"
-
-#include "io/ireadstream.hpp"
-#include "config_struct_cclean.hpp"
-
-#include <string>
-
-using namespace cclean;
-
-void AdapterIndexBuilder::FillAdapterIndex(const std::string &db, AdapterIndex &data) {
-  data.clear();
-
-  INFO("Reading adapter database from " << db);
-  ireadstream irs(db);
-  while (!irs.eof()) {
-    Read r;
-    irs >> r;
-    const std::string &seq = r.getSequenceString();
-
-    data.seqs_.push_back(seq);
-    data.seqs_.push_back(ReverseComplement(seq));
-  }
-
-  INFO("Filling adapter index");
-  for (size_t i = 0, e = data.seqs_.size(); i != e; ++i) {
-    const std::string &seq = data.seqs_[i];
-    ValidKMerGenerator<cclean::K> gen(seq.c_str(), NULL, seq.size());
-
-    while (gen.HasMore()) {
-      KMer kmer = gen.kmer();
-
-      auto& entry = data.index_[kmer];
-      entry.insert(i);
-
-      gen.Next();
-    }
-  }
-
-  INFO("Done. Total " << data.seqs_.size() << " adapters processed. Total "
-       << data.index_.size() << " unique k-mers.");
-}
diff -Nru spades-3.10.1+dfsg/src/projects/cclean/adapter_index.hpp spades-3.11.1+dfsg/src/projects/cclean/adapter_index.hpp
--- spades-3.10.1+dfsg/src/projects/cclean/adapter_index.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/cclean/adapter_index.hpp	1970-01-01 00:00:00.000000000 +0000
@@ -1,61 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//*************************************************************************** - -#ifndef CCLEAN_ADAPTERINDEX_HPP -#define CCLEAN_ADAPTERINDEX_HPP - -#include "sequence/seq.hpp" -#include "utils/mph_index/kmer_index.hpp" - -#include -#include -#include - -namespace cclean { -const unsigned K = 10; -typedef Seq KMer; - -class AdapterIndex { - typedef std::set IndexValueType; - std::unordered_map index_; - - public: - AdapterIndex() {} - - void clear() { - index_.clear(); - seqs_.clear(); - } - IndexValueType& operator[](cclean::KMer s) { return index_[s]; } - auto find(cclean::KMer s) const -> decltype(index_.find(s)) { return index_.find(s); } - auto end() const -> decltype(index_.end()) { return index_.end(); } - - bool contains(cclean::KMer s) const { - return index_.find(s) != index_.end(); - } - const std::string& seq(size_t idx) const { return seqs_[idx]; } - - private: - std::vector seqs_; - - friend class AdapterIndexBuilder; -}; - -class AdapterIndexBuilder { - public: - AdapterIndexBuilder() {} - - void FillAdapterIndex(const std::string &db, AdapterIndex &index); - - private: - DECL_LOGGER("Index Building"); -}; - - // end of namespace -} - -#endif // __CCLEAN__ADAPTERINDEX_HPP__ diff -Nru spades-3.10.1+dfsg/src/projects/cclean/additional.cpp spades-3.11.1+dfsg/src/projects/cclean/additional.cpp --- spades-3.10.1+dfsg/src/projects/cclean/additional.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/additional.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,69 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
-//*************************************************************************** - -#ifndef ADDITIONAL_CPP -#define ADDITIONAL_CPP - -#include "output.hpp" -#include "config_struct_cclean.hpp" -#include "io/read_processor.hpp" - - enum WorkModeType { - NONE = 0, - SINGLE_END = 1, - SINGLE_END_Q = 2, - BRUTE_SIMPLE = 3, - BRUTE_WITH_Q = 4 - }; - - constexpr double MatchScore = 0.6; - constexpr double MismatchScore = 100; - - class AbstractCclean { - // Abstract base class for cclean functors - public: - AbstractCclean(std::ostream &aligned_output, std::ostream &bed, - const std::string &db, - const WorkModeType &mode, - const unsigned mlen, - const bool full_inform = false) - :aligned_(0), full_inform_(full_inform), read_mlen_(mlen), - mismatch_threshold_(cfg::get().mismatch_threshold), - score_threshold_(cfg::get().score_treshold), - aligned_part_fraction_(cfg::get().aligned_part_fraction), - db_name_(db), mode_(mode), aligned_output_stream_(aligned_output), - bad_stream_(bed) {} - virtual Read operator()(const Read &read, bool *ok) = 0; - inline size_t aligned() { return aligned_; } - virtual ~AbstractCclean() {} - - protected: - size_t aligned_; - - const bool full_inform_; - const uint read_mlen_; - const double mismatch_threshold_; // for nonquality mode - const double score_threshold_; // for quality mode - - const double aligned_part_fraction_; - const std::string &db_name_; - const WorkModeType mode_; - - std::ostream &aligned_output_stream_; - std::ostream &bad_stream_; - // Abstract for clean functors - class AbstractCleanFunctor { - public: - inline virtual bool operator()(const Read &r, - const StripedSmithWaterman::Alignment &a, - double aligned_part, const std::string &adapter, - double *best_score) = 0; - virtual ~AbstractCleanFunctor() {} - }; - }; - -#endif // ADDITIONAL_CPP diff -Nru spades-3.10.1+dfsg/src/projects/cclean/brute_force_clean.cpp spades-3.11.1+dfsg/src/projects/cclean/brute_force_clean.cpp --- spades-3.10.1+dfsg/src/projects/cclean/brute_force_clean.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/brute_force_clean.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,97 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
-//*************************************************************************** - -#include "brute_force_clean.hpp" - -#include -#include -#include - -#include "adapter_index.hpp" -#include // Striped Smith-Waterman aligner -#include "additional.cpp" -#include "output.hpp" - -using std::string; -using std::vector; -using StripedSmithWaterman::Filter; -using StripedSmithWaterman::Aligner; -using StripedSmithWaterman::Alignment; -using cclean_output::print_alignment; -using cclean_output::print_bad; -using cclean_output::print_match; -using cclean_output::print_read; - -static inline bool is_alignment_good(const StripedSmithWaterman::Alignment& a, - const std::string& sequence, - const std::string& query, - double aligned_part_fraction) { - // Сheck that query adjoins or even overlaps the sequence edge - return (std::min(a.query_end - a.query_begin + 1, a.ref_end - a.ref_begin + 1) - / (double) query.size() > aligned_part_fraction) && - (a.ref_begin == 0 || a.ref_end == sequence.size() - 1); -} - -Read BruteForceClean::operator()(const Read &read, bool *ok) { - const string &read_name = read.getName(); - const string &seq_string = read.getSequenceString(); - Filter filter; // SSW filter - Aligner aligner; // SSW aligner - aligner.SetReferenceSequence(seq_string.c_str(), - static_cast(seq_string.size())); - Alignment alignment; - - // It can be many alignment adaps, so we searching the most probable - double best_score; - if (mode_ == BRUTE_SIMPLE) // so in both mode first overlap will initialize as best - best_score = mismatch_threshold_; - if (mode_ == BRUTE_WITH_Q) - best_score = score_threshold_; - std::string best_adapter = ""; - - // For each adapter align read and adapter - for (std::string adapt_string: adap_seqs_) { - - aligner.Align(adapt_string.c_str(), filter, &alignment); - if((*checker)(read, alignment, aligned_part_fraction_, adapt_string, - &best_score)) { - best_adapter = adapt_string; - } - } - - if (!best_adapter.empty()) { - aligner.Align(best_adapter.c_str(), filter, &alignment); - aligned_ += 1; - Read cuted_read = cclean_utils::CutRead(read, alignment.ref_begin, - alignment.ref_end); - if (full_inform_) // If user want full output -# pragma omp critical - print_alignment(aligned_output_stream_, alignment, seq_string, - best_adapter, read_name, db_name_); - - // Cuted read must be >= minimum lenght specified by arg - if (cuted_read.getSequenceString().size() >= read_mlen_) { - if (full_inform_) // If user want full output -# pragma omp critical - print_bad(bad_stream_, read_name, alignment.ref_begin, alignment.ref_end); - (*ok) = true; - return cuted_read; - } - else { - if (full_inform_) -# pragma omp critical - print_bad(bad_stream_, read_name, 0, alignment.ref_end); - (*ok) = false; - return cuted_read; - } - } - else { - // Read was not aligned with any adapter - (*ok) = true; - return read; - } -} diff -Nru spades-3.10.1+dfsg/src/projects/cclean/brute_force_clean.hpp spades-3.11.1+dfsg/src/projects/cclean/brute_force_clean.hpp --- spades-3.10.1+dfsg/src/projects/cclean/brute_force_clean.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/brute_force_clean.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,72 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
-//*************************************************************************** - -#ifndef BRUTE_FORCE_CLEAN_HPP -#define BRUTE_FORCE_CLEAN_HPP - -#include "utils.hpp" -#include "additional.cpp" - -class BruteForceClean: public AbstractCclean { - // Class that get read with oper() and clean it, if that possible - public: - BruteForceClean(std::ostream& aligned_output, - std::ostream& bed,const std::string &db, - const WorkModeType &mode, - const uint mlen, - const std::vector &gen, - const bool full_inform = false) - : AbstractCclean(aligned_output, bed, db, mode, mlen, full_inform), - adap_seqs_(gen) { - if(mode == BRUTE_SIMPLE) checker = new BruteCleanFunctor; - if(mode == BRUTE_WITH_Q) checker = new BruteQualityCleanFunctor; - } - virtual ~BruteForceClean() { delete checker; } - // ReadProcessor class put each read in this operator - virtual Read operator()(const Read &read, bool *ok); - - private: - const std::vector &adap_seqs_; - std::string best_adapter_; - AbstractCleanFunctor *checker; // Checks is adapter in read - - // Here goes functors for clean in different modes - class BruteCleanFunctor: public AbstractCleanFunctor { - virtual inline bool operator()(const Read &r, - const StripedSmithWaterman::Alignment &a, - double aligned_part, const std::string &adapter, - double *best_score) { - double cur_score = cclean_utils:: - GetMismatches(r.getSequenceString(), adapter, a); - if (cur_score < (*best_score) && - cclean_utils::is_alignment_good(a, r.getSequenceString(), adapter, - aligned_part)) { - (*best_score) = cur_score; - return true; - } - return false; - } - }; - class BruteQualityCleanFunctor: public AbstractCleanFunctor { - virtual inline bool operator()(const Read &r, - const StripedSmithWaterman::Alignment &a, - double aligned_part, const std::string &adapter, - double *best_score) { - double cur_score = cclean_utils:: - GetScoreWithQuality(a, r.getQuality().str()); - if (cur_score >= (*best_score) && - cclean_utils::is_alignment_good(a, r.getSequenceString(), adapter, - aligned_part)) { - (*best_score) = cur_score; - return true; - } - return false; - } - }; -}; - -#endif // BRUTE_FORCE_CLEAN_HPP diff -Nru spades-3.10.1+dfsg/src/projects/cclean/CMakeLists.txt spades-3.11.1+dfsg/src/projects/cclean/CMakeLists.txt --- spades-3.10.1+dfsg/src/projects/cclean/CMakeLists.txt 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/CMakeLists.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,30 +0,0 @@ -############################################################################ -# Copyright (c) 2015 Saint Petersburg State University -# Copyright (c) 2011-2014 Saint-Petersburg Academic University -# All Rights Reserved -# See file LICENSE for details. -############################################################################ - -project(cclean CXX) -aux_source_directory(. 
SRC_LIST) -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -file(GLOB ${CMAKE_CURRENT_SOURCE_DIR} - "*.hh" - "*.h" - "*.hpp" - "*.cpp" -) -add_executable(${PROJECT_NAME} ${SRC_LIST}) - -target_link_libraries(cclean ssw input cityhash ${COMMON_LIBRARIES}) - -if (SPADES_STATIC_BUILD) - set_target_properties(cclean PROPERTIES LINK_SEARCH_END_STATIC 1) -endif() - -install(TARGETS cclean - DESTINATION bin - COMPONENT runtime) -install(DIRECTORY "${SPADES_CFG_DIR}/cclean" - DESTINATION share/spades/configs - FILES_MATCHING PATTERN "*.info") diff -Nru spades-3.10.1+dfsg/src/projects/cclean/comparator.hpp spades-3.11.1+dfsg/src/projects/cclean/comparator.hpp --- spades-3.10.1+dfsg/src/projects/cclean/comparator.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/comparator.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#ifndef COMPARATOR_H_ -#define COMPARATOR_H_ - -class Compare { - public: - bool operator() (std::string * lhs, std::string * rhs) const { - return *lhs < *rhs; - } -}; - -#endif /* COMPARATOR_H_ */ diff -Nru spades-3.10.1+dfsg/src/projects/cclean/config_struct_cclean.cpp spades-3.11.1+dfsg/src/projects/cclean/config_struct_cclean.cpp --- spades-3.10.1+dfsg/src/projects/cclean/config_struct_cclean.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/config_struct_cclean.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,44 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#include "config_struct_cclean.hpp" -#include "pipeline/config_common.hpp" -#include "utils/openmp_wrapper.h" - -void load(cclean_config& cfg, const std::string &filename) { - boost::property_tree::ptree pt; - boost::property_tree::read_info(filename, pt); - - load(cfg, pt); -} - -void load(cclean_config& cfg, boost::property_tree::ptree const& pt) { - using config_common::load; - load(cfg.use_quality, pt, "use_quality"); - load(cfg.use_bruteforce, pt, "use_bruteforce"); - load(cfg.debug_information, pt, "debug_information"); - - load(cfg.score_treshold, pt, "score_treshold"); - load(cfg.mismatch_threshold, pt, "mismatch_threshold"); - load(cfg.minimum_lenght, pt, "minimum_lenght"); - load(cfg.nthreads, pt, "nthreads"); - load(cfg.aligned_part_fraction, pt, "aligned_part_fraction"); - load(cfg.buffer_size, pt, "buffer_size"); - - load(cfg.dataset_file_name, pt, "dataset"); - load(cfg.database, pt, "database"); - load(cfg.input_working_dir, pt, "input_working_dir"); - load(cfg.output_working_dir, pt, "output_working_dir"); - - std::string file_name = cfg.dataset_file_name; - cfg.dataset.load(file_name); - - // Fix number of threads according to OMP capabilities. 
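// The lines that follow are the usual OpenMP idiom for honoring a configured
// thread count: clamp it to what the runtime can actually provide, then tell
// the runtime about the result. A minimal standalone form of the same idiom
// (omp_get_max_threads/omp_set_num_threads are standard OpenMP calls):

#include <algorithm>
#include <omp.h>

unsigned ClampThreads(unsigned requested) {
    unsigned n = std::min(requested, (unsigned)omp_get_max_threads());
    omp_set_num_threads((int)n);  // inform the OpenMP runtime
    return n;
}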
- cfg.nthreads = std::min(cfg.nthreads, (unsigned)omp_get_max_threads()); - // Inform OpenMP runtime about this :) - omp_set_num_threads(cfg.nthreads); -} diff -Nru spades-3.10.1+dfsg/src/projects/cclean/config_struct_cclean.hpp spades-3.11.1+dfsg/src/projects/cclean/config_struct_cclean.hpp --- spades-3.10.1+dfsg/src/projects/cclean/config_struct_cclean.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/config_struct_cclean.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,42 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#ifndef CONFIG_STRUCT_CCLEAN_HPP -#define CONFIG_STRUCT_CCLEAN_HPP - -#include "pipeline/config_singl.hpp" -#include -#include "pipeline/library.hpp" - -struct cclean_config { - - bool use_quality; - bool use_bruteforce; - bool debug_information; - - unsigned score_treshold; - unsigned mismatch_threshold; - unsigned minimum_lenght; - unsigned nthreads; - unsigned buffer_size; - double aligned_part_fraction; - - std::string dataset_file_name; - std::string database; - std::string input_working_dir; - std::string output_working_dir; - - io::DataSet<> dataset; -}; - -// main config load function -void load(cclean_config& cfg, const std::string &filename); -void load(cclean_config& cfg, boost::property_tree::ptree const& pt); - -typedef config_common::config cfg; - -#endif diff -Nru spades-3.10.1+dfsg/src/projects/cclean/job_wrappers.cpp spades-3.11.1+dfsg/src/projects/cclean/job_wrappers.cpp --- spades-3.10.1+dfsg/src/projects/cclean/job_wrappers.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/job_wrappers.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,97 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#include - -#include "job_wrappers.hpp" -#include "utils/logger/log_writers.hpp" -#include "adapter_index.hpp" -#include "valid_kmer_generator.hpp" -#include "adapter_index.hpp" -#include "output.hpp" -#include "ssw/ssw_cpp.h" -#include "utils.hpp" - -using cclean_output::print_alignment; -using cclean_output::print_bad; -using cclean_output::print_match; -using cclean_output::print_read; - -Read SimpleClean::operator()(const Read &read, bool *ok) -{ - const std::string& name = read.getName(); - const std::string& sequence = read.getSequenceString(); - - std::set to_check; - ValidKMerGenerator gen(sequence.c_str(), NULL, sequence.size()); - while (gen.HasMore()) { - cclean::KMer kmer = gen.kmer(); - - auto it = index_.find(kmer); - if (it != index_.end()) - to_check.insert(it->second.begin(), it->second.end()); - - gen.Next(); - } - - // Try to align the artifacts for corresponding kmers - StripedSmithWaterman::Aligner aligner; - StripedSmithWaterman::Filter filter; - StripedSmithWaterman::Alignment alignment; // why it was in for loop? 
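// SimpleClean::operator() above first narrows the search space: it slides over
// every k-mer of the read, looks each one up in the adapter index, and collects
// the ids of adapters sharing at least one k-mer with the read; only those
// candidates are then aligned with the SSW aligner. The gathering step in
// isolation (a sketch with std::string k-mers, not the SPAdes API):

#include <set>
#include <string>
#include <unordered_map>

std::set<size_t> CandidateAdapters(
        const std::string &read,
        const std::unordered_map<std::string, std::set<size_t>> &index,
        size_t k) {
    std::set<size_t> to_check;
    for (size_t i = 0; i + k <= read.size(); ++i) {
        auto it = index.find(read.substr(i, k));
        if (it != index.end())
            to_check.insert(it->second.begin(), it->second.end());  // union of hits
    }
    return to_check;
}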
- aligner.SetReferenceSequence(sequence.c_str(), sequence.size()); - - // Pointer to the best-matching adapter - const std::string *best_adapter = nullptr; - double best_score; - if (mode_ == SINGLE_END) // in both modes the first overlap is taken as the initial best - best_score = mismatch_threshold_; - if (mode_ == SINGLE_END_Q) - best_score = score_threshold_; - best_adapter = nullptr; - - for (auto it = to_check.begin(), et = to_check.end(); it != et; ++it) { - const std::string &query = index_.seq(*it); - aligner.Align(query.c_str(), filter, &alignment); - // Check whether this adapter is better than the previous best - if((*checker)(read, alignment, aligned_part_fraction_, query, - &best_score)) { - best_adapter = &query; - } - } - - if (best_adapter != nullptr) { - aligner.Align(best_adapter->c_str(), filter, &alignment); - aligned_ += 1; - Read cuted_read = cclean_utils::CutRead(read, alignment.ref_begin, - alignment.ref_end); - if (full_inform_) // If the user wants full output -# pragma omp critical - print_alignment(aligned_output_stream_, alignment, sequence, - *best_adapter, name, db_name_); - - // The cut read must be >= the minimum length specified by arg - if (cuted_read.getSequenceString().size() >= read_mlen_) { - if (full_inform_) -# pragma omp critical - print_bad(bad_stream_, name, alignment.ref_begin, alignment.ref_end); - (*ok) = true; - return cuted_read; - } - else { - if (full_inform_) -# pragma omp critical - print_bad(bad_stream_, name, 0, alignment.ref_end); - (*ok) = false; - return cuted_read; - } - } - else { - // The read was not aligned with any adapter - (*ok) = true; - return read; - } -} diff -Nru spades-3.10.1+dfsg/src/projects/cclean/job_wrappers.hpp spades-3.11.1+dfsg/src/projects/cclean/job_wrappers.hpp --- spades-3.10.1+dfsg/src/projects/cclean/job_wrappers.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/job_wrappers.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,73 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details.
-//*************************************************************************** - -#ifndef JOB_WRAPERS_HPP -#define JOB_WRAPERS_HPP - -#include "additional.cpp" -#include "utils.hpp" - -namespace cclean { - class AdapterIndex; -} - -class SimpleClean: public AbstractCclean { - public: - SimpleClean(std::ostream &aligned_output, - std::ostream &bed, const std::string &db, - const WorkModeType &mode, - const unsigned mlen, - const cclean::AdapterIndex &index, - const bool full_inform = false) - : AbstractCclean(aligned_output, bed, db, mode, mlen, full_inform), - index_(index) { - if(mode_ == SINGLE_END) checker = new SimpleCleanFunctor; - if(mode_ == SINGLE_END_Q) checker = new SimpleQualityCleanFunctor; - } - virtual ~SimpleClean() { delete checker; } - virtual Read operator()(const Read &read, bool *ok); - - private: - const cclean::AdapterIndex &index_; - AbstractCleanFunctor *checker; // Checks is adapter in read - - // Here goes functors for clean in different modes - class SimpleCleanFunctor: public AbstractCleanFunctor { - virtual inline bool operator()(const Read &r, - const StripedSmithWaterman::Alignment &a, - double aligned_part, const std::string &adapter, - double *best_score) { - double cur_score = cclean_utils:: - GetMismatches(r.getSequenceString(), adapter, a); - if (cur_score < (*best_score) && - cclean_utils::is_alignment_good(a, r.getSequenceString(), adapter, - aligned_part)) { - (*best_score) = cur_score; - return true; - } - return false; - } - }; - class SimpleQualityCleanFunctor: public AbstractCleanFunctor { - virtual inline bool operator()(const Read &r, - const StripedSmithWaterman::Alignment &a, - double aligned_part, const std::string &adapter, - double *best_score) { - double cur_score = cclean_utils:: - GetScoreWithQuality(a, r.getQuality().str()); - if (cur_score >= (*best_score) && - cclean_utils::is_alignment_good(a, r.getSequenceString(), adapter, - aligned_part)) { - (*best_score) = cur_score; - return true; - } - return false; - } - }; -}; - -#endif /* JOBWRAPPERS_H_ */ diff -Nru spades-3.10.1+dfsg/src/projects/cclean/main.cpp spades-3.11.1+dfsg/src/projects/cclean/main.cpp --- spades-3.10.1+dfsg/src/projects/cclean/main.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/main.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,86 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
-//*************************************************************************** - -#include -#include -#include -#include - -#include "sequence/seq.hpp" -#include "utils/logger/log_writers.hpp" -#include "utils/memory_limit.hpp" -#include "running_modes.hpp" -#include "config_struct_cclean.hpp" -#include "utils/simple_tools.hpp" -#include "adapter_index.hpp" -#include "utils.hpp" - -#include "valid_kmer_generator.hpp" -#include "io/read_processor.hpp" -#include "modules/ssw_cpp.h" -#include "additional.cpp" - -#include "job_wrappers.hpp" -#include "brute_force_clean.hpp" - -using logging::logger; -using logging::create_logger; -using logging::console_writer; -using std::string; - -constexpr int CONFIG_FILE_ARG = 1; - -void usage() { - std::cout << "usage: cclean [program config file]" << std::endl; -} - -void create_console_logger() { - logger *lg = create_logger(""); - lg->add_writer(std::make_shared()); - attach_logger(lg); -} - -int main(int argc, char *argv[]) { - - create_console_logger(); - - if (argc < 2) { - usage(); - return EXIT_FAILURE; - } - - std::string config_file = argv[CONFIG_FILE_ARG]; - INFO("Loading config from " << config_file.c_str()); - if (!path::FileExists(config_file)) { - ERROR("File " + config_file + " doesn't exists."); - return EXIT_FAILURE; - } - cfg::create_instance(config_file); - - const std::string &database = cfg::get().database; - if (!path::FileExists(database)) { - ERROR("File " + database + " doesn't exists."); - return EXIT_FAILURE; - } - const std::string &dataset = cfg::get().dataset_file_name; - if (!path::FileExists(dataset)) { - ERROR("File " + dataset + " doesn't exists."); - return EXIT_FAILURE; - } - - clock_t start = clock(); - - Cleaner::ProcessDataset(); // Main work here - - INFO("DONE"); - clock_t ends = clock(); - INFO("Processor Time Spent: " << (double) (ends - start) / CLOCKS_PER_SEC - << " seconds."); - INFO("Goodbye!"); - - return EXIT_SUCCESS; -} diff -Nru spades-3.10.1+dfsg/src/projects/cclean/output.cpp spades-3.11.1+dfsg/src/projects/cclean/output.cpp --- spades-3.10.1+dfsg/src/projects/cclean/output.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/output.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,82 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#include -#include -#include -#include -#include "output.hpp" -#include "utils.hpp" - -namespace cclean_output { - -void print_n_times(std::ostream& output, char c, int n) { - for (int i = 0; i < n; ++i) { - output << c; - } -} - -void print_alignment(std::ostream& output, const StripedSmithWaterman::Alignment &data, - const std::string& ref, const std::string& query, - const std::string& name, const std::string& database_name) { - - output << "Alignment: input sequence (first line) " << name << " alignes " - << std::endl - << "sequence from database (last line) " << database_name << std::endl; - - std::string aligned_query, aligned_ref; - cclean_utils::RestoreFromCigar(ref, query, aligned_ref, aligned_query, data); - - // case when pattern's start pos is less than text one - int text_offset = data.ref_begin - data.query_begin < 0 ? 
data.query_begin - - data.ref_begin : 0; - - // ref = read - print_n_times(output, ' ', text_offset); - output << ref << std::endl; - print_n_times(output, ' ', text_offset + data.ref_begin); - output << aligned_ref << std::endl; - - // vertical dashes - print_n_times(output, ' ', text_offset + data.ref_begin); - for (int i = 0; i < (int)std::min(aligned_query.length(), aligned_ref.length()); ++i) { - aligned_query.at(i) == aligned_ref.at(i) ? output << "|" : output << "*"; - } - output << std::endl; - - // query = contamination - print_n_times(output, ' ', text_offset + data.ref_begin); - output << aligned_query << std::endl; - print_n_times(output, ' ', data.ref_begin - data.query_begin); - output << query << std::endl; - output << std::endl; - } - -void print_match(std::ostream& output, std::ostream& bed, std::map, Compare>& res, const std::string& name, - const std::string& seq, const std::string &db_name) { - for (auto it = res.begin(); it != res.end(); ++it) { - for (auto it_pos = it->second.begin(); it_pos != it->second.end(); ++it_pos) { - - output << "Match: input sequence (first line) " << name << " matches " - << std::endl - << "sequence from database (2nd line) " << db_name << std::endl; - - output << seq << std::endl; - print_n_times(output, ' ', *it_pos); - print_n_times(output, '|', it->first->length()); - output << std::endl; - print_n_times(output, ' ', *it_pos); - output << *(it->first) << std::endl; - output << std::endl; - - print_bad(bed, name, *it_pos, *it_pos + it->first->size()); - } - } -} -//end of namespace -} diff -Nru spades-3.10.1+dfsg/src/projects/cclean/output.hpp spades-3.11.1+dfsg/src/projects/cclean/output.hpp --- spades-3.10.1+dfsg/src/projects/cclean/output.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/output.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,49 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
-//*************************************************************************** - -#ifndef OUTPUT_HPP -#define OUTPUT_HPP - -#include -#include -#include -#include -#include -#include "comparator.hpp" -#include "modules/ssw_cpp.h" - -namespace cclean_output { - -void print_n_times(std::ostream& output, char c, int n); - -void print_alignment(std::ostream& output, - const StripedSmithWaterman::Alignment & data, - const std::string& ref, - const std::string& query, const std::string& name, - const std::string& database_name); - -void print_match(std::ostream& output, std::ostream& bed, std::map, Compare>& res, const std::string& name, - const std::string& seq, const std::string &db_name); - -void print_bad(std::ostream& output, const std::string & name, - int start, int stop); - -inline void print_read(std::ostream& output, const Read &read) { - std::ofstream &stream = - reinterpret_cast(output); - read.print(stream, Read::PHRED_OFFSET); -} - -inline void print_bad(std::ostream& output, const std::string & name, - int start, int stop) { - output << name << "\t" << start << "\t" << stop << std::endl; -} - -// end of namespace -} -#endif /* OUTPUT_H_ */ diff -Nru spades-3.10.1+dfsg/src/projects/cclean/running_modes.cpp spades-3.11.1+dfsg/src/projects/cclean/running_modes.cpp --- spades-3.10.1+dfsg/src/projects/cclean/running_modes.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/running_modes.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,268 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#include "running_modes.hpp" - -#include -#include -#include - -#include "adapter_index.hpp" -#include "output.hpp" -#include "io/read_processor.hpp" -#include "pipeline/library.hpp" -#include "utils/logger/log_writers.hpp" -#include "job_wrappers.hpp" -#include "brute_force_clean.hpp" - -AbstractCclean *Cleaner::getCleaner(std::ofstream *outf_alig_debug, - std::ofstream *outf_bad_deb, - const std::string &db, WorkModeType mode, - unsigned mlen, - const cclean::AdapterIndex &index, - bool deb_info) { - AbstractCclean *cleaner; // Creating cleaner for reads - if (mode == SINGLE_END || mode == SINGLE_END_Q) - cleaner = new SimpleClean(*outf_alig_debug, *outf_bad_deb, db, - mode, mlen, index, deb_info); - if (mode == BRUTE_SIMPLE || mode == BRUTE_WITH_Q) - cleaner = new BruteForceClean(*outf_alig_debug, *outf_bad_deb, db, - mode, mlen, index.GetSeqs(), deb_info); - return cleaner; -} - -void Cleaner::ProcessDataset() { - // Options proceed - const std::string db = cfg::get().database; - const WorkModeType mode = getMode(); - - cclean::AdapterIndex index; - cclean::AdapterIndexBuilder().FillAdapterIndex(db, index); - - const io::DataSet<> &dataset = cfg::get().dataset; - io::DataSet<> outdataset; - // Proccessing dataset. 
Iterating through libraries - for (auto it = dataset.library_begin(), et = dataset.library_end(); it != et; ++it) { - const io::SequencingLibrary<> &lib = *it; - io::SequencingLibrary<> outlib = lib; - outlib.clear(); - // Iterating through paired reads in current library lib - for (auto I = lib.paired_begin(), E = lib.paired_end(); I != E; ++I) { - INFO("Correcting pair reads from " << I->first << " and " << I->second); - - const std::string &file_name_l = I->first; - const std::string &file_name_r = I->second; - const std::string outcorl = getReadsFilename(cfg::get().output_working_dir, - file_name_l, "correct_l"); - const std::string outcorr = getReadsFilename(cfg::get().output_working_dir, - file_name_r, "correct_r"); - const std::string unpaired = getPureFilename(file_name_l) + "_" + - getPureFilename(file_name_r); - const std::string outcoru = getReadsFilename(cfg::get().output_working_dir, - unpaired, "correct_u"); - const std::string outbadl = getReadsFilename(cfg::get().output_working_dir, - file_name_l, "bad"); - const std::string outbadr = getReadsFilename(cfg::get().output_working_dir, - file_name_r, "bad"); - - std::ofstream ofcorl(outcorl.c_str()); - std::ofstream ofbadl(outbadl.c_str()); - std::ofstream ofcorr(outcorr.c_str()); - std::ofstream ofbadr(outbadr.c_str()); - std::ofstream ofunp (outcoru.c_str()); - - CorrectPairedReadFiles(index, file_name_l, file_name_r, &ofbadl, &ofcorl, - &ofbadr, &ofcorr, &ofunp, mode); - outlib.push_back_paired(outcorl, outcorr); - outlib.push_back_single(outcoru); - } - - for (auto I = lib.single_begin(), E = lib.single_end(); I != E; ++I) { - INFO("Correcting single reads from " << *I); - - const std::string reads_file_name = *I; - const std::string outcor = getReadsFilename(cfg::get().output_working_dir, - reads_file_name, "correct"); - const std::string outbad = getReadsFilename(cfg::get().output_working_dir, - reads_file_name, "bad"); - - std::ofstream ofgood(outcor.c_str()); - std::ofstream ofbad(outbad.c_str()); - - CorrectReadFile(index, reads_file_name, &ofgood, &ofbad, mode); - outlib.push_back_single(outcor); - } - outdataset.push_back(outlib); - } - - cfg::get_writable().dataset = outdataset; -} - -void Cleaner::CorrectReadFile(const cclean::AdapterIndex &index, - const std::string &fname, std::ofstream *outf_good, - std::ofstream *outf_bad, WorkModeType mode) { - const unsigned nthreads = cfg::get().nthreads; - const std::string db = cfg::get().database; - const unsigned mlen = cfg::get().minimum_lenght; - const size_t read_buffer_size = nthreads * cfg::get().buffer_size; - std::vector reads(read_buffer_size); - std::vector res(read_buffer_size, false); - - const bool deb_info = cfg::get().debug_information; - std::string bad_out_debug = ""; - std::string aligned_out_debug = ""; - if (deb_info) { - // Else ofstreams will be not used, so there is no sense to create empty files - // So ofstreams will be created with empty strings - bad_out_debug = getReadsFilename(cfg::get().output_working_dir, - fname, "debug.bad"); - aligned_out_debug = getReadsFilename(cfg::get().output_working_dir, - fname, "debug.alig"); - } - std::ofstream ofbad_deb(bad_out_debug.c_str()); - std::ofstream ofalig_deb(aligned_out_debug.c_str()); - - unsigned buffer_no = 0; - unsigned count_bad = 0; - unsigned count_total = 0; - - ireadstream irs(fname); - VERIFY(irs.is_open()); - - AbstractCclean *cleaner = getCleaner(&ofalig_deb, &ofbad_deb, db, mode, mlen, - index, deb_info); - - while (!irs.eof()) { - unsigned buf_size = 0; - for (; buf_size < 
read_buffer_size && !irs.eof(); ++buf_size) { - irs >> reads[buf_size]; - } - if(deb_info) INFO("Prepared batch " << buffer_no << " of " - << buf_size << " reads."); - count_bad += CorrectReadsBatch(cleaner, &res, &reads, buf_size, nthreads); - count_total += buf_size; - if (deb_info) INFO("Processed batch " << buffer_no); - for (size_t i = 0; i < buf_size; ++i) { // Here output reads in files - reads[i].print(*(res[i] ? outf_good : outf_bad), Read::PHRED_OFFSET); - } - if(deb_info) INFO("Written batch " << buffer_no); - ++buffer_no; - } - - delete cleaner; - // Process info about results - const double percent_val = static_cast(count_total) / 100.0; - std::ostringstream percent_bad; - percent_bad << std::fixed << std::setprecision(2) << - (static_cast(count_bad) / percent_val); - INFO("Total proceed " + std::to_string(count_total) + ", " + - std::to_string(count_bad) + " reads (" + percent_bad.str() + - " percents of total) is bad."); -} - -void Cleaner::CorrectPairedReadFiles(const cclean::AdapterIndex &index, - const std::string &fnamel, - const std::string &fnamer, std::ofstream *ofbadl, - std::ofstream *ofcorl, std::ofstream *ofbadr, - std::ofstream *ofcorr, std::ofstream *ofunp, - WorkModeType mode) { - const unsigned nthreads = cfg::get().nthreads; - const std::string db = cfg::get().database; - const unsigned mlen = cfg::get().minimum_lenght; - const size_t read_buffer_size = nthreads * cfg::get().buffer_size; - - std::vector left_reads(read_buffer_size); - std::vector right_reads(read_buffer_size); - std::vector left_res(read_buffer_size, false); - std::vector right_res(read_buffer_size, false); - - ireadstream irsl(fnamel); - ireadstream irsr(fnamer); - VERIFY(irsl.is_open()); - VERIFY(irsr.is_open()); - - const bool deb_info = cfg::get().debug_information; - std::string bad_out_deb_l = ""; - std::string aligned_out_deb_l = ""; - std::string bad_out_deb_r = ""; - std::string aligned_out_deb_r = ""; - if (deb_info) { - // Else ofstreams will be not used, so there is no sense to create empty files - // So ofstreams will be created with empty strings - bad_out_deb_l = getReadsFilename(cfg::get().output_working_dir, - fnamel, "debug.bad"); - aligned_out_deb_l = getReadsFilename(cfg::get().output_working_dir, - fnamel, "debug.alig"); - bad_out_deb_r = getReadsFilename(cfg::get().output_working_dir, - fnamer, "debug.bad"); - aligned_out_deb_r = getReadsFilename(cfg::get().output_working_dir, - fnamer, "debug.alig"); - } - std::ofstream ofbad_deb_l(bad_out_deb_l.c_str()); - std::ofstream ofalig_deb_l(aligned_out_deb_l.c_str()); - std::ofstream ofbad_deb_r(bad_out_deb_r.c_str()); - std::ofstream ofalig_deb_r(aligned_out_deb_r.c_str()); - - AbstractCclean *cleaner_l = getCleaner(&ofalig_deb_l, &ofbad_deb_l, db, mode, - mlen, index, deb_info); - AbstractCclean *cleaner_r = getCleaner(&ofalig_deb_r, &ofbad_deb_r, db, mode, - mlen, index, deb_info); - unsigned buffer_no = 0; - unsigned count_bad_l = 0; - unsigned count_bad_r = 0; - unsigned count_total = 0; - - while (!irsl.eof() && !irsr.eof()) { - unsigned buf_size = 0; - for (; buf_size < read_buffer_size && !irsl.eof() && - !irsr.eof(); ++buf_size) { - irsl >> left_reads[buf_size]; - irsr >> right_reads[buf_size]; - } - if(deb_info) INFO("Prepared batch " << buffer_no << " of " << buf_size - << " reads."); - - count_bad_l += CorrectReadsBatch(cleaner_l, &left_res, &left_reads, - buf_size, nthreads); - count_bad_r += CorrectReadsBatch(cleaner_r, &right_res, &right_reads, - buf_size, nthreads); - count_total += buf_size; - - if(deb_info) 
INFO("Processed batch " << buffer_no); - for (size_t i = 0; i < buf_size; ++i) { - if (left_res[i] && right_res[i]) { - left_reads[i].print(*ofcorl, Read::PHRED_OFFSET); - right_reads[i].print(*ofcorr, Read::PHRED_OFFSET); - } - else { - left_reads[i].print(*(left_res[i] ? ofunp : ofbadl), - Read::PHRED_OFFSET); - right_reads[i].print(*(right_res[i] ? ofunp : ofbadr), - Read::PHRED_OFFSET); - } - } - if(deb_info) INFO("Written batch " << buffer_no); - ++buffer_no; - } - - delete cleaner_l; - delete cleaner_r; - - // Process info abouts results - const double percent_val = static_cast(count_total) / 100.0; - std::ostringstream percent_bad_l; - std::ostringstream percent_bad_r; - percent_bad_l << std::fixed << std::setprecision(2) << - (static_cast(count_bad_l) / percent_val); - percent_bad_r << std::fixed << std::setprecision(2) << - (static_cast(count_bad_r) / percent_val); - INFO("Total proceed " + std::to_string(count_total) + ", " + - std::to_string(count_bad_l) + " left reads (" + - percent_bad_l.str() + " percents of total) is bad" + ", " + - std::to_string(count_bad_r) + " right reads (" + - percent_bad_r.str() + " percents of total) is bad."); -} diff -Nru spades-3.10.1+dfsg/src/projects/cclean/running_modes.hpp spades-3.11.1+dfsg/src/projects/cclean/running_modes.hpp --- spades-3.10.1+dfsg/src/projects/cclean/running_modes.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/running_modes.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,93 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#ifndef RUNNING_MODES_HPP -#define RUNNING_MODES_HPP - -#include -#include -#include -#include -#include "additional.cpp" -#include "adapter_index.hpp" - -class Cleaner { - - public: - static void ProcessDataset(); - // Correct reads in a given file - static void CorrectReadFile(const cclean::AdapterIndex &index, - const std::string &fname, - std::ofstream *outf_good, std::ofstream *outf_bad, - WorkModeType mode); - // Correct reads in a given pair of files - static void CorrectPairedReadFiles(const cclean::AdapterIndex &index, - const std::string &fnamel, - const std::string &fnamer, - std::ofstream *ofbadl, - std::ofstream *ofcorl, - std::ofstream *ofbadr, - std::ofstream *ofcorr, - std::ofstream *ofunp, - WorkModeType mode); - // Parallel correction of batch of reads - static inline unsigned CorrectReadsBatch(AbstractCclean *cleaner, - std::vector *results, - std::vector *reads, - size_t buf_size, unsigned nthreads) { - unsigned bad = 0; -# pragma omp parallel for shared(reads, results) num_threads(nthreads) - for (size_t i = 0; i < buf_size; ++i) { - bool ok; - (*reads)[i] = (*cleaner)((*reads)[i], &ok); - (*results)[i] = ok; - if (!ok) ++bad; - } - return bad; - } - // Get pure file name without extension - inline static std::string getPureFilename(const std::string &fname) { - std::string tmp = path::filename(fname); - std::string pure_file_name = ""; - size_t pos = tmp.find(".fastq"); - if (pos == std::string::npos) - pure_file_name = tmp; - else - pure_file_name = tmp.substr(0, pos); - return pure_file_name; - } - // Get filename for reads - inline static std::string getReadsFilename(const std::string &dirprefix, - const std::string &fname, - const std::string 
&suffix) { - const std::string &pure_file_name = getPureFilename(fname); - return (dirprefix + "/" + pure_file_name + "." + suffix + ".fastq"); - } - // Define mode depends on config file data - inline static WorkModeType getMode() { - WorkModeType mode; - if (cfg::get().use_bruteforce) { - if (cfg::get().use_quality) mode = BRUTE_WITH_Q; - else mode = BRUTE_SIMPLE; - } - else { - if (cfg::get().use_quality) mode = SINGLE_END_Q; - else mode = SINGLE_END; - } - return mode; - } - // Create and return cleaner depends on mode - inline static AbstractCclean* getCleaner(std::ofstream *outf_alig_debug, - std::ofstream *outf_bad_deb, - const std::string &db, - WorkModeType mode, unsigned mlen, - const cclean::AdapterIndex &index, - bool deb_info); - -}; - -#endif /* RUNNING_MODES_H_ */ diff -Nru spades-3.10.1+dfsg/src/projects/cclean/utils.cpp spades-3.11.1+dfsg/src/projects/cclean/utils.cpp --- spades-3.10.1+dfsg/src/projects/cclean/utils.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/utils.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,136 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#include -#include -#include -#include -#include - -#include "utils.hpp" -#include -#include // Striped Smith-Waterman aligner -#include -#include "additional.cpp" - -namespace cclean_utils { - -inline std::string ReverseComplement(const std::string& read) { - std::map reverse; - reverse['C'] = 'G'; - reverse['G'] = 'C'; - reverse['T'] = 'A'; - reverse['A'] = 'T'; - reverse['N'] = 'N'; - - std::vector res; - for(int i = 0; i < (int) read.length(); ++i) { - res.push_back(reverse[read[i]]); - } - - std::reverse(res.begin(), res.end()); - return std::string(res.begin(), res.end()); -} - -double GetScoreWithQuality(const StripedSmithWaterman::Alignment &a, - const Quality &qual) -{ // Try to get more realistic align score depend on read quality - // Mathes and mismatches get from cigar alignment string below - double score = 0.0; - int ref_pos = 0, query_pos = 0; - for (std::vector::const_iterator it = a.cigar.begin(); - it != a.cigar.end(); ++it) { - - int num = (*it & 0xFFFFFFF0) >> 4; - int op_code = *it & 0x0000000F; - - switch (op_code) { - case 0: { //match - for (int i = 0; i < num; ++i, ++ref_pos, ++query_pos) - score += MatchScore; - break; - } - case 1: { //insert - for (int i = 0; i < num; ++i, ++query_pos) - score -= (double)qual[query_pos] / MismatchScore; - break; - } - case 2: { //del - for (int i = 0; i < num; ++i, ++ref_pos) - score -= (double)qual[query_pos] / MismatchScore; - break; - } - default: - break; - } - } - return score; -} - -Read CutRead(const Read &r, int start_pos, int end_pos) { - if(start_pos > end_pos) return r; - // Step 1: cutting read sequence - Read read = r; - std::string read_seq = read.getSequenceString(); - std::string cuted_read_seq(std::string(read_seq, 0, start_pos) + - std::string(read_seq, end_pos + 1)); - read.setSequence(cuted_read_seq.c_str()); - - // Step 2: cutting read quality string - std::string qual_string = read.getQuality().str(); - if(qual_string.empty()) return read; - std::string cuted_qual_string(std::string(qual_string, 0, start_pos) + - std::string(qual_string, end_pos + 1)); - 
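// GetScoreWithQuality above (and RestoreFromCigar below) walk the packed CIGAR
// that the striped Smith-Waterman aligner returns: each 32-bit entry keeps the
// run length in the upper 28 bits and the operation code in the lower 4 bits
// (0 = match/mismatch, 1 = insertion, 2 = deletion, the three cases handled
// here). Decoding it in isolation:

#include <cstdint>
#include <cstdio>
#include <vector>

void DumpCigar(const std::vector<uint32_t> &cigar) {
    static const char kOps[] = "MID";  // the three codes handled above
    for (uint32_t entry : cigar) {
        uint32_t num = (entry & 0xFFFFFFF0u) >> 4;  // run length
        uint32_t op  =  entry & 0x0000000Fu;        // operation code
        std::printf("%u%c ", num, op < 3 ? kOps[op] : '?');
    }
    std::printf("\n");
}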
read.setQuality(cuted_qual_string.c_str(), 0); - return read; -} - -void RestoreFromCigar(const std::string& ref, const std::string& query, - std::string& out_ref, std::string& out_query, - const StripedSmithWaterman::Alignment& a) { - - std::vector aligned_ref, aligned_query; - int ref_pos = 0, query_pos = 0; - for (std::vector::const_iterator it = a.cigar.begin(); - it != a.cigar.end(); ++it) { - int num = (*it & 0xFFFFFFF0) >> 4; - int op_code = *it & 0x0000000F; - - switch (op_code) { - case 0: { //match - for (int i = 0; i < num; ++i) { - aligned_ref.push_back(ref[a.ref_begin + ref_pos++]); - aligned_query.push_back(query[a.query_begin + query_pos++]); - } - break; - } - case 1: { //insert - for (int i = 0; i < num; ++i) { - aligned_ref.push_back('-'); - aligned_query.push_back(query[a.query_begin + query_pos++]); - } - break; - } - case 2: { //del - for (int i = 0; i < num; ++i) { - aligned_ref.push_back(ref[a.ref_begin + ref_pos++]); - aligned_query.push_back('-'); - } - break; - } - default: - break; - } - - } - - out_ref = std::string(aligned_ref.begin(), aligned_ref.end()); - out_query = std::string(aligned_query.begin(), aligned_query.end()); -} - - // end of namespace cclean_utils -} diff -Nru spades-3.10.1+dfsg/src/projects/cclean/utils.hpp spades-3.11.1+dfsg/src/projects/cclean/utils.hpp --- spades-3.10.1+dfsg/src/projects/cclean/utils.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/utils.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,58 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
-//*************************************************************************** - -#ifndef UTILS_HPP -#define UTILS_HPP - -#include // Striped Smith-Waterman aligner -#include -#include "additional.cpp" -#include "running_modes.hpp" -#include "adapter_index.hpp" - -namespace cclean_utils { - -std::string ReverseComplement(const std::string& read); - -std::unordered_map ProcessArgs(int argc, char *argv[], - bool *ok, std::string *error); - -double GetScoreWithQuality(const StripedSmithWaterman::Alignment &a, - const Quality &qual); - -inline bool is_alignment_good(const StripedSmithWaterman::Alignment& a, - const std::string& sequence, - const std::string& query, - double aligned_part_fraction) { - // Сheck that query adjoins or even overlaps the sequence edge - return (std::min(a.query_end - a.query_begin + 1, a.ref_end - a.ref_begin + 1) - / (double) query.size() > aligned_part_fraction) /*&& - (a.ref_begin == 0 || a.ref_end == sequence.size() - 1)*/; -} - -// Cut read from start to end position of best aligment with adapter -Read CutRead(const Read &r, int start_pos, int end_pos); -void RestoreFromCigar(const std::string& ref, const std::string& query, - std::string& out_ref, std::string& out_query, - const StripedSmithWaterman::Alignment& a); - -inline double GetMismatches(const std::string &read, const std::string &adapter, - const StripedSmithWaterman::Alignment &a) { - std::string aligned_read; - std::string aligned_adapter; - RestoreFromCigar(read, adapter, aligned_read, aligned_adapter, a); - int size = (int)std::min(aligned_read.length(), aligned_adapter.length()); - int mismatched_score = 0; - for (int i = 0; i < size; ++i) { - if (aligned_read[i] != aligned_adapter[i]) - ++mismatched_score; - } - return static_cast(mismatched_score); -} -// end of namespace -} -#endif /* UTILS_HPP */ diff -Nru spades-3.10.1+dfsg/src/projects/cclean/valid_kmer_generator.hpp spades-3.11.1+dfsg/src/projects/cclean/valid_kmer_generator.hpp --- spades-3.10.1+dfsg/src/projects/cclean/valid_kmer_generator.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/cclean/valid_kmer_generator.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,198 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#ifndef HAMMER_VALIDKMERGENERATOR_HPP_ -#define HAMMER_VALIDKMERGENERATOR_HPP_ - -#include "io/read.hpp" -#include "sequence/seq.hpp" - -#include -#include - -#include -#include - -/** - * This class is designed to iterate through valid k-mers in read. - * @example - * ValidKMerGenerator<2> gen(read, 4); - * while (gen.HasMore()) { - * MyTrickyFunction(gen.kmer()); - * gen.Next(); - * } - * or - * for (ValidKMerGenerator<2> gen(read, 2); gen.HasMore; gen.Next() { - * MyTrickyFunction(gen.kmer(), gen.pos(), gen.correct_probability()); - * } - * @param kK k-mer length. - */ -template -class ValidKMerGenerator { - public: - /** - * @param read Read to generate k-mers from. - * @param bad_quality_threshold This class virtually cuts - * nucleotides with quality lower the threshold from the ends of the - * read. 
- */ - explicit ValidKMerGenerator(const Read &read, - uint32_t bad_quality_threshold = 2) { - Reset(read.getSequenceString().data(), - read.getQualityString().data(), - read.getSequenceString().size(), - bad_quality_threshold); - } - /** - * @param seq sequence to generate k-mers from. - * @param qual quality string - * @param bad_quality_threshold This class virtually cuts - * nucleotides with quality lower the threshold from the ends of the - * read. - */ - explicit ValidKMerGenerator(const char *seq, const char *qual, - size_t len, - uint32_t bad_quality_threshold = 2) { - Reset(seq, qual, len, bad_quality_threshold); - } - - ValidKMerGenerator() - : kmer_(), seq_(0), qual_(0), - pos_(-1), end_(-1), len_(0), - correct_probability_(1), bad_quality_threshold_(2), - has_more_(false), first(true) {} - - void Reset(const char *seq, const char *qual, - size_t len, - uint32_t bad_quality_threshold = 2) { - kmer_ = Seq(); - seq_ = seq; - qual_ = qual; - pos_ = -1; - end_ = -1; - len_ = len; - correct_probability_ = 1.0; - bad_quality_threshold_ = bad_quality_threshold; - has_more_ = true; - first = true; - - TrimBadQuality(); - Next(); - } - - /** - * @result true if Next() succeed while generating new k-mer, false - * otherwise. - */ - bool HasMore() const { - return has_more_; - } - /** - * @result last k-mer generated by Next(). - */ - const Seq& kmer() const { - return kmer_; - } - /** - * @result last k-mer position in initial read. - */ - int pos() const { - return pos_; - } - /** - * @result probability that last generated k-mer is correct. - */ - double correct_probability() const { - return correct_probability_; - } - /** - * This functions reads next k-mer from the read and sets hasmore to - * if succeeded. You can access k-mer read with kmer(). - */ - void Next(); - private: - void TrimBadQuality(); - double Prob(uint8_t qual) { - return 1 - (qual < 3 ? 
0.75 : pow(10.0, -(int)qual / 10.0)); - } - uint32_t GetQual(uint32_t pos) { - if (pos >= len_) { - return 2; - } else { - return qual_[pos]; - } - } - Seq kmer_; - const char* seq_; - const char* qual_; - size_t pos_; - size_t end_; - size_t len_; - double correct_probability_; - uint32_t bad_quality_threshold_; - bool has_more_; - bool first; - - // Disallow copy and assign - ValidKMerGenerator(const ValidKMerGenerator&) = delete; - void operator=(const ValidKMerGenerator&) = delete; -}; - -template -void ValidKMerGenerator::TrimBadQuality() { - pos_ = 0; - if (qual_) - for (; pos_ < len_; ++pos_) { - if (GetQual(pos_) >= bad_quality_threshold_) - break; - } - end_ = len_; - if (qual_) - for (; end_ > pos_; --end_) { - if (GetQual(end_ - 1) >= bad_quality_threshold_) - break; - } -} - -template -void ValidKMerGenerator::Next() { - if (pos_ + kK > end_) { - has_more_ = false; - } else if (first || !is_nucl(seq_[pos_ + kK - 1])) { - // in this case we have to look for new k-mer - correct_probability_ = 1.0; - uint32_t start_hypothesis = pos_; - uint32_t i = pos_; - for (; i < len_; ++i) { - if (i == kK + start_hypothesis) { - break; - } - if (qual_) - correct_probability_ *= Prob(GetQual(i)); - if (!is_nucl(seq_[i])) { - start_hypothesis = i + 1; - correct_probability_ = 1.0; - } - } - if (i == kK + start_hypothesis) { - kmer_ = Seq(seq_ + start_hypothesis, 0, kK, /* raw */ true); - pos_ = start_hypothesis + 1; - } else { - has_more_ = false; - } - } else { - // good case we can just shift our previous answer - kmer_ = kmer_ << seq_[pos_ + kK - 1]; - if (qual_) { - correct_probability_ *= Prob(GetQual(pos_ + kK - 1)); - correct_probability_ /= Prob(GetQual(pos_ - 1)); - } - ++pos_; - } - first = false; -} -#endif // HAMMER_VALIDKMERGENERATOR_HPP__ diff -Nru spades-3.10.1+dfsg/src/projects/corrector/config_struct.cpp spades-3.11.1+dfsg/src/projects/corrector/config_struct.cpp --- spades-3.10.1+dfsg/src/projects/corrector/config_struct.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/corrector/config_struct.cpp 2017-09-28 09:05:12.000000000 +0000 @@ -7,7 +7,7 @@ #include "config_struct.hpp" -#include "utils/openmp_wrapper.h" +#include "utils/parallel/openmp_wrapper.h" #include "llvm/Support/YAMLParser.h" #include "llvm/Support/YAMLTraits.h" @@ -53,6 +53,7 @@ io.mapOptional("max_nthreads", cfg.max_nthreads, 1u); io.mapRequired("strategy", cfg.strat); io.mapOptional("bwa", cfg.bwa, std::string(".")); + io.mapOptional("log_filename", cfg.log_filename, std::string(".")); } }; }} diff -Nru spades-3.10.1+dfsg/src/projects/corrector/config_struct.hpp spades-3.11.1+dfsg/src/projects/corrector/config_struct.hpp --- spades-3.10.1+dfsg/src/projects/corrector/config_struct.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/corrector/config_struct.hpp 2017-09-28 09:05:12.000000000 +0000 @@ -25,6 +25,7 @@ unsigned max_nthreads; Strategy strat; std::string bwa; + std::string log_filename; }; void load(corrector::corrector_config& cfg, const std::string &filename); diff -Nru spades-3.10.1+dfsg/src/projects/corrector/contig_processor.cpp spades-3.11.1+dfsg/src/projects/corrector/contig_processor.cpp --- spades-3.10.1+dfsg/src/projects/corrector/contig_processor.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/corrector/contig_processor.cpp 2017-09-28 09:05:12.000000000 +0000 @@ -13,7 +13,7 @@ #include "io/reads/osequencestream.hpp" #include "io/reads/file_reader.hpp" #include "io/reads/single_read.hpp" -#include 
"utils/path_helper.hpp" +#include "utils/filesystem/path_helper.hpp" #include @@ -34,7 +34,7 @@ contig_name_ = cur_read.name(); contig_ = cur_read.GetSequenceString(); - output_contig_file_ = path::append_path(path::parent_path(contig_file_), path::basename(contig_file_) + ".ref.fasta"); + output_contig_file_ = fs::append_path(fs::parent_path(contig_file_), fs::basename(contig_file_) + ".ref.fasta"); charts_.resize(contig_.length()); } @@ -74,8 +74,13 @@ if (maxj != maxi) { DEBUG("Interesting positions differ with majority!"); DEBUG("On position " << i << " old: " << old << " majority: " << pos_to_var[maxi] << "interesting: " << pos_to_var[maxj]); - if (strat != Strategy::MajorityOnly) - maxi = maxj; + if (strat != Strategy::MajorityOnly) { + if (charts_[i].votes[maxj] > interesting_weight_cutoff) + maxi = maxj; + else + DEBUG(" alternative interesting position with weight " << charts_[i].votes[maxj] << + " fails weight cutoff"); + } } } if (old != pos_to_var[maxi]) { @@ -257,7 +262,19 @@ } sm.close(); } - + size_t total_coverage = 0; + for (const auto &pos: charts_) + total_coverage += pos.TotalMapped(); + size_t average_coverage = total_coverage / contig_.length(); + size_t different_cov = 0; + for (const auto &pos: charts_) + if ((pos.TotalMapped() < average_coverage / 2) || (pos.TotalMapped() > (average_coverage * 3) / 2)) + different_cov++; + if (different_cov < contig_.length() * 3/ 10) { + interesting_weight_cutoff = int (average_coverage / 2); + DEBUG ("coverage is relatively uniform, average coverage is " << average_coverage + << " setting interesting positions heuristics to " << interesting_weight_cutoff); + } ipp_.FillInterestingPositions(charts_); for (const auto &sf : sam_files_) { MappedSamStream sm(sf.first); @@ -285,7 +302,7 @@ } vector contig_name_splitted; boost::split(contig_name_splitted, contig_name_, boost::is_any_of("_")); - io::osequencestream_simple oss(output_contig_file_); + io::OutputSequenceStream oss(output_contig_file_); for(size_t i = 0; i < contig_name_splitted.size(); i++) { if (contig_name_splitted[i] == "length" && i + 1 < contig_name_splitted.size()) { contig_name_splitted[i + 1] = std::to_string(int(s_new_contig.str().length())); @@ -296,8 +313,7 @@ for(size_t i = 1; i < contig_name_splitted.size(); i++) { new_header += "_" + contig_name_splitted[i]; } - oss.set_header(new_header); - oss << s_new_contig.str(); + oss << io::SingleRead(new_header, s_new_contig.str()); return total_changes; } diff -Nru spades-3.10.1+dfsg/src/projects/corrector/contig_processor.hpp spades-3.11.1+dfsg/src/projects/corrector/contig_processor.hpp --- spades-3.10.1+dfsg/src/projects/corrector/contig_processor.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/corrector/contig_processor.hpp 2017-09-28 09:05:12.000000000 +0000 @@ -15,7 +15,7 @@ #pragma once #include "interesting_pos_processor.hpp" #include "positional_read.hpp" -#include "utils/openmp_wrapper.h" +#include "utils/parallel/openmp_wrapper.h" #include #include @@ -41,12 +41,16 @@ std::vector error_counts_; const size_t kMaxErrorNum = 20; - + int interesting_weight_cutoff; +protected: + DECL_LOGGER("ContigProcessor") public: ContigProcessor(const sam_files_type &sam_files, const std::string &contig_file) : sam_files_(sam_files), contig_file_(contig_file) { ReadContig(); ipp_.set_contig(contig_); +//At least three reads to believe in inexact repeats heuristics. 
diff -Nru spades-3.10.1+dfsg/src/projects/corrector/contig_processor.hpp spades-3.11.1+dfsg/src/projects/corrector/contig_processor.hpp
--- spades-3.10.1+dfsg/src/projects/corrector/contig_processor.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/corrector/contig_processor.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -15,7 +15,7 @@
 #pragma once
 #include "interesting_pos_processor.hpp"
 #include "positional_read.hpp"
-#include "utils/openmp_wrapper.h"
+#include "utils/parallel/openmp_wrapper.h"
 
 #include
 #include
@@ -41,12 +41,16 @@
     std::vector error_counts_;
     const size_t kMaxErrorNum = 20;
-
+    int interesting_weight_cutoff;
+protected:
+    DECL_LOGGER("ContigProcessor")
 public:
     ContigProcessor(const sam_files_type &sam_files, const std::string &contig_file)
             : sam_files_(sam_files), contig_file_(contig_file) {
         ReadContig();
         ipp_.set_contig(contig_);
+        // At least three reads are needed to trust the inexact-repeat heuristic.
+        interesting_weight_cutoff = 2;
     }
     size_t ProcessMultipleSamFiles();
 private:
diff -Nru spades-3.10.1+dfsg/src/projects/corrector/dataset_processor.cpp spades-3.11.1+dfsg/src/projects/corrector/dataset_processor.cpp
--- spades-3.10.1+dfsg/src/projects/corrector/dataset_processor.cpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/corrector/dataset_processor.cpp	2017-09-28 09:05:12.000000000 +0000
@@ -11,9 +11,9 @@
 #include "config_struct.hpp"
 
 #include "io/reads/file_reader.hpp"
-#include "utils/path_helper.hpp"
+#include "utils/filesystem/path_helper.hpp"
 #include "io/reads/osequencestream.hpp"
-#include "utils/openmp_wrapper.h"
+#include "utils/parallel/openmp_wrapper.h"
 
 #include
@@ -26,7 +26,7 @@
 std::string DatasetProcessor::GetLibDir(const size_t lib_count) {
     if (lib_dirs_.find(lib_count) != lib_dirs_.end())
         return lib_dirs_[lib_count];
-    std::string res = path::make_temp_dir(corr_cfg::get().work_dir, "lib" + to_string(lib_count));
+    std::string res = fs::make_temp_dir(corr_cfg::get().work_dir, "lib" + to_string(lib_count));
     lib_dirs_[lib_count] = res;
     return res;
 }
@@ -42,13 +42,13 @@
         if (all_contigs_.find(contig_name) != all_contigs_.end()) {
             WARN("Duplicated contig names! Multiple contigs with name" << contig_name);
         }
-        string full_path = path::append_path(genome_splitted_dir, contig_name + ".fasta");
-        string out_full_path = path::append_path(genome_splitted_dir, contig_name + ".ref.fasta");
-        string sam_filename = path::append_path(genome_splitted_dir, contig_name + ".pair.sam");
+        string full_path = fs::append_path(genome_splitted_dir, contig_name + ".fasta");
+        string out_full_path = fs::append_path(genome_splitted_dir, contig_name + ".ref.fasta");
+        string sam_filename = fs::append_path(genome_splitted_dir, contig_name + ".pair.sam");
         all_contigs_[contig_name] = {full_path, out_full_path, contig_seq.length(), sam_files_type(), sam_filename, cur_id};
         cur_id ++;
         buffered_reads_[contig_name].clear();
-        io::osequencestream oss(full_path);
+        io::OutputSequenceStream oss(full_path);
         oss << io::SingleRead(contig_name, contig_seq);
         DEBUG("full_path " + full_path)
     }
@@ -133,9 +133,9 @@
 string DatasetProcessor::RunPairedBwa(const string &left, const string &right, const size_t lib) {
     string cur_dir = GetLibDir(lib);
     int run_res = 0;
-    string tmp_sam_filename = path::append_path(cur_dir, "tmp.sam");
-    string bwa_string = path::screen_whitespaces(path::screen_whitespaces(corr_cfg::get().bwa));
-    string genome_screened = path::screen_whitespaces(genome_file_);
+    string tmp_sam_filename = fs::append_path(cur_dir, "tmp.sam");
+    string bwa_string = fs::screen_whitespaces(fs::screen_whitespaces(corr_cfg::get().bwa));
+    string genome_screened = fs::screen_whitespaces(genome_file_);
     string index_line = bwa_string + string(" index ") + "-a " + "is " + genome_screened ;
     INFO("Running bwa index ...: " << index_line);
     run_res = system(index_line.c_str());
@@ -144,8 +144,8 @@
         return "";
     }
     string nthreads_str = to_string(nthreads_);
-    string last_line = bwa_string + string(" mem ") + " -v 1 -t " + nthreads_str + " "+ genome_screened + " " + path::screen_whitespaces(left) + " " + path::screen_whitespaces(right) + " > "
-                       + path::screen_whitespaces(tmp_sam_filename) ;
+    string last_line = bwa_string + string(" mem ") + " -v 1 -t " + nthreads_str + " "+ genome_screened + " " + fs::screen_whitespaces(left) + " " + fs::screen_whitespaces(right) + " > "
+                       + fs::screen_whitespaces(tmp_sam_filename) ;
     INFO("Running bwa mem ...:" << last_line);
     run_res = system(last_line.c_str());
     if (run_res != 0) {
@@ -158,9 +158,9 @@
 string DatasetProcessor::RunSingleBwa(const string &single, const size_t lib) {
     int run_res = 0;
     string cur_dir = GetLibDir(lib);
-    string tmp_sam_filename = path::append_path(cur_dir, "tmp.sam");
-    string bwa_string = path::screen_whitespaces(path::screen_whitespaces(corr_cfg::get().bwa));
-    string genome_screened = path::screen_whitespaces(genome_file_);
+    string tmp_sam_filename = fs::append_path(cur_dir, "tmp.sam");
+    string bwa_string = fs::screen_whitespaces(fs::screen_whitespaces(corr_cfg::get().bwa));
+    string genome_screened = fs::screen_whitespaces(genome_file_);
     string index_line = bwa_string + string(" index ") + "-a " + "is " + genome_screened ;
     INFO("Running bwa index ...: " << index_line);
     run_res = system(index_line.c_str());
@@ -169,7 +169,7 @@
         return "";
     }
     string nthreads_str = to_string(nthreads_);
-    string last_line = bwa_string + " mem "+ " -v 1 -t " + nthreads_str + " " + genome_screened + " " + path::screen_whitespaces(single) + " > " + path::screen_whitespaces(tmp_sam_filename);
+    string last_line = bwa_string + " mem "+ " -v 1 -t " + nthreads_str + " " + genome_screened + " " + fs::screen_whitespaces(single) + " > " + fs::screen_whitespaces(tmp_sam_filename);
     INFO("Running bwa mem ...:" << last_line);
     run_res = system(last_line.c_str());
     if (run_res != 0) {
@@ -183,7 +183,7 @@
     string out_dir = GetLibDir(lib_count);
     for (auto &ac : all_contigs_) {
         auto contig_name = ac.first;
-        string out_name = path::append_path(out_dir, contig_name + ".sam");
+        string out_name = fs::append_path(out_dir, contig_name + ".sam");
         ac.second.sam_filenames.push_back(make_pair(out_name, unsplitted_sam_files_[lib_count].second));
         BufferedOutputRead("@SQ\tSN:" + contig_name + "\tLN:" + to_string(all_contigs_[contig_name].contig_length), contig_name, lib_count);
     }
diff -Nru spades-3.10.1+dfsg/src/projects/corrector/dataset_processor.hpp spades-3.11.1+dfsg/src/projects/corrector/dataset_processor.hpp
--- spades-3.10.1+dfsg/src/projects/corrector/dataset_processor.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/corrector/dataset_processor.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -7,10 +7,9 @@
 
 #pragma once
 
-#include "utils/path_helper.hpp"
+#include "utils/filesystem/path_helper.hpp"
 
 #include "io/reads/file_reader.hpp"
-#include "utils/path_helper.hpp"
 
 #include "pipeline/library.hpp"
@@ -46,10 +45,14 @@
     std::unordered_map lib_dirs_;
     const size_t kBuffSize = 100000;
     const size_t kMinContigLengthForInfo = 20000;
+
+protected:
+    DECL_LOGGER("DatasetProcessor")
+
 public:
     DatasetProcessor(const std::string &genome_file, const std::string &work_dir, const std::string &output_dir, const size_t &thread_num)
             : genome_file_(genome_file), work_dir_(work_dir), nthreads_(thread_num) {
-        output_contig_file_ = path::append_path(output_dir, "corrected_contigs.fasta");
+        output_contig_file_ = fs::append_path(output_dir, "corrected_contigs.fasta");
         buffered_count_ = 0;
     }
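[Editor's note: RunPairedBwa/RunSingleBwa above screen whitespaces in every path before handing the command to system() — the 3.11.1 fix for "Handling spaces in path during mismatch correction". A minimal sketch of the idea; screen_whitespaces here is a simplified stand-in for the fs:: helper, not its actual implementation.]

#include <cstdlib>
#include <string>

// Escape spaces so the shell treats the whole path as one token.
static std::string screen_whitespaces(const std::string &path) {
  std::string res;
  for (char c : path) {
    if (c == ' ') res += '\\';
    res += c;
  }
  return res;
}

int main() {
  std::string genome = "/data/my assembly/contigs.fasta";  // hypothetical path
  std::string cmd = "bwa index -a is " + screen_whitespaces(genome);
  // Without screening, the shell would see two arguments:
  // "/data/my" and "assembly/contigs.fasta".
  return std::system(cmd.c_str());
}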
diff -Nru spades-3.10.1+dfsg/src/projects/corrector/interesting_pos_processor.hpp spades-3.11.1+dfsg/src/projects/corrector/interesting_pos_processor.hpp
--- spades-3.10.1+dfsg/src/projects/corrector/interesting_pos_processor.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/corrector/interesting_pos_processor.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -11,6 +11,8 @@
 #include
 #include
 #include
+#include "utils/logger/log_writers.hpp"
+
 
 namespace corrector {
 typedef std::vector WeightedReadStorage;
@@ -29,6 +31,9 @@
     static const size_t kMaxErrorCount = 6;
     const int error_weight[kMaxErrorCount] = { 100, 10, 8, 5, 2, 1 };
 
+private:
+    DECL_LOGGER("InterestingPositionProcessor")
+
 public:
     InterestingPositionProcessor() {}
     void set_contig(const std::string &ctg);
diff -Nru spades-3.10.1+dfsg/src/projects/corrector/main.cpp spades-3.11.1+dfsg/src/projects/corrector/main.cpp
--- spades-3.10.1+dfsg/src/projects/corrector/main.cpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/corrector/main.cpp	2017-09-28 09:05:12.000000000 +0000
@@ -19,22 +19,27 @@
 #include
 
 using namespace std;
-void create_console_logger() {
+
+void create_console_logger(const string& dir) {
     using namespace logging;
 
-    logger *lg = create_logger("");
+    string log_props_file = corr_cfg::get().log_filename;
+
+    if (!fs::FileExists(log_props_file))
+        log_props_file = fs::append_path(dir, corr_cfg::get().log_filename);
+    cout << log_props_file;
+    logger *lg = create_logger(fs::FileExists(log_props_file) ? log_props_file : "");
     lg->add_writer(std::make_shared());
+    //lg->add_writer(std::make_shared(std::make_shared()));
     attach_logger(lg);
 }
 
 int main(int argc, char** argv) {
-    perf_counter pc;
+    utils::perf_counter pc;
 
     srand(42);
     srandom(42);
     try {
-        create_console_logger();
-
         if (argc != 3) {
             WARN("Wrong argument number");
             return 1;
@@ -42,11 +47,13 @@
         string contig_name(argv[2]);
         string cfg_file(argv[1]);
         corr_cfg::create_instance(cfg_file);
+        string cfg_dir = fs::parent_path(cfg_file);
+        create_console_logger(cfg_dir);
         string work_dir = corr_cfg::get().work_dir;
-        if (!path::check_existence(corr_cfg::get().output_dir))
-            path::make_dir(corr_cfg::get().output_dir);
-        if (!path::check_existence(corr_cfg::get().work_dir))
-            path::make_dir(corr_cfg::get().work_dir);
+        if (!fs::check_existence(corr_cfg::get().output_dir))
+            fs::make_dir(corr_cfg::get().output_dir);
+        if (!fs::check_existence(corr_cfg::get().work_dir))
+            fs::make_dir(corr_cfg::get().work_dir);
 
         INFO("Starting MismatchCorrector, built from " SPADES_GIT_REFSPEC ", git revision " SPADES_GIT_SHA1);
diff -Nru spades-3.10.1+dfsg/src/projects/corrector/positional_read.hpp spades-3.11.1+dfsg/src/projects/corrector/positional_read.hpp
--- spades-3.10.1+dfsg/src/projects/corrector/positional_read.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/corrector/positional_read.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -42,7 +42,12 @@
         return maxi;
     }
-
+    size_t TotalMapped() const {
+        size_t res = 0;
+        for (size_t i = 0; i < MAX_VARIANTS; i++)
+            res += votes[i];
+        return res;
+    }
     std::string str() const;
     void clear() ;
 };
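[Editor's note: the reworked create_console_logger() above resolves the log-properties file in two steps — as given, then relative to the config directory — and falls back to built-in defaults when neither exists. A compact, hedged sketch of that lookup order; FileExists and append_path here are simplified stand-ins for the fs:: helpers.]

#include <string>
#include <sys/stat.h>

static bool FileExists(const std::string &p) {
  struct stat st;
  return stat(p.c_str(), &st) == 0;
}

static std::string append_path(const std::string &dir, const std::string &name) {
  return dir + "/" + name;
}

// Returns the properties file to load, or "" for built-in default settings.
std::string ResolveLogProperties(const std::string &cfg_dir,
                                 const std::string &log_filename) {
  std::string path = log_filename;
  if (!FileExists(path))
    path = append_path(cfg_dir, log_filename);  // try next to the config file
  return FileExists(path) ? path : std::string("");
}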
diff -Nru spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/consensus_contigs_constructor.hpp spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/consensus_contigs_constructor.hpp
--- spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/consensus_contigs_constructor.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/consensus_contigs_constructor.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -82,7 +82,7 @@
             }
         }
         if(zero_paths != 0)
-            INFO(ToString(zero_paths) + " contigs with total length " << total_length_unmapped <<
+            INFO(std::to_string(zero_paths) + " contigs with total length " << total_length_unmapped <<
                  " have mapped path with zero length");
         return map_paths;
     }
@@ -213,13 +213,13 @@
             double_contigs->Add(contig);
         }
         WriteContigsToFile(double_contigs,
-                path::append_path(dsp_cfg::get().io.output_dir, "paired_consensus_contigs.fasta").c_str());
+                fs::append_path(dsp_cfg::get().io.output_dir, "paired_consensus_contigs.fasta").c_str());
         WriteContigsToFile(single_contigs,
-                path::append_path(dsp_cfg::get().io.output_dir, "unpaired_consensus_contigs.fasta").c_str());
+                fs::append_path(dsp_cfg::get().io.output_dir, "unpaired_consensus_contigs.fasta").c_str());
     }
 
     void WriteAlignedHaplocontigs(){
-        string fname = path::append_path(dsp_cfg::get().io.output_dir, "haplocontigs_alignment");
+        string fname = fs::append_path(dsp_cfg::get().io.output_dir, "haplocontigs_alignment");
         ofstream out(fname.c_str());
         INFO("Writing haplocontigs alignment to " << fname);
@@ -310,7 +310,7 @@
 
         composite_storage_ = DefineOverlappingContigs(processed_storage);
 
-        string consensus_fname(path::append_path(dsp_cfg::get().io.output_dir, "consensus_contigs.fasta").c_str());
+        string consensus_fname(fs::append_path(dsp_cfg::get().io.output_dir, "consensus_contigs.fasta").c_str());
         WriteContigsToFile(composite_storage_, consensus_fname);
         WritePairedAndUnpairedContigs(composite_storage_);
diff -Nru spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/close_gaps_corrector.hpp spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/close_gaps_corrector.hpp
--- spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/close_gaps_corrector.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/close_gaps_corrector.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -75,7 +75,7 @@
             if(cur_process_perc > processed_perc) {
                 while(processed_perc + step <= cur_process_perc)
                     processed_perc += step;
-                INFO(ToString(processed_perc * 100.0) << "% contigs were processed");
+                INFO(std::to_string(processed_perc * 100.0) << "% contigs were processed");
                 processed_perc += step;
             }
         }
@@ -91,16 +91,16 @@
 
     virtual ContigStoragePtr Correct(ContigStoragePtr storage){
 
-        INFO(ToString(CountContigsWithGaps(storage)) << " contigs from " <<
-             ToString(storage->Size()) << " have gaps before correction");
+        INFO(std::to_string(CountContigsWithGaps(storage)) << " contigs from " <<
+             std::to_string(storage->Size()) << " have gaps before correction");
 
         ProcessContigs(storage);
 
-        INFO(ToString(num_corr) << " contigs from " <<
-             ToString(storage->Size()) << " with total length " << ToString(connected_length_) + " are correct");
-        INFO(ToString(storage->Size() - num_corr) << " contigs from "
-             << ToString(storage->Size()) << " with total length " <<
-             ToString(disconnected_length_) + " have gaps after correction");
+        INFO(std::to_string(num_corr) << " contigs from " <<
+             std::to_string(storage->Size()) << " with total length " << std::to_string(connected_length_) + " are correct");
+        INFO(std::to_string(storage->Size() - num_corr) << " contigs from "
+             << std::to_string(storage->Size()) << " with total length " <<
+             std::to_string(disconnected_length_) + " have gaps after correction");
 
         storage->DeleteByIDs(incorr_contigs);
         return storage;
diff -Nru spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/equal_path_deletion_correction.hpp spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/equal_path_deletion_correction.hpp
--- spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/equal_path_deletion_correction.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/equal_path_deletion_correction.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -62,7 +62,7 @@
         }
         RedundancyMapCondenser condenser;
         res_.redundancy_map = condenser.Condense(res_.redundancy_map);
-        INFO(ToString(ids_for_deletion.size()) + " contigs from " << contigs->Size() << " are redundant");
+        INFO(std::to_string(ids_for_deletion.size()) + " contigs from " << contigs->Size() << " are redundant");
         contigs->DeleteByIDs(ids_for_deletion);
 
         INFO("Computing redundant equal contigs ends");
diff -Nru spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/incorrect_contig_remover.hpp spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/incorrect_contig_remover.hpp
--- spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/incorrect_contig_remover.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/incorrect_contig_remover.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -28,7 +28,7 @@
                 contigs_for_deletion.insert((*storage)[i]->id());
             }
         }
-        INFO(ToString(contigs_for_deletion.size()) + " contigs from " <<
+        INFO(std::to_string(contigs_for_deletion.size()) + " contigs from " <<
             storage->Size() << " were deleted");
         storage->DeleteByIDs(contigs_for_deletion);
         return storage;
diff -Nru spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/iterative_redundant_contigs_remover.hpp spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/iterative_redundant_contigs_remover.hpp
--- spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/iterative_redundant_contigs_remover.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/iterative_redundant_contigs_remover.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -40,14 +40,14 @@
             contigs = equal_path_remover.Correct(contigs);
             res.redundancy_map = equal_path_remover.Result().redundancy_map;
             index_.Clear();
-            INFO(ToString(contigs->Size()) + " contigs will be used further");
+            INFO(std::to_string(contigs->Size()) + " contigs will be used further");
         }
         INFO("Iterative loop corrector starts");
         {
             INFO("Only exact match iteration with parameters:");
-            INFO("\tMaximal loop length - " + ToString(max_loop_len_));
-            INFO("\tMinimal lcs length - " + ToString(min_lcs_length_));
+            INFO("\tMaximal loop length - " + std::to_string(max_loop_len_));
+            INFO("\tMinimal lcs length - " + std::to_string(min_lcs_length_));
             INFO("\tMaximal tail length - 0");
 
             index_.Initialize(contigs);
@@ -59,14 +59,14 @@
             RedundancyMapMerger map_merger;
             res.redundancy_map = map_merger.MergeTwoMaps(old_map, new_map);
             index_.Clear();
-            INFO(ToString(contigs->Size()) + " contigs will be used further");
+            INFO(std::to_string(contigs->Size()) + " contigs will be used further");
         }
         {
             INFO("Tails allowing match iteration with parameters:");
-            INFO("\tMaximal loop length - " + ToString(max_loop_len_));
-            INFO("\tMinimal lcs length - " + ToString(min_lcs_length_));
-            INFO("\tMaximal tail length - " + ToString(max_tail_length_));
+            INFO("\tMaximal loop length - " + std::to_string(max_loop_len_));
+            INFO("\tMinimal lcs length - " + std::to_string(min_lcs_length_));
+            INFO("\tMaximal tail length - " + std::to_string(max_tail_length_));
 
             index_.Initialize(contigs);
             LoopBulgeDeletionCorrector loop_corr(g_, k_value_, max_loop_len_, max_tail_length_, min_lcs_length_, index_);
@@ -76,7 +76,7 @@
             RedundancyMapMerger map_merger;
             res.redundancy_map = map_merger.MergeTwoMaps(old_map, new_map);
             index_.Clear();
-            INFO(ToString(contigs->Size()) + " contigs will be used further");
+            INFO(std::to_string(contigs->Size()) + " contigs will be used further");
         }
         INFO("Iterative loop corrector ends");
         return contigs;
diff -Nru spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/overlap_searcher.hpp spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/overlap_searcher.hpp
--- spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/overlap_searcher.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/overlap_searcher.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -473,7 +473,7 @@
             }
         }
 
-        TRACE("Overlapped contigs map. Size - " << ToString(overlap_map.Size()) << endl <<
+        TRACE("Overlapped contigs map. Size - " << std::to_string(overlap_map.Size()) << endl <<
               overlap_map);
 
         FillOverlapGraphByMap(overlap_map, og);
@@ -481,16 +481,16 @@
         string fname = dsp_cfg::get().io.output_dir + "default_overlap_graph.dot";
         OverlapgraphToDot(fname, og, contigs);
 
-        INFO("Overlap graph with " + ToString(og.Vertices().size()) + " vertices and " +
-             ToString(og.Edges().size()) + " edges constructed");
+        INFO("Overlap graph with " + std::to_string(og.Vertices().size()) + " vertices and " +
+             std::to_string(og.Edges().size()) + " edges constructed");
 
         auto og_vertices = og.Vertices();
         auto edges = og.Edges();
 
         SimplifyOverlapGraph(og, 10, 5);
 
-        INFO("Simplified overlap graph contains " + ToString(og.Vertices().size()) + " vertices and " +
-             ToString(og.Edges().size()) + " edges");
+        INFO("Simplified overlap graph contains " + std::to_string(og.Vertices().size()) + " vertices and " +
+             std::to_string(og.Edges().size()) + " edges");
 
         fname = dsp_cfg::get().io.output_dir + "simplified_overlap_graph.dot";
         OverlapgraphToDot(fname, og, contigs);
diff -Nru spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/redundant_contig_remover.hpp spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/redundant_contig_remover.hpp
--- spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/redundant_contig_remover.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/redundant_contig_remover.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -856,7 +856,7 @@
             if(cur_process_perc > processed_perc) {
                 while(processed_perc + processed_step<= cur_process_perc)
                     processed_perc += processed_step;
-                INFO(ToString(processed_perc * 100.0) << "% contigs were processed");
+                INFO(std::to_string(processed_perc * 100.0) << "% contigs were processed");
                 processed_perc += processed_step;
             }
         }
@@ -865,7 +865,7 @@
         RedundancyMapCondenser condenser;
         condenser.Condense(res.redundancy_map);
 
-        INFO(ToString(redundant_contigs.size()) + " contigs from " + ToString(contigs->Size()) + " are redundant");
+        INFO(std::to_string(redundant_contigs.size()) + " contigs from " + std::to_string(contigs->Size()) + " are redundant");
 
         contigs->DeleteByIDs(redundant_contigs);
diff -Nru spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/overlap_graph.hpp spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/overlap_graph.hpp
--- spades-3.10.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/overlap_graph.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/dipspades/consensus_contigs_constructor/overlap_graph.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -919,7 +919,7 @@
         map > sh_paths = dijkstra.Paths();
         if(sh_paths.find(v2) == sh_paths.end()){
-//            INFO("Path from " + ToString(v1) + " to " + ToString(v2) + " isn't found");
+//            INFO("Path from " + ToString(v1) + " to " + ContainerToString(v2) + " isn't found");
             return paths;
         }
         else{
@@ -1104,7 +1104,7 @@
         TransitiveReductionCorrector tr_corr;
         tr_res = tr_corr.Correct(overlap_graph);
 
-        INFO(ToString(tc_res) + " tips and " + ToString(tr_res) + " transitive edges were deleted in overlap graph");
+        INFO(std::to_string(tc_res) + " tips and " + std::to_string(tr_res) + " transitive edges were deleted in overlap graph");
     }
 
     INFO("Bulge remover starts");
@@ -1112,7 +1112,7 @@
     size_t num_bulges = 1;
     for(size_t i = 0; (i < br_num_iter && num_bulges > 0); i++){
         num_bulges = br_corr.Correct(overlap_graph);
-        INFO(ToString(num_bulges) + " bulges were deleted in overlap graph");
+        INFO(std::to_string(num_bulges) + " bulges were deleted in overlap graph");
     }
 }
diff -Nru spades-3.10.1+dfsg/src/projects/dipspades/dipspades_config.cpp spades-3.11.1+dfsg/src/projects/dipspades/dipspades_config.cpp
--- spades-3.10.1+dfsg/src/projects/dipspades/dipspades_config.cpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/dipspades/dipspades_config.cpp	2017-09-28 09:05:12.000000000 +0000
@@ -8,7 +8,7 @@
 #include "dipspades_config.hpp"
 #include "pipeline/config_common.hpp"
 #include "utils/files_utils.hpp"
-#include "utils/path_helper.hpp"
+#include "utils/filesystem/path_helper.hpp"
 
 using namespace dipspades;
@@ -58,7 +58,7 @@
     io.dataset_name = io.output_dir.substr(0, io.output_dir.length() - 1);
     io.output_dir = io.output_base + io.output_dir + "/";
     io.output_root = io.output_dir;
-    io.output_suffix = path::MakeLaunchTimeDirName() + "/";
+    io.output_suffix = fs::MakeLaunchTimeDirName() + "/";
     io.output_dir = io.output_root + io.output_suffix;
     io.output_saves = io.output_dir + "saves/";
     // io.load_from = io.output_root + io.load_from;
diff -Nru spades-3.10.1+dfsg/src/projects/dipspades/dipspades.hpp spades-3.11.1+dfsg/src/projects/dipspades/dipspades.hpp
--- spades-3.10.1+dfsg/src/projects/dipspades/dipspades.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/dipspades/dipspades.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -84,7 +84,7 @@
     void load(debruijn_graph::conj_graph_pack& gp,
             const std::string &load_from,
             const char* prefix) {
-        std::string p = path::append_path(load_from, prefix == NULL ? id() : prefix);
+        std::string p = fs::append_path(load_from, prefix == NULL ? id() : prefix);
         INFO("Loading current state from " << p);
         debruijn_graph::graphio::ScanAll(p, gp, false);
@@ -93,7 +93,7 @@
     void save(const debruijn_graph::conj_graph_pack& gp,
             const std::string & save_to,
             const char* prefix) const {
-        std::string p = path::append_path(save_to, prefix == NULL ? id() : prefix);
+        std::string p = fs::append_path(save_to, prefix == NULL ? id() : prefix);
         INFO("Saving current state to " << p);
         debruijn_graph::graphio::PrintAll(p, gp);
     }
@@ -116,7 +116,7 @@
     void load(debruijn_graph::conj_graph_pack& gp,
             const std::string &load_from,
             const char* prefix) {
-        std::string p = path::append_path(load_from, prefix == NULL ? id() : prefix);
+        std::string p = fs::append_path(load_from, prefix == NULL ? id() : prefix);
         INFO("Loading current state from " << p);
         debruijn_graph::graphio::ScanAll(p, gp, false);
         INFO("Loading histogram of bulge length");
@@ -127,7 +127,7 @@
     void save(const debruijn_graph::conj_graph_pack& gp,
             const std::string & save_to,
             const char* prefix) const {
-        std::string p = path::append_path(save_to, prefix == NULL ? id() : prefix);
+        std::string p = fs::append_path(save_to, prefix == NULL ? id() : prefix);
         INFO("Saving current state to " << p);
         debruijn_graph::graphio::PrintAll(p, gp);
         storage().bulge_len_histogram.SaveToFile(p + ".hist");
@@ -150,7 +150,7 @@
     void load(debruijn_graph::conj_graph_pack& gp,
             const std::string &load_from,
             const char* prefix) {
-        std::string p = path::append_path(load_from, prefix == NULL ? id() : prefix);
+        std::string p = fs::append_path(load_from, prefix == NULL ? id() : prefix);
         INFO("Loading current state from " << p);
         debruijn_graph::graphio::ScanAll(p, gp, false);
         INFO("Loading histogram of bulge length");
@@ -161,7 +161,7 @@
     void save(const debruijn_graph::conj_graph_pack& gp,
             const std::string & save_to,
             const char* prefix) const {
-        std::string p = path::append_path(save_to, prefix == NULL ? id() : prefix);
+        std::string p = fs::append_path(save_to, prefix == NULL ? id() : prefix);
         INFO("Saving current state to " << p);
         debruijn_graph::graphio::PrintAll(p, gp);
         storage().bulge_len_histogram.SaveToFile(p + ".hist");
@@ -188,7 +188,7 @@
     void load(debruijn_graph::conj_graph_pack& gp,
             const std::string &load_from,
             const char* prefix) {
-        std::string p = path::append_path(load_from, prefix == NULL ? id() : prefix);
+        std::string p = fs::append_path(load_from, prefix == NULL ? id() : prefix);
         INFO("Loading current state from " << p);
         debruijn_graph::graphio::ScanAll(p, gp, false);
     }
@@ -196,7 +196,7 @@
     void save(const debruijn_graph::conj_graph_pack& gp,
             const std::string & save_to,
             const char* prefix) const {
-        std::string p = path::append_path(save_to, prefix == NULL ? id() : prefix);
+        std::string p = fs::append_path(save_to, prefix == NULL ? id() : prefix);
         INFO("Saving current state to " << p);
         debruijn_graph::graphio::PrintAll(p, gp);
         storage().bulge_len_histogram.SaveToFile(p + ".hist");
@@ -217,7 +217,7 @@
             return;
         INFO("Diploid graph construction");
         conj_graph_pack double_graph_pack(graph_pack.k_value, dsp_cfg::get().io.tmp_dir,
-                                          dsp_cfg::get().io.num_libraries, "");
+                                          dsp_cfg::get().io.num_libraries, std::vector());
         construct_graph_from_contigs(double_graph_pack);
         HaplotypeAssembler(graph_pack, double_graph_pack, storage().default_storage,
                 storage().composite_storage, storage().redundancy_map).Run();
@@ -240,7 +240,7 @@
                 dsp_cfg::get().bp.K,
                 dsp_cfg::get().io.tmp_dir,
                 dsp_cfg::get().io.num_libraries,
-                "", // reference genome
+                std::vector(), // reference genome
                 1); // flanking range
 
         conj_gp.kmer_mapper.Attach();
diff -Nru spades-3.10.1+dfsg/src/projects/dipspades/haplotype_assembly/conservative_regions_searcher.hpp spades-3.11.1+dfsg/src/projects/dipspades/haplotype_assembly/conservative_regions_searcher.hpp
--- spades-3.10.1+dfsg/src/projects/dipspades/haplotype_assembly/conservative_regions_searcher.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/dipspades/haplotype_assembly/conservative_regions_searcher.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -148,7 +148,7 @@
         size_t cons_regions_length = ComputeSummaryLengthOfRegionInStorage(cons_reg_storage_.cons_regions_begin(),
                 cons_reg_storage_.cons_regions_end());
         if(cons_regions_length > 0){
-            string cons_regions_fname(path::append_path(dsp_cfg::get().io.output_dir,
+            string cons_regions_fname(fs::append_path(dsp_cfg::get().io.output_dir,
                     "conservative_regions.fasta").c_str());
             WriteConservativeRegionsStorageToFile(cons_regions_fname, cons_reg_storage_.cons_regions_begin(),
                     cons_reg_storage_.cons_regions_end());
@@ -159,7 +159,7 @@
         size_t poss_cons_regions_length = ComputeSummaryLengthOfRegionInStorage(cons_reg_storage_.poss_cons_regions_begin(),
                 cons_reg_storage_.poss_cons_regions_end());
         if(poss_cons_regions_length > 0){
-            string poss_cons_regions_fname(path::append_path(dsp_cfg::get().io.output_dir,
+            string poss_cons_regions_fname(fs::append_path(dsp_cfg::get().io.output_dir,
                     "possibly_conservative_regions.fasta").c_str());
//            INFO("Possibly conservative regions written in file " << poss_cons_regions_fname);
             WriteConservativeRegionsStorageToFile(poss_cons_regions_fname, cons_reg_storage_.poss_cons_regions_begin(),
diff -Nru spades-3.10.1+dfsg/src/projects/dipspades/haplotype_assembly/haplotype_assembler.hpp spades-3.11.1+dfsg/src/projects/dipspades/haplotype_assembly/haplotype_assembler.hpp
--- spades-3.10.1+dfsg/src/projects/dipspades/haplotype_assembly/haplotype_assembler.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/dipspades/haplotype_assembly/haplotype_assembler.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -42,7 +42,7 @@
             INFO("Consensus contigs number: " << composite_storage_->Size());
             separator.SeparateContigs();
             SignedLabels signed_labels = separator.GetSignedLabels();
-            string hapl_output(path::append_path(dsp_cfg::get().io.output_dir, "haplotype_assembly.out").c_str());
+            string hapl_output(fs::append_path(dsp_cfg::get().io.output_dir, "haplotype_assembly.out").c_str());
            signed_labels.WriteToFile(hapl_output, default_storage_);
             INFO("Result of haplotype assembly written in file " << hapl_output);
             INFO("Contigs separation ends");
diff -Nru spades-3.10.1+dfsg/src/projects/dipspades/main.cpp spades-3.11.1+dfsg/src/projects/dipspades/main.cpp
--- spades-3.10.1+dfsg/src/projects/dipspades/main.cpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/dipspades/main.cpp	2017-09-28 09:05:12.000000000 +0000
@@ -11,8 +11,8 @@
 
 #include "utils/logger/log_writers.hpp"
 #include "utils/segfault_handler.hpp"
-#include "utils/memory_limit.hpp"
-#include "utils/copy_file.hpp"
+#include "utils/perf/memory_limit.hpp"
+#include "utils/filesystem/copy_file.hpp"
 
 #include "pipeline/graph_pack.hpp"
 #include "stages/construction.hpp"
@@ -35,11 +35,11 @@
     if (!make_dir(to)) {
         WARN("Could not create files use in /tmp directory");
     }
-    path::copy_files_by_ext(path::parent_path(cfg_filename), to, ".info", true);
+    fs::copy_files_by_ext(fs::parent_path(cfg_filename), to, ".info", true);
 }
 
 void load_config(string cfg_filename) {
-    path::CheckFileExistenceFATAL(cfg_filename);
+    fs::CheckFileExistenceFATAL(cfg_filename);
     dsp_cfg::create_instance(cfg_filename);
     //  string path_to_copy = path::append_path(dsp_cfg::get().io.output_dir, "configs");
     //  copy_configs(cfg_filename, path_to_copy);
@@ -50,23 +50,23 @@
 
     string log_props_file = dsp_cfg::get().io.log_filename;
 
-    if (!path::FileExists(log_props_file)){
-        log_props_file = path::append_path(path::parent_path(cfg_filename), dsp_cfg::get().io.log_filename);
+    if (!fs::FileExists(log_props_file)){
+        log_props_file = fs::append_path(fs::parent_path(cfg_filename), dsp_cfg::get().io.log_filename);
     }
 
-    logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : "");
+    logger *lg = create_logger(fs::FileExists(log_props_file) ? log_props_file : "");
     lg->add_writer(std::make_shared());
     attach_logger(lg);
 }
 
 int main(int /*argc*/, char** argv) {
-    perf_counter pc;
+    utils::perf_counter pc;
     const size_t GB = 1 << 30;
 
     srand(42);
     srandom(42);
 
-    segfault_handler sh;
+    utils::segfault_handler sh;
 
     try {
         using namespace debruijn_graph;
@@ -74,7 +74,7 @@
         load_config (cfg_filename);
         make_dirs();
         if(dsp_cfg::get().rp.developer_mode)
-            copy_configs(cfg_filename, path::append_path(dsp_cfg::get().io.output_dir, "configs"));
+            copy_configs(cfg_filename, fs::append_path(dsp_cfg::get().io.output_dir, "configs"));
         create_console_logger(cfg_filename);
 
         INFO("Loaded config from " << cfg_filename);
@@ -82,7 +82,7 @@
         VERIFY(dsp_cfg::get().bp.K >= runtime_k::MIN_K && dsp_cfg::get().bp.K < runtime_k::MAX_K);
         VERIFY(dsp_cfg::get().bp.K % 2 != 0);
 
-        limit_memory(dsp_cfg::get().bp.max_memory * GB);
+        utils::limit_memory(dsp_cfg::get().bp.max_memory * GB);
 
         INFO("Starting dipSPAdes, built from " SPADES_GIT_REFSPEC ", git revision " SPADES_GIT_SHA1);
         INFO("Assembling dataset (" << dsp_cfg::get().io.dataset_name << ") with K=" << dsp_cfg::get().bp.K);
diff -Nru spades-3.10.1+dfsg/src/projects/dipspades/polymorphic_bulge_remover/polymorphic_bulge_remover.hpp spades-3.11.1+dfsg/src/projects/dipspades/polymorphic_bulge_remover/polymorphic_bulge_remover.hpp
--- spades-3.10.1+dfsg/src/projects/dipspades/polymorphic_bulge_remover/polymorphic_bulge_remover.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/dipspades/polymorphic_bulge_remover/polymorphic_bulge_remover.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -45,8 +45,8 @@
         size_t num_glued_bulges = 1;
         for(size_t num_iter = 1; num_glued_bulges > 0; num_iter++){
             num_glued_bulges = spath_br.Run();
-            CompressAllVertices(graph_pack_.g, false);
-            INFO(ToString(num_iter) + " iteration: " + ToString(num_glued_bulges) + " simple bulges were glued");
+            CompressAllVertices(graph_pack_.g, 1, false);
+            INFO(std::to_string(num_iter) + " iteration: " + std::to_string(num_glued_bulges) + " simple bulges were glued");
         }
         INFO("Simple polymorphic bulge remover ends");
     }
@@ -63,8 +63,8 @@
         size_t num_glued_bulges = 1;
         for(size_t i = 0; (i < num_iters) && (num_glued_bulges != 0); i++){
             num_glued_bulges = br.Run();
-            CompressAllVertices(graph_pack_.g, false);
-            INFO(ToString(i + 1) + " iteration: " + ToString(num_glued_bulges) + " complex bulges were glued");
+            CompressAllVertices(graph_pack_.g, 1, false);
+            INFO(std::to_string(i + 1) + " iteration: " + std::to_string(num_glued_bulges) + " complex bulges were glued");
         }
         INFO(bulge_remover_name + " ends");
     }
diff -Nru spades-3.10.1+dfsg/src/projects/dipspades/utils/bulge_utils.hpp spades-3.11.1+dfsg/src/projects/dipspades/utils/bulge_utils.hpp
--- spades-3.10.1+dfsg/src/projects/dipspades/utils/bulge_utils.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/dipspades/utils/bulge_utils.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -105,7 +105,7 @@
     string GetPathStr(vector path) {
         string s1 = "";
         for(auto edge = path.begin(); edge != path.end(); edge++)
-            s1 = ToString(graph_.int_id(*edge)) + "-";
+            s1 = std::to_string(graph_.int_id(*edge)) + "-";
         return s1.substr(0, s1.size() - 1);
     }
diff -Nru spades-3.10.1+dfsg/src/projects/hammer/config_struct_hammer.cpp spades-3.11.1+dfsg/src/projects/hammer/config_struct_hammer.cpp
--- spades-3.10.1+dfsg/src/projects/hammer/config_struct_hammer.cpp	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/hammer/config_struct_hammer.cpp	2017-09-28 09:05:12.000000000 +0000
@@ -14,7 +14,7 @@
 
 #include "config_struct_hammer.hpp"
 #include "pipeline/config_common.hpp"
-#include "utils/openmp_wrapper.h"
+#include "utils/parallel/openmp_wrapper.h"
 
 #include
 #include
diff -Nru spades-3.10.1+dfsg/src/projects/hammer/hamcluster.cpp spades-3.11.1+dfsg/src/projects/hammer/hamcluster.cpp
--- spades-3.10.1+dfsg/src/projects/hammer/hamcluster.cpp	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/hammer/hamcluster.cpp	2017-09-28 09:05:12.000000000 +0000
@@ -7,7 +7,7 @@
 
 #include "hamcluster.hpp"
 
-#include "common/adt/concurrent_dsu.hpp"
+#include "adt/concurrent_dsu.hpp"
 #include "io/kmers/mmapped_reader.hpp"
 
 #include "parallel_radix_sort.hpp"
@@ -65,7 +65,7 @@
 }
 
 #if 1
-static bool canMerge(const ConcurrentDSU &uf, size_t x, size_t y) {
+static bool canMerge(const dsu::ConcurrentDSU &uf, size_t x, size_t y) {
   size_t szx = uf.set_size(x), szy = uf.set_size(y);
   const size_t hardthr = 2500;
@@ -88,7 +88,7 @@
 #endif
 
 
-static void processBlockQuadratic(ConcurrentDSU  &uf,
+static void processBlockQuadratic(dsu::ConcurrentDSU  &uf,
                                   const std::vector::iterator &block,
                                   size_t block_size,
                                   const KMerData &data,
@@ -110,7 +110,7 @@
 
 void KMerHamClusterer::cluster(const std::string &prefix,
                                const KMerData &data,
-                               ConcurrentDSU &uf) {
+                               dsu::ConcurrentDSU &uf) {
   // First pass - split & sort the k-mers
   std::string fname = prefix + ".first", bfname = fname + ".blocks", kfname = fname + ".kmers";
   std::ofstream bfs(bfname, std::ios::out | std::ios::binary);
@@ -209,7 +209,7 @@
   FULLY_LOCKED = 3
 };
 
-static bool canMerge2(const ConcurrentDSU &uf, size_t kidx, size_t cidx) {
+static bool canMerge2(const dsu::ConcurrentDSU &uf, size_t kidx, size_t cidx) {
   // If either of indices is fully locked - bail out
   uint64_t kaux = uf.root_aux(kidx), caux = uf.root_aux(cidx);
   if (kaux == FULLY_LOCKED || caux == FULLY_LOCKED)
@@ -224,7 +224,7 @@
   return true;
 }
 
-static void ClusterChunk(size_t start_idx, size_t end_idx, const KMerData &data, ConcurrentDSU &uf) {
+static void ClusterChunk(size_t start_idx, size_t end_idx, const KMerData &data, dsu::ConcurrentDSU &uf) {
   unsigned nthreads = cfg::get().general_max_nthreads;
 
   // INFO("Cluster: " << start_idx << ":" << end_idx);
@@ -274,7 +274,7 @@
   }
 }
 
-void TauOneKMerHamClusterer::cluster(const std::string &, const KMerData &data, ConcurrentDSU &uf) {
+void TauOneKMerHamClusterer::cluster(const std::string &, const KMerData &data, dsu::ConcurrentDSU &uf) {
   size_t start_idx = 0;
   while (start_idx < data.size()) {
     size_t end_idx = start_idx + 64*1024;
diff -Nru spades-3.10.1+dfsg/src/projects/hammer/hamcluster.hpp spades-3.11.1+dfsg/src/projects/hammer/hamcluster.hpp
--- spades-3.10.1+dfsg/src/projects/hammer/hamcluster.hpp	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/hammer/hamcluster.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -17,8 +17,8 @@
 
 #include
 #include
+#include
 
-class ConcurrentDSU;
 
 typedef Seq<(hammer::K + 1) / 2, uint32_t> SubKMer;
@@ -144,7 +144,7 @@
 
   KMerHamClusterer(unsigned tau) : tau_(tau) {}
 
-  void cluster(const std::string &prefix, const KMerData &data, ConcurrentDSU &uf);
+  void cluster(const std::string &prefix, const KMerData &data, dsu::ConcurrentDSU &uf);
 private:
   DECL_LOGGER("Hamming Clustering");
 };
@@ -152,7 +152,7 @@
 class TauOneKMerHamClusterer {
  public:
   TauOneKMerHamClusterer() {}
-  void cluster(const std::string &prefix, const KMerData &data, ConcurrentDSU &uf);
+  void cluster(const std::string &prefix, const KMerData &data, dsu::ConcurrentDSU &uf);
 private:
   DECL_LOGGER("tau = 1 Hamming Clustering");
 };
diff -Nru spades-3.10.1+dfsg/src/projects/hammer/hammer_tools.cpp spades-3.11.1+dfsg/src/projects/hammer/hammer_tools.cpp
--- spades-3.10.1+dfsg/src/projects/hammer/hammer_tools.cpp	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/hammer/hammer_tools.cpp	2017-09-28 09:05:12.000000000 +0000
@@ -52,7 +52,7 @@
   ostringstream tmp;
   tmp.str("");
-  tmp << dirprefix.data() << "/" << path::basename(fname) << '.' << std::setfill('0') << std::setw(2) << iter_no << "." << suffix.data();
+  tmp << dirprefix.data() << "/" << fs::basename(fname) << '.' << std::setfill('0') << std::setw(2) << iter_no << "." << suffix.data();
   return tmp.str();
 }
diff -Nru spades-3.10.1+dfsg/src/projects/hammer/kmer_cluster.cpp spades-3.11.1+dfsg/src/projects/hammer/kmer_cluster.cpp
--- spades-3.10.1+dfsg/src/projects/hammer/kmer_cluster.cpp	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/hammer/kmer_cluster.cpp	2017-09-28 09:05:12.000000000 +0000
@@ -6,7 +6,7 @@
 //***************************************************************************
 
 #include "io/reads/ireadstream.hpp"
-#include "utils/openmp_wrapper.h"
+#include "utils/parallel/openmp_wrapper.h"
 
 #include "hammer_tools.hpp"
 #include "hamcluster.hpp"
@@ -424,16 +424,16 @@
     }
 
     if (centersInCluster[k] == -1u) {
-      unsigned new_idx = 0;
-      #pragma omp critical
-      {
-        KMer newkmer(bestCenters[k].center_);
-
-        KMerStat kms(0 /* cnt */, 1.0 /* total quality */, NULL /*quality */);
-        kms.mark_good();
-        new_idx = (unsigned)data_.push_back(newkmer, kms);
-        if (data_.kmer(data_.seq_idx(newkmer)) != newkmer)
+      KMer newkmer(bestCenters[k].center_);
+      size_t new_idx = data_.checking_seq_idx(newkmer);
+      if (new_idx == -1ULL) {
+        #pragma omp critical
+        {
+          KMerStat kms(0 /* cnt */, 1.0 /* total quality */, NULL /*quality */);
+          kms.mark_good();
+          new_idx = data_.push_back(newkmer, kms);
           newkmers += 1;
+        }
      }
      v.insert(v.begin(), new_idx);
    }
diff -Nru spades-3.10.1+dfsg/src/projects/hammer/kmer_data.cpp spades-3.11.1+dfsg/src/projects/hammer/kmer_data.cpp
--- spades-3.10.1+dfsg/src/projects/hammer/kmer_data.cpp	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/hammer/kmer_data.cpp	2017-09-28 09:05:12.000000000 +0000
@@ -12,11 +12,12 @@
 #include "io/reads/ireadstream.hpp"
 #include "config_struct_hammer.hpp"
 
-#include "utils/mph_index/kmer_index_builder.hpp"
+#include "utils/kmer_mph/kmer_index_builder.hpp"
+#include "utils/logger/logger.hpp"
 
 #include "io/kmers/kmer_iterator.hpp"
-#include "common/adt/bf.hpp"
-#include "common/adt/hll.hpp"
+#include "adt/bf.hpp"
+#include "adt/hll.hpp"
 
 using namespace hammer;
@@ -35,7 +36,7 @@
 };
 
 
-class HammerFilteringKMerSplitter : public KMerSortingSplitter {
+class HammerFilteringKMerSplitter : public utils::KMerSortingSplitter {
  public:
  typedef std::function KMerFilter;
 
@@ -44,7 +45,7 @@
       : KMerSortingSplitter(work_dir, hammer::K),
         filter_(std::move(filter)) {}
 
-  path::files_t Split(size_t num_files) override;
+  fs::files_t Split(size_t num_files, unsigned nthreads) override;
 
  private:
  KMerFilter filter_;
@@ -84,13 +85,10 @@
   }
 };
 
-path::files_t HammerFilteringKMerSplitter::Split(size_t num_files) {
-  unsigned nthreads = std::min(cfg::get().count_merge_nthreads, cfg::get().general_max_nthreads);
+fs::files_t HammerFilteringKMerSplitter::Split(size_t num_files, unsigned nthreads) {
   size_t reads_buffer_size = cfg::get().count_split_buffer;
 
-  INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
-
-  path::files_t out = PrepareBuffers(num_files, nthreads, reads_buffer_size);
+  fs::files_t out = PrepareBuffers(num_files, nthreads, reads_buffer_size);
 
   size_t n = 15, processed = 0;
   BufferFiller filler(*this);
@@ -335,22 +333,22 @@
     // FIXME: Reduce code duplication
     HammerFilteringKMerSplitter splitter(workdir, [&] (const KMer &k) { return mcounter.count(k) > 1; });
-    KMerDiskCounter counter(workdir, splitter);
+    utils::KMerDiskCounter counter(workdir, splitter);
 
-    kmers = KMerIndexBuilder(workdir, num_files_, omp_get_max_threads()).BuildIndex(data.index_, counter, /* save final */ true);
+    kmers = utils::KMerIndexBuilder(workdir, num_files_, omp_get_max_threads()).BuildIndex(data.index_, counter, /* save final */ true);
     final_kmers = counter.GetFinalKMersFname();
   } else {
     HammerFilteringKMerSplitter splitter(workdir);
-    KMerDiskCounter counter(workdir, splitter);
+    utils::KMerDiskCounter counter(workdir, splitter);
 
-    kmers = KMerIndexBuilder(workdir, num_files_, omp_get_max_threads()).BuildIndex(data.index_, counter, /* save final */ true);
+    kmers = utils::KMerIndexBuilder(workdir, num_files_, omp_get_max_threads()).BuildIndex(data.index_, counter, /* save final */ true);
     final_kmers = counter.GetFinalKMersFname();
   }
 
   // Check, whether we'll ever have enough memory for running BH and bail out earlier
   double needed = 1.25 * (double)kmers * (sizeof(KMerStat) + sizeof(hammer::KMer));
-  if (needed > (double) get_memory_limit())
+  if (needed > (double) utils::get_memory_limit())
    FATAL_ERROR("The reads contain too many k-mers to fit into available memory. You need approx. "
                << needed / 1024.0 / 1024.0 / 1024.0 << "GB of free RAM to assemble your dataset");
diff -Nru spades-3.10.1+dfsg/src/projects/hammer/kmer_data.hpp spades-3.11.1+dfsg/src/projects/hammer/kmer_data.hpp
--- spades-3.10.1+dfsg/src/projects/hammer/kmer_data.hpp	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/hammer/kmer_data.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -9,16 +9,19 @@
 #define __HAMMER_KMER_DATA_HPP__
 
 #include "kmer_stat.hpp"
-#include "common/adt/array_vector.hpp"
-#include "utils/mph_index/kmer_index.hpp"
+#include "adt/array_vector.hpp"
+
+#include "utils/kmer_mph/kmer_index.hpp"
+#include "utils/logger/logger.hpp"
+
 #include
 
-typedef KMerIndex > HammerKMerIndex;
+typedef utils::KMerIndex > HammerKMerIndex;
 
 class KMerData {
   typedef std::vector KMerDataStorageType;
   typedef std::vector KMerStorageType;
-  typedef kmer_index_traits traits;
+  typedef utils::kmer_index_traits traits;
 
  public:
  KMerData()
@@ -114,7 +117,7 @@
   }
 
 private:
-  array_vector kmers_;
+  adt::array_vector kmers_;
 
   KMerDataStorageType data_;
   KMerStorageType kmer_push_back_buffer_;
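[Editor's note: the early out-of-memory check in kmer_data.cpp above estimates the peak as 1.25 × (stat record + packed k-mer) bytes per distinct k-mer and aborts before clustering if that exceeds the limit. A hedged restatement of the arithmetic; the byte sizes in the example are illustrative, not the real sizeof values.]

#include <cstddef>

bool EnoughMemoryForBH(size_t distinct_kmers, size_t memory_limit_bytes,
                       size_t kmer_stat_bytes, size_t kmer_bytes) {
  // 1.25x safety factor over the raw per-k-mer footprint.
  double needed = 1.25 * double(distinct_kmers) * double(kmer_stat_bytes + kmer_bytes);
  return needed <= double(memory_limit_bytes);
}

// Example: 4e9 distinct k-mers with a hypothetical 16-byte stat record and an
// 8-byte packed k-mer need about 1.25 * 4e9 * 24 bytes, i.e. roughly 112 GiB.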
+86,7 @@ // hard memory limit const size_t GB = 1 << 30; - limit_memory(cfg::get().general_hard_memory_limit * GB); + utils::limit_memory(cfg::get().general_hard_memory_limit * GB); // determine quality offset if not specified if (!cfg::get().input_qvoffset_opt) { @@ -150,7 +150,7 @@ // Cluster the Hamming graph std::vector > classes; if (cfg::get().hamming_do || do_everything) { - ConcurrentDSU uf(Globals::kmer_data->size()); + dsu::ConcurrentDSU uf(Globals::kmer_data->size()); std::string ham_prefix = hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmers.hamcls"); INFO("Clustering Hamming graph."); if (cfg::get().general_tau > 1) { diff -Nru spades-3.10.1+dfsg/src/projects/hammer/parallel_radix_sort.hpp spades-3.11.1+dfsg/src/projects/hammer/parallel_radix_sort.hpp --- spades-3.10.1+dfsg/src/projects/hammer/parallel_radix_sort.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/hammer/parallel_radix_sort.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -36,7 +36,7 @@ #ifndef PARALLEL_RADIX_SORT_H_ #define PARALLEL_RADIX_SORT_H_ -#include "utils/openmp_wrapper.h" +#include "utils/parallel/openmp_wrapper.h" #include #include diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/CMakeLists.txt spades-3.11.1+dfsg/src/projects/ionhammer/CMakeLists.txt --- spades-3.10.1+dfsg/src/projects/ionhammer/CMakeLists.txt 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/CMakeLists.txt 2017-09-28 09:05:13.000000000 +0000 @@ -11,16 +11,30 @@ add_executable(ionhammer kmer_data.cpp - hamcluster.cpp subcluster.cpp err_helper_table.cpp config_struct.cpp - expander.cpp + kmer_helpers.cpp seqeval/BaseHypothesisEvaluator.cpp seqeval/TreephaserLite.cpp - main.cpp) + main.cpp + reference.cpp + quality_metrics.cpp + quality_thresholds_estimator.cpp + hamcluster_1.cpp + gamma_poisson_model.cpp + normal_quality_model.cpp) + +add_executable(kmer_evaluator + kmer_data.cpp + kmer_evaluator.cpp + kmer_helpers.cpp + err_helper_table.cpp) + + target_link_libraries(ionhammer input utils pipeline mph_index BamTools ${COMMON_LIBRARIES}) +target_link_libraries(kmer_evaluator input utils mph_index BamTools ${COMMON_LIBRARIES}) if (SPADES_STATIC_BUILD) set_target_properties(ionhammer PROPERTIES LINK_SEARCH_END_STATIC 1) @@ -28,6 +42,8 @@ install(TARGETS ionhammer RUNTIME DESTINATION bin) +#install(TARGETS kmer_evaluator +# RUNTIME DESTINATION bin) install(DIRECTORY "${SPADES_CFG_DIR}/ionhammer" DESTINATION share/spades/configs FILES_MATCHING PATTERN "*.cfg") diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/comparator.py spades-3.11.1+dfsg/src/projects/ionhammer/comparator.py --- spades-3.10.1+dfsg/src/projects/ionhammer/comparator.py 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/comparator.py 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import subprocess +import os +import argparse +import os.path +import sys + +import pandas as pd + +def exit_with_error(message): + sys.stderr.write("Error: {__message}\n".format(__message=message)) + sys.exit(1) + +class Comparator(object): + + @staticmethod + def improved_count(corrected): + return corrected.query('levDistance < levDistance_baseline').shape[0] + + @staticmethod + def comparable_improved_count(corrected): + return corrected.query('levDistance < levDistance_baseline and comparable==True').shape[0] + + @staticmethod + def fill_comparable_aligments_flag(corrected): + corrected.eval('comparable = ((abs(refStart - 
refStart_baseline) < 15) and (abs(refEnd - refEnd_baseline) < 15))', inplace=True) + return corrected + + + @staticmethod + def corrupted_count(corrected): + corrupted = corrected.query('levDistance > levDistance_baseline and comparable==True') + print("Corrupted") + print(corrupted.index) + return corrupted.shape[0] + + @staticmethod + def full_fixed_count(corrected): + return corrected.query("hammingDistance == 0 and hammingDistance_baseline != 0").shape[0] + + @staticmethod + def comparable_full_fixed_count(corrected): + return corrected.query("hammingDistance == 0 and hammingDistance_baseline != 0 and comparable==True").shape[0] + + @staticmethod + def stats_filename(path): + return path + ".stats.tsv.gz" + + + def print_count(self, count): + return "{__count:.0f} ({__percent:.2f}%)".format(__count=count, __percent=100.0 * count / self.baseline.shape[0]) + + def calc_stats(self, path): + corrected = pd.read_csv(Comparator.stats_filename(path), sep="\t") + #wtf, ids after correction contains some trash values in suffix + corrected["id"]=corrected["id"].apply(lambda x : x.split("_")[0]) + corrected.set_index("id", inplace=True) + # sum_stats = corrected.sum(axis=0) + corrected = self.join_with_baseline(corrected) + corrected = self.fill_comparable_aligments_flag(corrected) + corrected.eval('gain = (hammingDistance_baseline - hammingDistance) / (hammingDistance_baseline + 1)') + mean_stats = corrected.query("comparable==True")[["hammingDistance", "levDistance", "gain"]].mean(axis=0) + print(corrected.shape) + corrected_dist_one = corrected.query("hammingDistance_baseline == 1") + print("Uncomparable reads") + uncomparable = corrected.query('comparable == False') + print(uncomparable.index) + comparable_count = corrected.query('comparable == True').shape[0] + corrupted = self.corrupted_count(corrected) + return pd.Series({"path" : path, + "comparable_count" : self.print_count(comparable_count), + "realigned_count" : self.print_count(uncomparable.shape[0]), + # "meanHammingDistance" : mean_stats["hammingDistance"], + "mean_lev_distance" : "{__dist:.4f} (x{__percent:.2f})".format(__dist=mean_stats["levDistance"], __percent=self.__mean_lev_dist / mean_stats["levDistance"]), + # "gain" : mean_stats["gain"], + # "total_insertions" : sum_stats["insertions"], + # "total_deletions" : sum_stats["deletions"], + # "total_mismatch" : sum_stats["mismatch"], + # "improved_count" : self.improved_count(corrected), + "improved_count" : self.print_count(self.comparable_improved_count(corrected)), + "corrupted_count" : self.print_count(corrupted), + "realigned_corrupted_count" : self.print_count(corrupted + uncomparable.shape[0]), + # "full_fixed_count" : self.full_fixed_count(corrected), + "full_fixed_count" : self.print_count(self.comparable_full_fixed_count(corrected)), + "one_error_corrupted_count" : self.print_count(self.corrupted_count(corrected_dist_one)), + "one_error_full_fixed_count" : self.print_count(self.comparable_full_fixed_count(corrected_dist_one))}) + + def join_with_baseline(self, corrected): + return corrected.join(self.baseline, rsuffix="_baseline") + + + def run_calc_stats_task(self, reads_path): + stats_path = self.stats_filename(reads_path) + if os.path.isfile(stats_path) and not self.force_recalc: + return + cmd = "java -Xmx64G -jar {__comparator_jar} {__reference} {__reads} {__stats_file}".format(__comparator_jar = self.comparator_jar, + __reference=self.reference, + __reads = reads_path, + __stats_file = stats_path) + subprocess.call(cmd, shell=True) + + + def 
print_baseline_stats(self): + print("Baseline distance stats:") + print("Mean distance") + print(self.baseline[["hammingDistance", "levDistance"]].mean(axis=0)) + self.__mean_lev_dist = self.baseline[["levDistance"]].mean(axis=0)[0] + print("Error sums:") + print(self.baseline.drop(['levDistance', 'hammingDistance'], axis=1).sum()) + + def __init__(self, reference_path, baseline_path, force_recalc, comparator_jar = "~/comparator.jar"): + self.force_recalc = force_recalc + self.reference = reference_path + self.comparator_jar = comparator_jar + self.results = [] + self.__mean_lev_dist = 0 + self.run_calc_stats_task(baseline_path) + self.baseline = pd.read_csv(Comparator.stats_filename(baseline_path), sep="\t") + self.baseline.set_index("id", inplace=True) + self.print_baseline_stats() + + def add(self, path): + self.run_calc_stats_task(path) + self.results.append(self.calc_stats(path)) + + def save_results(self, path): + result = pd.DataFrame(self.results) + result.set_index("path", inplace=True) + result = result.T + result.to_latex(path, float_format="%.4f") + + + +class Mapper(object): + def __init__(self, reference_path, force_remap): + self.samtools_cmd = "tmap mapall -f {__reference}".format(__reference=reference_path) \ + + " -i {__input_type} -o 1 -O 2 -g 3 -s {__output} -o 0 -v stage1 map1 map2 map3 map4" + self.samtools_view = "samtools view -h {__input_file}" + self.zcat_view = "zcat {__input_file}" + self.cat_view = "cat {__input_file}" + self.force_remap = force_remap + + @staticmethod + def get_reads_format(path): + if path.endswith("sam"): + return "sam" + elif path.endswith("bam"): + return "sam" + elif path.endswith("fasta.gz") or path.endswith("fasta"): + return "fasta" + elif path.endswith("fastq"): + return "fastq" + + + @staticmethod + def is_sam_or_bam(path): + return path.endswith("sam") or path.endswith("bam") + + @staticmethod + def is_fasta_gzip(path): + return path.endswith("fasta.gz") + + @staticmethod + def is_fasta_or_fastq(path): + return path.endswith("fasta") or path.endswith("fastq") + + def get_view_cmd(self, path): + if Mapper.is_sam_or_bam(path): + return self.samtools_view + elif Mapper.is_fasta_gzip(path): + return self.zcat_view + elif Mapper.is_fasta_or_fastq(path): + return self.cat_view + else: + exit_with_error("Unknown extension for file " + path) + + def map_reads(self, path): + mapped_path = self.mapped_filename(path) + if os.path.isfile(mapped_path) and not self.force_remap: + return + view_cmd = self.get_view_cmd(path).format(__input_file=path) + map_cmd = self.samtools_cmd.format(__output=self.mapped_filename(path), __input_type=self.get_reads_format(path)) + cmd = "{__view_cmd} | {__map_cmd}".format(__view_cmd=view_cmd, __map_cmd=map_cmd) + print("Running tmap command: " + cmd) + subprocess.call(cmd, shell=True) + + @staticmethod + def mapped_filename(path): + return path + ".mapped.sam" + + def run_task(self, path): + self.map_reads(path) + return self.mapped_filename(path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Compute reads quality for correction') + parser.add_argument('--reference', dest='reference', help="Reference fasta file") + parser.add_argument("--force", help="Force recalc all stats", dest='recalc', action="store_true") + parser.add_argument("--reads", help="Baseline reads", dest='reads') + parser.add_argument("--corrected-reads", help="Corrected reads", dest='corrected_reads', nargs="+") + args = parser.parse_args() + + mapper = Mapper(args.reference, args.recalc) + + 
mapped_baseline_path = mapper.run_task(args.reads) + comparator = Comparator(args.reference, mapped_baseline_path, args.recalc) + + for corrected_path in args.corrected_reads: + mapped_reads_path = mapper.run_task(corrected_path) + comparator.add(mapped_reads_path) + comparator.save_results("corrections_quality.tex") + + diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/config_struct.cpp spades-3.11.1+dfsg/src/projects/ionhammer/config_struct.cpp --- spades-3.10.1+dfsg/src/projects/ionhammer/config_struct.cpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/config_struct.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -7,7 +7,7 @@ #include "config_struct.hpp" -#include "utils/openmp_wrapper.h" +#include "utils/parallel/openmp_wrapper.h" #include "llvm/Support/YAMLParser.h" #include "llvm/Support/YAMLTraits.h" @@ -16,69 +16,107 @@ using namespace llvm; -namespace llvm { namespace yaml { +namespace llvm { +namespace yaml { template <> struct ScalarEnumerationTraits { - static void enumeration(yaml::IO &io, hammer_config::HammerStage &value) { - io.enumCase(value, "count", hammer_config::HammerStage::KMerCounting); - io.enumCase(value, "hamcluster", hammer_config::HammerStage::HammingClustering); - io.enumCase(value, "subcluster", hammer_config::HammerStage::SubClustering); - io.enumCase(value, "correct", hammer_config::HammerStage::ReadCorrection); - } + static void enumeration(yaml::IO &io, hammer_config::HammerStage &value) { + io.enumCase(value, "count", hammer_config::HammerStage::KMerCounting); + io.enumCase(value, "hamcluster", + hammer_config::HammerStage::HammingClustering); + io.enumCase(value, "subcluster", hammer_config::HammerStage::SubClustering); + io.enumCase(value, "correct", hammer_config::HammerStage::ReadCorrection); + } }; -}} + + template <> + struct ScalarEnumerationTraits { + static void enumeration(yaml::IO &io, hammer_config::CenterType &value) { + io.enumCase(value, "count_max", hammer_config::CenterType::COUNT_ARGMAX); + io.enumCase(value, "consensus", hammer_config::CenterType::CONSENSUS); + io.enumCase(value, "posterior_consensus", hammer_config::CenterType::BY_POSTERIOR_QUALITY); + } + }; +} // namespace yaml +} // namespace llvm // FIXME: This is temporary class DataSetReader { - public: - DataSetReader(yaml::IO&) {} - DataSetReader(yaml::IO&, io::DataSet<>&) {} + public: + DataSetReader(yaml::IO &) {} + DataSetReader(yaml::IO &, io::DataSet<> &) {} - io::DataSet<> denormalize(yaml::IO &) { - return io::DataSet<>(path); - } + io::DataSet<> denormalize(yaml::IO &) { return io::DataSet<>(path); } - std::string path; + std::string path; }; -namespace llvm { namespace yaml { +namespace llvm { +namespace yaml { template <> struct MappingTraits { - static void mapping(yaml::IO &io, hammer_config::hammer_config &cfg) { - yaml::MappingNormalization> dataset(io, cfg.dataset); - - io.mapRequired("dataset", dataset->path); - io.mapOptional("working_dir", cfg.working_dir, std::string(".")); - io.mapOptional("output_dir", cfg.output_dir, std::string(".")); - io.mapRequired("hard_memory_limit", cfg.hard_memory_limit); - io.mapOptional("count_split_buffer", cfg.count_split_buffer, 0ul); - io.mapOptional("max_nthreads", cfg.max_nthreads, 1u); - io.mapRequired("kmer_qual_threshold", cfg.kmer_qual_threshold); - io.mapRequired("center_qual_threshold", cfg.center_qual_threshold); - io.mapRequired("delta_score_threshold", cfg.delta_score_threshold); - io.mapRequired("keep_uncorrected_ends", cfg.keep_uncorrected_ends); - io.mapRequired("tau", cfg.tau); 
- io.mapOptional("debug_mode", cfg.debug_mode, false); - io.mapOptional("start_stage", cfg.start_stage, hammer_config::HammerStage::KMerCounting); - } + static void mapping(yaml::IO &io, hammer_config::hammer_config &cfg) { + yaml::MappingNormalization> dataset( + io, cfg.dataset); + + io.mapRequired("dataset", dataset->path); + io.mapOptional("working_dir", cfg.working_dir, std::string(".")); + io.mapOptional("output_dir", cfg.output_dir, std::string(".")); + io.mapRequired("hard_memory_limit", cfg.hard_memory_limit); + io.mapOptional("count_split_buffer", cfg.count_split_buffer, 0ul); + io.mapOptional("max_nthreads", cfg.max_nthreads, 1u); + + io.mapOptional("oracle_path", cfg.oracle_path, std::string("")); + io.mapOptional("max_full_del", cfg.max_full_del, 1u); + io.mapOptional("max_second_indel", cfg.max_second_indel, 1u); + io.mapOptional("max_indel", cfg.max_indel, 3u); + io.mapOptional("max_from_zero_insertion", cfg.max_from_zero_insertion, 1u); + + io.mapOptional("sample_rate", cfg.sample_rate, 1.0); + io.mapOptional("subcluster_min_count", cfg.subcluster_min_count, 15u); + io.mapOptional("good_threshold", cfg.good_threshold, -0.69); + io.mapOptional("skip_threshold", cfg.skip_threshold, -0.01); + io.mapOptional("subcluster_threshold", cfg.subcluster_threshold, -0.001); + io.mapOptional("subcluster_filter_by_count", cfg.subcluster_filter_by_count_enabled, true); + io.mapOptional("queue_limit_multiplier", cfg.queue_limit_multiplier, 500); + io.mapOptional("dist_one_subcluster_alpha", cfg.dist_one_subcluster_alpha, 0.51); + io.mapOptional("subcluster_qual_mult", cfg.subcluster_qual_mult, 1.0); + io.mapOptional("subcluster_count_mult", cfg.subcluster_count_mult, 0.3); + io.mapOptional("correction_penalty", cfg.correction_penalty, -7.0); + io.mapOptional("bad_kmer_penalty", cfg.bad_kmer_penalty, -20.0); + io.mapOptional("count_dist_eps", cfg.count_dist_eps, 1e-3); + io.mapOptional("count_dist_skip_quantile", cfg.count_dist_skip_quantile, 0.05); + io.mapOptional("noise_filter_count_threshold", cfg.noise_filter_count_threshold, 3u); + io.mapOptional("center_type", cfg.center_type, hammer_config::CenterType::COUNT_ARGMAX); + + + io.mapRequired("kmer_qual_threshold", cfg.kmer_qual_threshold); + io.mapRequired("center_qual_threshold", cfg.center_qual_threshold); + io.mapRequired("delta_score_threshold", cfg.delta_score_threshold); + io.mapRequired("keep_uncorrected_ends", cfg.keep_uncorrected_ends); + io.mapRequired("tau", cfg.tau); + io.mapOptional("debug_mode", cfg.debug_mode, false); + io.mapOptional("start_stage", cfg.start_stage, + hammer_config::HammerStage::KMerCounting); + } }; -}} +} // namespace yaml +} // namespace llvm namespace hammer_config { -void load(hammer_config& cfg, const std::string &filename) { - ErrorOr> Buf = MemoryBuffer::getFile(filename); - if (!Buf) - throw(std::string("Failed to load config file ") + filename); - - yaml::Input yin(*Buf.get()); - yin >> cfg; - - if (yin.error()) - throw(std::string("Failed to load config file ") + filename); - - // Fix number of threads according to OMP capabilities. 
- cfg.max_nthreads = std::min(cfg.max_nthreads, (unsigned)omp_get_max_threads()); - // Inform OpenMP runtime about this :) - omp_set_num_threads(cfg.max_nthreads); -} +void load(hammer_config &cfg, const std::string &filename) { + ErrorOr> Buf = MemoryBuffer::getFile(filename); + if (!Buf) throw(std::string("Failed to load config file ") + filename); + + yaml::Input yin(*Buf.get()); + yin >> cfg; + + if (yin.error()) throw(std::string("Failed to load config file ") + filename); + + // Fix number of threads according to OMP capabilities. + cfg.max_nthreads = + std::min(cfg.max_nthreads, (unsigned)omp_get_max_threads()); + // Inform OpenMP runtime about this :) + omp_set_num_threads(cfg.max_nthreads); } +} // namespace hammer_config diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/config_struct.hpp spades-3.11.1+dfsg/src/projects/ionhammer/config_struct.hpp --- spades-3.10.1+dfsg/src/projects/ionhammer/config_struct.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/config_struct.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -9,7 +9,6 @@ #define __HAMMER_IT_CONFIG_HPP__ #include "pipeline/config_singl.hpp" - #include "pipeline/library.hpp" namespace hammer_config { @@ -20,7 +19,10 @@ ReadCorrection = 4 }; -struct hammer_config { +enum class CenterType { COUNT_ARGMAX, CONSENSUS, BY_POSTERIOR_QUALITY }; + + + struct hammer_config { io::DataSet<> dataset; std::string working_dir; @@ -39,11 +41,36 @@ bool debug_mode; HammerStage start_stage; + + double sample_rate = 1.0; + unsigned max_full_del = 1; + unsigned max_indel = 3; + unsigned max_second_indel = 1; + unsigned max_from_zero_insertion = 1; + std::string oracle_path = ""; + + unsigned subcluster_min_count = 15; + double good_threshold = -0.69; + double skip_threshold = -0.01; + double subcluster_threshold = -0.001; + bool subcluster_filter_by_count_enabled = true; + int queue_limit_multiplier = 200; + double dist_one_subcluster_alpha = 0.6; + double subcluster_qual_mult = 1.0; + double subcluster_count_mult = 0.4; + + double correction_penalty = -7; + double bad_kmer_penalty = -20; + double count_dist_eps = 1e-3; + double count_dist_skip_quantile = 0.05; + + unsigned noise_filter_count_threshold = 3; + CenterType center_type = CenterType::COUNT_ARGMAX; }; -void load(hammer_config& cfg, const std::string &filename); -} +void load(hammer_config& cfg, const std::string& filename); +} // namespace hammer_config typedef config_common::config cfg; -#endif // __HAMMER_IT_CONFIG_HPP__ +#endif // __HAMMER_IT_CONFIG_HPP__ diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/consensus.hpp spades-3.11.1+dfsg/src/projects/ionhammer/consensus.hpp --- spades-3.10.1+dfsg/src/projects/ionhammer/consensus.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/consensus.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -10,14 +10,14 @@ #include "HSeq.hpp" -#include #include +#include namespace hammer { namespace iontorrent { -inline -std::pair consensus(const boost::numeric::ublas::matrix& scores) { +inline std::pair consensus( + const boost::numeric::ublas::matrix& scores) { double inf = -std::numeric_limits::infinity(); double max = inf; @@ -34,7 +34,7 @@ return std::make_pair(hammer::HomopolymerRun(nucl, len), max); } -}; -}; +}; // namespace iontorrent +}; // namespace hammer -#endif // __HAMMER_IT_CONSENSUS_HPP__ +#endif // __HAMMER_IT_CONSENSUS_HPP__ diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/err_helper_table.cpp spades-3.11.1+dfsg/src/projects/ionhammer/err_helper_table.cpp --- 
spades-3.10.1+dfsg/src/projects/ionhammer/err_helper_table.cpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/err_helper_table.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -25,15 +25,13 @@ // (2 * 4^^2) / 32, // (2 * 4^^4) / 32, // ... -const HelperTable helper_tables[] = { - { 1, helper_table_data }, - { 2, helper_table_data + 1 }, - { 3, helper_table_data + 17 }, - { 4, helper_table_data + 273 }, - { 5, helper_table_data + 4369 } -}; +const HelperTable helper_tables[] = {{1, helper_table_data}, + {2, helper_table_data + 1}, + {3, helper_table_data + 17}, + {4, helper_table_data + 273}, + {5, helper_table_data + 4369}}; -}; // namespace internal +}; // namespace internal -}; // namespace errHelper -}; // namespace hammer +}; // namespace errHelper +}; // namespace hammer diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/err_helper_table.hpp spades-3.11.1+dfsg/src/projects/ionhammer/err_helper_table.hpp --- spades-3.10.1+dfsg/src/projects/ionhammer/err_helper_table.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/err_helper_table.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -10,11 +10,11 @@ #include "hkmer.hpp" -#include +#include +#include #include #include -#include -#include +#include #include "utils/logger/logger.hpp" @@ -23,11 +23,7 @@ namespace errHelper { /// Type of error -enum Hint { - kMismatch, - kInsertion, - kDeletion -}; +enum Hint { kMismatch, kInsertion, kDeletion }; namespace internal { @@ -36,38 +32,34 @@ struct HelperTable { const unsigned k_; - const uint32_t* storage_; + const uint32_t *storage_; template - Hint lookupHint(const It1 &x_it, const It2 &y_it, - size_t x_nfront, size_t y_nfront) const { - + Hint lookupHint(const It1 &x_it, const It2 &y_it, size_t x_nfront, + size_t y_nfront) const { VERIFY(k_ <= MAX_K); unsigned x_code = getCode(x_it, x_nfront, k_); unsigned y_code = getCode(y_it, y_nfront, k_); unsigned code = x_code + (y_code << (2 * k_)); - uint32_t bt = storage_[code / 16]; // 16 hints per uint32_t + uint32_t bt = storage_[code / 16]; // 16 hints per uint32_t unsigned shift = (code % 16) * 2; return static_cast((bt >> shift) & 0x3); } template - static unsigned getCode(const HRunIter& x_it, size_t x_nfront, size_t k) { + static unsigned getCode(const HRunIter &x_it, size_t x_nfront, size_t k) { unsigned code = 0; unsigned len = 0; auto nucl = x_it->nucl; - for (len = 0; len < x_nfront && len < k; ++len) - code |= nucl << (2 * len); + for (len = 0; len < x_nfront && len < k; ++len) code |= nucl << (2 * len); - if (len == k) - return code; + if (len == k) return code; - for (HRunIter it = x_it + 1; ; ++it) { + for (HRunIter it = x_it + 1;; ++it) { for (size_t i = 0; i < it->len; ++i) { code |= it->nucl << (2 * len++); - if (len == k) - return code; + if (len == k) return code; } } @@ -83,19 +75,17 @@ const HRunIter &x_end, size_t x_nfront) { size_t n = x_nfront; - if (n >= MAX_K) - return MAX_K; + if (n >= MAX_K) return MAX_K; for (HRunIter it = x_it + 1; it != x_end; ++it) { n += it->len; - if (n >= MAX_K) - return MAX_K; + if (n >= MAX_K) return MAX_K; } return n; } -}; // namespace internal +}; // namespace internal /// Estimate what kind of error occurred at the position template @@ -107,11 +97,11 @@ size_t x_rem = internal::getNumberOfRemainingBases(x_begin, x_end, x_nfront); size_t y_rem = internal::getNumberOfRemainingBases(y_begin, y_end, y_nfront); - auto& table = internal::helper_tables[std::min(x_rem, y_rem) - 1]; + auto &table = internal::helper_tables[std::min(x_rem, 
y_rem) - 1]; return table.lookupHint(x_begin, y_begin, x_nfront, y_nfront); } -}; // namespace errHelper -}; // namespace hammer +}; // namespace errHelper +}; // namespace hammer -#endif // __HAMMER_ERR_HELPER_TABLE_HPP__ +#endif // __HAMMER_ERR_HELPER_TABLE_HPP__ diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/expander.cpp spades-3.11.1+dfsg/src/projects/ionhammer/expander.cpp --- spades-3.10.1+dfsg/src/projects/ionhammer/expander.cpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/expander.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,60 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#include "expander.hpp" - -#include "config_struct.hpp" -#include "kmer_data.hpp" -#include "valid_hkmer_generator.hpp" - -#include "io/reads/file_reader.hpp" - -#include -#include - -bool Expander::operator()(const io::SingleRead &r) { - size_t sz = r.size(); - - std::vector covered_by_solid(sz, false); - std::vector kmer_indices(sz, -1ull); - - ValidHKMerGenerator gen(r); - while (gen.HasMore()) { - hammer::HKMer kmer = gen.kmer(); - size_t idx = data_.seq_idx(kmer), kl = kmer.size(); - size_t read_pos = gen.pos() - kl; - - kmer_indices[read_pos] = idx; - if (data_[idx].changeto == idx && - data_[idx].qual < cfg::get().center_qual_threshold) { - for (size_t j = read_pos; j < read_pos + kl; ++j) { - VERIFY_MSG(j < sz, "read_pos == " << read_pos << ", r.size() == " << r.size() << ", kmer: " << kmer << ", read: " << r.GetSequenceString()); - covered_by_solid[j] = true; - } - } - - gen.Next(); - } - - for (size_t j = 0; j < sz; ++j) { - if (!covered_by_solid[j] || kmer_indices[j] == -1ull) - continue; - - size_t idx = kmer_indices[j]; - auto &kmer_data = data_[idx]; - if (kmer_data.changeto != idx) { -# pragma omp atomic - changed_ += 1; - - kmer_data.lock(); - kmer_data.changeto = static_cast(idx); - kmer_data.unlock(); - } - } - - return false; -} diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/expander.hpp spades-3.11.1+dfsg/src/projects/ionhammer/expander.hpp --- spades-3.10.1+dfsg/src/projects/ionhammer/expander.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/expander.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,31 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
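// A standalone sketch of the two-bit packing scheme behind lookupHint() in
// the hunk above: sixteen 2-bit hints are stored per uint32_t, indexed by a
// combined code. The toy pack()/unpack() pair below demonstrates the exact
// index arithmetic (code / 16 selects the word, (code % 16) * 2 the shift);
// the real helper table is generated ahead of time, not packed at runtime.
#include <cassert>
#include <cstdint>
#include <vector>

enum Hint { kMismatch = 0, kInsertion = 1, kDeletion = 2 };

inline Hint unpack(const std::vector<uint32_t> &storage, unsigned code) {
  uint32_t word = storage[code / 16];  // 16 hints per uint32_t
  unsigned shift = (code % 16) * 2;    // 2 bits per hint
  return static_cast<Hint>((word >> shift) & 0x3);
}

inline void pack(std::vector<uint32_t> &storage, unsigned code, Hint h) {
  uint32_t &word = storage[code / 16];
  unsigned shift = (code % 16) * 2;
  word = (word & ~(0x3u << shift)) | (uint32_t(h) << shift);
}

int main() {
  std::vector<uint32_t> storage(4, 0);  // room for 64 hints
  pack(storage, 37, kDeletion);
  assert(unpack(storage, 37) == kDeletion);
  return 0;
}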
-//*************************************************************************** - -#ifndef __HAMMER_EXPANDER_HPP__ -#define __HAMMER_EXPANDER_HPP__ - -class KMerData; -namespace io { -class SingleRead; -} - -#include - -class Expander { - KMerData &data_; - size_t changed_; - - public: - Expander(KMerData &data) - : data_(data), changed_(0) {} - - size_t changed() const { return changed_; } - - bool operator()(const io::SingleRead &r); -}; - -#endif diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/flow_space_read.hpp spades-3.11.1+dfsg/src/projects/ionhammer/flow_space_read.hpp --- spades-3.10.1+dfsg/src/projects/ionhammer/flow_space_read.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/flow_space_read.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -8,11 +8,11 @@ #ifndef __HAMMER_IT_FLOW_SPACE_READ_HPP__ #define __HAMMER_IT_FLOW_SPACE_READ_HPP__ -#include "io/reads/single_read.hpp" #include "HSeq.hpp" +#include "io/reads/single_read.hpp" -#include #include +#include #include namespace hammer { @@ -21,6 +21,7 @@ class FlowSpaceRead { std::string name_; std::deque runs_; + public: FlowSpaceRead(const io::SingleRead& read) : name_(read.name()) { const auto& seq = read.GetSequenceString(); @@ -28,24 +29,15 @@ } template - FlowSpaceRead(It runs_beg, It runs_end) : - runs_(runs_beg, runs_end) {} + FlowSpaceRead(It runs_beg, It runs_end) : runs_(runs_beg, runs_end) {} - size_t size() const { - return runs_.size(); - } + size_t size() const { return runs_.size(); } - const std::string& name() const { - return name_; - } + const std::string& name() const { return name_; } - HomopolymerRun operator[](size_t index) const { - return runs_[index]; - } + HomopolymerRun operator[](size_t index) const { return runs_[index]; } - HomopolymerRun& operator[](size_t index) { - return runs_[index]; - } + HomopolymerRun& operator[](size_t index) { return runs_[index]; } void TrimLeft(size_t n_runs) { if (n_runs >= runs_.size()) @@ -63,15 +55,12 @@ std::string GetSequenceString() const { std::string seq; - for (size_t i = 0; i < runs_.size(); ++i) - seq += runs_[i].str(); + for (size_t i = 0; i < runs_.size(); ++i) seq += runs_[i].str(); return seq; } - const std::deque& data() const { - return runs_; - } + const std::deque& data() const { return runs_; } }; -} // namespace hammer +} // namespace hammer #endif diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/gamma_poisson_model.cpp spades-3.11.1+dfsg/src/projects/ionhammer/gamma_poisson_model.cpp --- spades-3.10.1+dfsg/src/projects/ionhammer/gamma_poisson_model.cpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/gamma_poisson_model.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,16 @@ +// +// Created by Vasiliy Ershov on 08/11/2016. +// + +#include "gamma_poisson_model.hpp" + +using namespace n_gamma_poisson_model; + +std::array PoissonGammaDistribution::log_gamma_integer_cache_ = + []() -> std::array { + std::array cache; + for (size_t i = 0; i < cache.size(); ++i) { + cache[i] = boost::math::lgamma(i + 1); + } + return cache; +}(); diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/gamma_poisson_model.hpp spades-3.11.1+dfsg/src/projects/ionhammer/gamma_poisson_model.hpp --- spades-3.10.1+dfsg/src/projects/ionhammer/gamma_poisson_model.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/gamma_poisson_model.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,869 @@ +// +// Created by Vasiliy Ershov on 08/11/2016. 
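// Two numeric ingredients of the new gamma-Poisson code are worth spelling
// out. First, the initialize-once idiom from gamma_poisson_model.cpp above:
// an immediately-invoked lambda fills a table of lgamma(i + 1) = log(i!), so
// hot paths avoid recomputing it (the 1024 below is illustrative; the real
// array size is hidden by the stripped template arguments). Second, the
// Poisson-Gamma marginal that PoissonGammaDistribution::LogLikelihood()
// computes: with a Gamma(shape a, rate b) prior on a Poisson mean, a count k
// is negative-binomially distributed, and conjugacy updates the prior to
// Gamma(a + k, b + 1) after one unit of exposure.
#include <array>
#include <cmath>
#include <cstddef>
#include <boost/math/special_functions/gamma.hpp>

static const std::array<double, 1024> kLogFactorial = [] {
  std::array<double, 1024> cache;
  for (size_t i = 0; i < cache.size(); ++i)
    cache[i] = boost::math::lgamma(double(i) + 1);  // lgamma(i + 1) == log(i!)
  return cache;
}();

// log P(k) = a log b - (a + k) log(b + 1) + lgamma(a + k) - log k! - lgamma(a)
inline double poissonGammaLogPmf(double a, double b, size_t k) {
  const double log_k_factorial = k < kLogFactorial.size()
                                     ? kLogFactorial[k]
                                     : boost::math::lgamma(double(k) + 1);
  return a * std::log(b) - (a + double(k)) * std::log(b + 1) +
         boost::math::lgamma(a + double(k)) - log_k_factorial -
         boost::math::lgamma(a);
}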
+// + +#ifndef PROJECT_GAMMA_POISSON_MODEL_HPP +#define PROJECT_GAMMA_POISSON_MODEL_HPP + +#include +#include +#include +#include +#include +#include "kmer_data.hpp" +#include "thread_utils.h" +#include "valid_hkmer_generator.hpp" +// + +namespace n_gamma_poisson_model { + +struct QualFunc { + double alpha_; + double beta_; + + double operator()(double x) const { return alpha_ * x + beta_; } + + double GenomicLogLikelihood(double x) const { + const double val = (*this)(x); + const double exp_point = exp(val); + return val - (std::isfinite(exp_point) ? log(1 + exp_point) : val); + } +}; + +class GammaDistribution { + private: + double shape_; + double rate_; + double log_gamma_at_shape_; + + public: + GammaDistribution(const GammaDistribution&) = default; + + GammaDistribution& operator=(const GammaDistribution&) = default; + + GammaDistribution(const double shape = 1, const double rate = 1) + : shape_(shape), rate_(rate) { + log_gamma_at_shape_ = boost::math::lgamma(shape_); + } + + inline double GetShape() const { return shape_; } + + inline double GetRate() const { return rate_; } + + inline double LogGammaAtShape() const { return log_gamma_at_shape_; } +}; + +class GammaMixture { + private: + GammaDistribution first_; + GammaDistribution second_; + double first_weight_; + + public: + GammaMixture() : first_(1, 1), second_(1, 1), first_weight_(-1) {} + + GammaMixture(const GammaDistribution& first, + const GammaDistribution& second, + double firstWeight) + : first_(first), second_(second), first_weight_(firstWeight) {} + + const GammaDistribution& GetFirst() const { return first_; } + + const GammaDistribution& GetSecond() const { return second_; } + + double GetFirstWeight() const { return first_weight_; } +}; + +class PoissonGammaDistribution { + private: + const GammaDistribution& prior_; + static std::array log_gamma_integer_cache_; + + private: + inline double IntLogGamma(size_t count) const { + if (count < log_gamma_integer_cache_.size()) { + return log_gamma_integer_cache_[count]; + } else { + return boost::math::lgamma(((double)count) + 1); + } + } + + public: + PoissonGammaDistribution(const GammaDistribution& prior) : prior_(prior) {} + + inline double PartialLogLikelihood(size_t count) const { + const double a = prior_.GetShape(); + const double b = prior_.GetRate(); + + double ll = 0.0; + ll += a * log(b) - (a + (double)count) * log(b + 1); + ll += + boost::math::lgamma(prior_.GetShape() + (double)count) - prior_.LogGammaAtShape(); + return ll; + } + + inline double LogLikelihood(size_t count) const { + const double a = prior_.GetShape(); + const double b = prior_.GetRate(); + + double ll = 0.0; + ll += a * log(b) - (a + (double)count) * log(b + 1); + ll += boost::math::lgamma(prior_.GetShape() + ((double)count)) - IntLogGamma(count) - + prior_.LogGammaAtShape(); + + return ll; + } + + inline double Quantile(double p) const { + const double a = prior_.GetShape(); + const double b = prior_.GetRate(); + return boost::math::ibeta_inva(a, 1.0 / (1.0 + b), 1.0 - p); + } + + inline double Cumulative(size_t count) const { + const double a = prior_.GetShape(); + const double b = prior_.GetRate(); + + return 1.0 - boost::math::ibeta((double)count + 1, a, 1.0 / (1.0 + b)); + } +}; + +constexpr int RunSizeLimit = 8; + +class ParametricClusterModel { + private: + GammaMixture prior_; + QualFunc qual_func_; + double count_threshold_; + std::array alphas_; + + public: + public: + ParametricClusterModel() : count_threshold_(100000) {} + + double ErrorRate(const int runSize) const { + auto 
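// A small sketch of what ParametricClusterModel::ExpectedErrorRate() just
// below computes: treating homopolymer runs independently, the chance of
// mutating one k-mer into another is approximated as the product of per-run
// error rates raised to the run-length differences. Everything here is
// illustrative; rates[] stands in for the fitted alphas_ table, and the
// index clamping done by ErrorRate() is elided.
#include <cmath>
#include <cstdlib>

inline double expectedErrorRate(const int *fromLens, const int *toLens, int k,
                                const double *rates) {
  double logRate = 0;
  for (int i = 0; i < k; ++i)
    logRate +=
        std::abs(fromLens[i] - toLens[i]) * std::log(rates[fromLens[i]]);
  return std::exp(logRate);  // product over runs of rate^|length difference|
}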
idx = runSize - 1; + idx = std::max(idx, 0); + idx = std::min(idx, (const int)(alphas_.size() - 1)); + return alphas_[idx] * (runSize == 0 ? 0.5 : 1); + } + + double ExpectedErrorRate(const hammer::HKMer& from, + const hammer::HKMer& to) const { + double errRate = 0; + for (uint i = 0; i < hammer::K; ++i) { + errRate += + std::abs(from[i].len - to[i].len) * log(ErrorRate(from[i].len)); + // errRate += std::abs(from[i].len - to[i].len) * + // log(ErrorRate(from[i].len)) - log(1.0 - ErrorRate(from[i].len)); + } + return exp(errRate); + } + + ParametricClusterModel(const GammaMixture& prior, + const QualFunc& qualFunc, + const double countThreshold, + const std::array<double, RunSizeLimit>& alphas) + : prior_(prior), qual_func_(qualFunc), count_threshold_(countThreshold) { + std::copy(alphas.begin(), alphas.end(), alphas_.begin()); + for (uint i = 0; i < RunSizeLimit; ++i) { + INFO("Run length " << i << " estimated error rate " << alphas_[i]); + } + } + + ParametricClusterModel(const ParametricClusterModel& other) = default; + + ParametricClusterModel& operator=(const ParametricClusterModel&) = default; + + double QualityLogPrior(double qual) const { + return max(min(qual_func_.GenomicLogLikelihood(qual), -1e-10), -1000.0); + } + + bool NeedSubcluster(const hammer::KMerStat& stat) const { + return qual_func_.GenomicLogLikelihood(stat.qual) > -0.1 && + stat.count >= count_threshold_; + } + + const GammaDistribution& GenomicPrior() const { return prior_.GetFirst(); } + + const GammaDistribution& NoisePrior() const { return prior_.GetSecond(); } + + double GenerateLogLikelihood(double expectedNoiseCount, + size_t noiseCount) const { + const auto& prior = NoisePrior(); + + GammaDistribution posterior(prior.GetShape() + expectedNoiseCount, + prior.GetRate() + 1); + return PoissonGammaDistribution(posterior).LogLikelihood(noiseCount); + } + + double GenomicLogLikelihood(size_t count) const { + const auto& prior = GenomicPrior(); + const double a = prior.GetShape(); + const double b = prior.GetRate(); + + double ll = a * log(b) - (a + (double)count) * log(b + 1); + ll += boost::math::lgamma(prior.GetShape() + ((double)count)) - + prior.LogGammaAtShape() - boost::math::lgamma(((double)count) + 1); + return ll; + } +}; + +// This class estimates the prior distribution.
+class TClusterModelEstimator { + private: + const KMerData& data_; + double threshold_; + uint num_threads_; + size_t max_terations_; + bool calc_likelihood_; + + private: + + struct TClusterSufficientStat { + double count_ = 0; + double qualtiy_ = 0; + double genomic_class_prob_ = 0; + }; + + struct TQualityStat { + double quality_ = 0; + double class_ = 0; + + TQualityStat(double quality, double cls) : quality_(quality), class_(cls) {} + }; + + struct TRunErrorStats { + const KMerData* data_; + std::array error_counts_; + std::array total_count_; + + TRunErrorStats(const KMerData& data) + : data_(&data){ + + }; + + std::array EstimateAlphas( + size_t priorSize = 100) const { + const double priors[] = {0.002, 0.004, 0.01, 0.02, + 0.035, 0.05, 0.09, 0.11}; + + std::array alphas; + for (uint i = 0; i < RunSizeLimit; ++i) { + alphas[i] = (error_counts_[i] + priors[i] * (double)priorSize) / + (total_count_[i] + (double)priorSize); + } + alphas[0] *= 2; + return alphas; + }; + + TRunErrorStats& operator+=(const TRunErrorStats& other) { + if (this != &other) { + for (uint i = 0; i < RunSizeLimit; ++i) { + error_counts_[i] += other.error_counts_[i]; + total_count_[i] += other.total_count_[i]; + } + } + return *this; + } + + void Add(const std::vector& indices, size_t centerIdx) { + const auto& center = (*data_)[centerIdx].kmer; + + for (auto idx : indices) { + if (idx == centerIdx) { + continue; + } + double errKmerCount = (double)(*data_)[idx].count; + const auto& errKmer = (*data_)[idx].kmer; + for (uint i = 0; i < hammer::K; ++i) { + if (center[i].len > RunSizeLimit) { + continue; + } + const int len = center[i].len - 1; + total_count_[len] += errKmerCount; + if (center[i].len != errKmer[i].len) { + error_counts_[len] += errKmerCount; + } + } + } + for (uint i = 0; i < hammer::K; ++i) { + if (center[i].len > RunSizeLimit) { + continue; + } + total_count_[center[i].len - 1] += (*data_)[centerIdx].count; + } + } + }; + + inline void Expectation(const PoissonGammaDistribution& first, + const PoissonGammaDistribution& second, + const QualFunc& qualFunc, + TClusterSufficientStat& center) const { + const double logPrior = qualFunc.GenomicLogLikelihood(center.qualtiy_) + + log(boost::math::gamma_q(center.count_, threshold_)); + + const double firstLL = first.PartialLogLikelihood((size_t)center.count_) + logPrior; + const double secondLL = second.PartialLogLikelihood((size_t)center.count_) + + log(max(1.0 - exp(logPrior), 1e-20)); + + const double posterior = 1.0 / (1.0 + exp(secondLL - firstLL)); + center.genomic_class_prob_ = posterior; + } + + inline void QualityExpectation(const QualFunc& qualFunc, + TClusterSufficientStat& center) const { + center.genomic_class_prob_ = + exp(qualFunc.GenomicLogLikelihood(center.qualtiy_)); + } + + inline TClusterSufficientStat Create(const size_t centerIdx) const { + TClusterSufficientStat stat; + stat.genomic_class_prob_ = + data_[centerIdx].count > 0 + ? 
boost::math::gamma_q(data_[centerIdx].count, threshold_) + : 0; + stat.count_ = data_[centerIdx].count; + stat.qualtiy_ = data_[centerIdx].qual; + return stat; + } + + std::vector<TClusterSufficientStat> CreateSufficientStats( + const std::vector<size_t>& clusterCenters) const { + std::vector<TClusterSufficientStat> clusterSufficientStat; + clusterSufficientStat.reserve(clusterCenters.size()); + + for (size_t i = 0; i < clusterCenters.size(); ++i) { + const size_t centerIdx = clusterCenters[i]; + auto stat = Create(centerIdx); + if (stat.count_ > 0) { + clusterSufficientStat.push_back(stat); + } + } + return clusterSufficientStat; + } + + std::vector<TQualityStat> CreateQualityStats( + const std::vector<std::vector<size_t>>& clusters, + const std::vector<size_t>& clusterCenters) const { + std::vector<TQualityStat> qualities; + qualities.reserve(clusterCenters.size()); + + for (size_t i = 0; i < clusterCenters.size(); ++i) { + const size_t centerIdx = clusterCenters[i]; + if (data_[centerIdx].count >= threshold_) { + for (auto idx : clusters[i]) { + if (idx != centerIdx) { + qualities.push_back(TQualityStat(data_[idx].qual, 0)); + } + } + qualities.push_back(TQualityStat(data_[centerIdx].qual, 1)); + } + } + return qualities; + } + + template <bool WEIGHTED> + class TCountsStat { + private: + double count_ = 0; + double count2_ = 0; + double weight_ = 0; + + public: + void Add(const TClusterSufficientStat& stat) { + const double w = (WEIGHTED ? stat.genomic_class_prob_ : 1.0); + count_ += w * stat.count_; + count2_ += w * stat.count_ * stat.count_; + weight_ += w; + } + + TCountsStat& operator+=(const TCountsStat& other) { + if (this != &other) { + count_ += other.count_; + count2_ += other.count2_; + weight_ += other.weight_; + } + return *this; + } + + double GetWeightedSum() const { return count_; } + + double GetWeightedSum2() const { return count2_; } + + double GetWeight() const { return weight_; } + }; + + class TLogGammaStat { + private: + double genomic_shape_; + double non_genomic_shape_; + double genomic_log_gamma_sum_ = 0; + double non_genomic_log_gamma_sum_ = 0; + + public: + TLogGammaStat(double genomicShape, double nonGenomicShape) + : genomic_shape_(genomicShape), non_genomic_shape_(nonGenomicShape) {} + + void Add(const TClusterSufficientStat& stat) { + genomic_log_gamma_sum_ += stat.genomic_class_prob_ * + boost::math::lgamma(stat.count_ + genomic_shape_); + non_genomic_log_gamma_sum_ += + (1.0 - stat.genomic_class_prob_) * + boost::math::lgamma(stat.count_ + non_genomic_shape_); + } + + TLogGammaStat& operator+=(const TLogGammaStat& other) { + if (this != &other) { + genomic_log_gamma_sum_ += other.genomic_log_gamma_sum_; + non_genomic_log_gamma_sum_ += other.non_genomic_log_gamma_sum_; + } + return *this; + } + + double GetGenomicLogGammaSum() const { return genomic_log_gamma_sum_; } + + double GetNonGenomicLogGammaSum() const { return non_genomic_log_gamma_sum_; } + }; + + class TQualityLogitLinearRegressionPoint { + private: + // p(genomic) = exp(Alpha qual + beta) / (1.0 + exp(Alpha qual + beta)) + QualFunc func_; + + double likelihood_ = 0; + + double der_alpha_ = 0; + double der_beta_ = 0; + + double der2_alpha_ = 0; + double der2_beta_ = 0; + double der2_alpha_beta_ = 0; + + public: + TQualityLogitLinearRegressionPoint(QualFunc func) : func_(func) {} + + void Add(const TClusterSufficientStat& statistic) { + Add(statistic.genomic_class_prob_, statistic.qualtiy_); + } + + void Add(const TQualityStat& statistic) { + Add(statistic.class_, statistic.quality_); + } + + void Add(const double firstClassProb, double qual) { + const double val = func_(qual); + const double expPoint =
exp(val); + const double p = + std::isfinite(expPoint) ? expPoint / (1.0 + expPoint) : 1.0; + + der_alpha_ += (firstClassProb - p) * qual; + der_beta_ += firstClassProb - p; + + der2_alpha_ -= sqr(qual) * p * (1 - p); + der2_beta_ -= p * (1 - p); + der2_alpha_beta_ -= qual * p * (1 - p); + + likelihood_ += firstClassProb * val - + (std::isfinite(expPoint) ? log(1 + expPoint) : val); + } + + TQualityLogitLinearRegressionPoint& operator+=( + const TQualityLogitLinearRegressionPoint& other) { + if (this != &other) { + likelihood_ += other.likelihood_; + + der_alpha_ += other.der_alpha_; + der_beta_ += other.der_beta_; + + der2_alpha_ += other.der2_alpha_; + der2_beta_ += other.der2_beta_; + der2_alpha_beta_ += other.der2_alpha_beta_; + } + return *this; + } + + double GetLikelihood() const { return likelihood_; } + + double GetDerAlpha() const { return der_alpha_; } + + double GetDerBeta() const { return der_beta_; } + + double GetDer2Alpha() const { return der2_alpha_; } + + double GetDer2Beta() const { return der2_beta_; } + + double GetDer2AlphaBeta() const { return der2_alpha_beta_; } + }; + + QualFunc Update(const QualFunc& current, + const TQualityLogitLinearRegressionPoint& pointStats) const { + const double dera = pointStats.GetDerAlpha(); + const double derb = pointStats.GetDerBeta(); + + const double daa = pointStats.GetDer2Alpha() + 1e-3; + const double dbb = pointStats.GetDer2Beta() + 1e-3; + const double dab = pointStats.GetDer2AlphaBeta(); + const double det = daa * dbb - sqr(dab); + + double stepAlpha = (dbb * dera - dab * derb) / det; + double stepBeta = (daa * derb - dab * dera) / det; + + INFO("Quality estimation iteration gradient: " << dera << " " << derb); + INFO("Quality estimation likelihood: " << pointStats.GetLikelihood()); + + return {current.alpha_ - stepAlpha, current.beta_ - stepBeta}; + } + + class TGammaDerivativesStats { + private: + double first_class_shift_; + double second_class_shift_; + + double digamma_sum_first_ = 0; + double trigamma_sum_first_ = 0; + + double digamma_sum_second_ = 0; + double trigamma_sum_second_ = 0; + + public: + TGammaDerivativesStats(double firstShift, double secondShift) + : first_class_shift_(firstShift), second_class_shift_(secondShift) {} + + void Add(const TClusterSufficientStat& statistic) { + const double p = statistic.genomic_class_prob_; + digamma_sum_first_ += + p > 1e-3 ? p * boost::math::digamma(statistic.count_ + first_class_shift_) + : 0; + trigamma_sum_first_ += + p > 1e-3 + ? p * boost::math::trigamma(statistic.count_ + first_class_shift_) + : 0; + + digamma_sum_second_ += + p < (1.0 - 1e-3) ? (1.0 - p) * boost::math::digamma(statistic.count_ + + second_class_shift_) + : 0; + trigamma_sum_second_ += + p < (1.0 - 1e-3) ? 
(1.0 - p) * boost::math::trigamma(statistic.count_ + + second_class_shift_) + : 0; + } + + TGammaDerivativesStats& operator+=(const TGammaDerivativesStats& other) { + if (this != &other) { + digamma_sum_first_ += other.digamma_sum_first_; + trigamma_sum_first_ += other.trigamma_sum_first_; + + digamma_sum_second_ += other.digamma_sum_second_; + trigamma_sum_second_ += other.trigamma_sum_second_; + } + return *this; + } + + double GetDigammaSumFirst() const { return digamma_sum_first_; } + + double GetTrigammaSumFirst() const { return trigamma_sum_first_; } + + double GetDigammaSumSecond() const { return digamma_sum_second_; } + + double GetTrigammaSumSecond() const { return trigamma_sum_second_; } + }; + + static inline double sqr(double x) { return x * x; } + + struct TDirection { + double Direction; + double GradientNorm; + double Mu; + }; + + static TDirection MoveDirection(double shape, const double weightedSum, + const double weight, const double digammaSum, + const double trigammaSum, + double regularizer = 1e-4) { + const double mu = weight / weightedSum; + const double digammaAtShape = boost::math::digamma(shape); + const double trigammaAtShape = boost::math::trigamma(shape); + + const double b = mu * shape; + + const double der = + weight * (log(b) - log(b + 1) - digammaAtShape) + digammaSum; + const double der2 = + trigammaSum + weight * (1.0 / shape - mu / (b + 1) - trigammaAtShape); + + return {-der / (der2 + regularizer), std::abs(der), mu}; + } + + double Likelihood(GammaDistribution& prior, double weightedSum, + double weight, double lgammaSum) { + const double a = prior.GetShape(); + const double b = prior.GetRate(); + return weight * a * (log(b) - log(b + 1)) + weightedSum * log(b + 1) + + lgammaSum - weight * prior.LogGammaAtShape(); + } + + public: + TClusterModelEstimator(const KMerData& data, double threshold, + uint num_threads = 16, + size_t maxIterations = 40, + bool calcLikelihood = false) + : data_(data), + threshold_(threshold), + num_threads_(num_threads), + max_terations_(maxIterations), + calc_likelihood_(calcLikelihood) {} + + static inline GammaDistribution Update(const GammaDistribution& point, + const TDirection& direction, + double minShape = 0.01) { + double shape = std::max(point.GetShape() + direction.Direction, minShape); + double rate = shape * direction.Mu; + return GammaDistribution(shape, rate); + } + + static GammaDistribution MomentMethodEstimator(const double sum, + const double sum2, + const double weight) { + const double m = sum / weight; + const double var = sum2 / weight - m * m; + const double rate = 1.0 / max(var / m - 1, 1e-3); + const double shape = m * rate; + return GammaDistribution(shape, rate); + } + + ParametricClusterModel Estimate( + const std::vector<std::vector<size_t>>& clusters, + const std::vector<size_t>& clusterCenter, const bool useEM = false, + const size_t sample = 0) { + if (sample && clusters.size() > sample) { + std::vector<std::vector<size_t>> sampledClusters; + std::vector<size_t> sampledCenters; + } + + const auto qualityFunc = [&]() -> QualFunc { + auto qualStats = CreateQualityStats(clusters, clusterCenter); + + QualFunc cursor = {-1e-5, 0.0}; + + for (uint i = 0; i < 15; ++i) { + const auto qualDerStats = + n_computation_utils::TAdditiveStatisticsCalcer< + TQualityStat, TQualityLogitLinearRegressionPoint>(qualStats, + num_threads_) + .Calculate([&]() -> TQualityLogitLinearRegressionPoint { + return TQualityLogitLinearRegressionPoint(cursor); + }); + + cursor = Update(cursor, qualDerStats); + + if ((std::abs(qualDerStats.GetDerAlpha()) +
std::abs(qualDerStats.GetDerBeta())) < 1e-2) { + break; + } + } + + INFO("Quality function: " << cursor.alpha_ << "q + " << cursor.beta_); + return cursor; + }(); + + auto alphas = [&]() -> std::array { + TRunErrorStats errorStats = + n_computation_utils::ParallelStatisticsCalcer( + num_threads_) + .Calculate( + clusters.size(), + [&]() -> TRunErrorStats { return TRunErrorStats(data_); }, + [&](TRunErrorStats& stat, size_t k) { + if (data_[clusterCenter[k]].count >= threshold_) { + stat.Add(clusters[k], clusterCenter[k]); + } + }); + return errorStats.EstimateAlphas(); + }(); + + std::vector clusterSufficientStat = + CreateSufficientStats(clusterCenter); + + const auto totalStats = + n_computation_utils::TAdditiveStatisticsCalcer>( + clusterSufficientStat, num_threads_) + .Calculate([]() -> TCountsStat { + return TCountsStat(); + }); + +#pragma omp parallel for num_threads(num_threads_) + for (size_t k = 0; k < clusterSufficientStat.size(); ++k) { + QualityExpectation(qualityFunc, clusterSufficientStat[k]); + } + + auto countsStats = + n_computation_utils::TAdditiveStatisticsCalcer>( + clusterSufficientStat, num_threads_) + .Calculate([]() -> TCountsStat { + return TCountsStat(); + }); + + GammaDistribution genomicPrior = [&]() -> GammaDistribution { + const double m = countsStats.GetWeightedSum() / countsStats.GetWeight(); + const double var = + countsStats.GetWeightedSum2() / countsStats.GetWeight() - m * m; + const double rate = 1.0 / max(var / m - 1, 1e-3); + const double shape = m * rate; + return GammaDistribution(shape, rate); + }(); + + GammaDistribution nonGenomicPrior = [&]() -> GammaDistribution { + const double m = + (totalStats.GetWeightedSum() - countsStats.GetWeightedSum()) / + (totalStats.GetWeight() - countsStats.GetWeight()); + const double var = + (totalStats.GetWeightedSum2() - countsStats.GetWeightedSum2()) / + (totalStats.GetWeight() - countsStats.GetWeight()) - + m * m; + const double rate = 1.0 / max(var / m - 1, 1e-3); + const double shape = m * rate; + return GammaDistribution(shape, rate); + }(); + + for (uint i = 0, steps = 0; i < max_terations_; ++i, ++steps) { + auto gammaDerStats = + n_computation_utils::TAdditiveStatisticsCalcer( + clusterSufficientStat, num_threads_) + .Calculate([&]() -> TGammaDerivativesStats { + return TGammaDerivativesStats(genomicPrior.GetShape(), + nonGenomicPrior.GetShape()); + }); + + auto genomicDirection = MoveDirection( + genomicPrior.GetShape(), countsStats.GetWeightedSum(), + countsStats.GetWeight(), gammaDerStats.GetDigammaSumFirst(), + gammaDerStats.GetTrigammaSumFirst()); + + auto nonGenomicDirection = MoveDirection( + nonGenomicPrior.GetShape(), + totalStats.GetWeightedSum() - countsStats.GetWeightedSum(), + totalStats.GetWeight() - countsStats.GetWeight(), + gammaDerStats.GetDigammaSumSecond(), + gammaDerStats.GetTrigammaSumSecond()); + + auto gradientNorm = + genomicDirection.GradientNorm + nonGenomicDirection.GradientNorm; + + INFO("Iteration #" << i << " gradient norm " << gradientNorm); + + genomicPrior = Update(genomicPrior, genomicDirection); + nonGenomicPrior = Update(nonGenomicPrior, nonGenomicDirection); + + if (calc_likelihood_) { + auto logGammaStats = + n_computation_utils::TAdditiveStatisticsCalcer( + clusterSufficientStat, num_threads_) + .Calculate([&]() -> TLogGammaStat { + return TLogGammaStat(genomicPrior.GetShape(), + nonGenomicPrior.GetShape()); + }); + + INFO("Genomic likelihood: " << Likelihood( + genomicPrior, countsStats.GetWeightedSum(), + countsStats.GetWeight(), + 
logGammaStats.GetGenomicLogGammaSum())); + + INFO("NonGenomic likelihood: " << Likelihood( + nonGenomicPrior, + totalStats.GetWeightedSum() - countsStats.GetWeightedSum(), + totalStats.GetWeight() - countsStats.GetWeight(), + logGammaStats.GetNonGenomicLogGammaSum())); + } + + { + INFO("Genomic gamma prior estimation step: shape " + << genomicPrior.GetShape() << " and rate " + << genomicPrior.GetRate()); + INFO("Nongenomic gamma prior estimation step: shape " + << nonGenomicPrior.GetShape() << " and rate " + << nonGenomicPrior.GetRate()); + } + + double shapeDiff = std::abs(genomicDirection.Direction) + + std::abs(nonGenomicDirection.Direction); + + if (useEM) { + if ((shapeDiff < 1e-2) || gradientNorm < 1e-1 || + (steps == 5 && (i < max_terations_ - 10))) { + PoissonGammaDistribution genomic(genomicPrior); + PoissonGammaDistribution nonGenomic(nonGenomicPrior); +#pragma omp parallel for num_threads(num_threads_) + for (size_t k = 0; k < clusterSufficientStat.size(); ++k) { + Expectation(genomic, nonGenomic, qualityFunc, + clusterSufficientStat[k]); + } + + countsStats = n_computation_utils::TAdditiveStatisticsCalcer< + TClusterSufficientStat, TCountsStat>( + clusterSufficientStat, num_threads_) + .Calculate([]() -> TCountsStat { + return TCountsStat(); + }); + steps = 0; + } + } else { + if ((shapeDiff < 1e-4) || gradientNorm < 1e-2) { + break; + } + } + } + + INFO("Genomic gamma prior estimated with shape " + << genomicPrior.GetShape() << " and rate " << genomicPrior.GetRate()); + INFO("Nongenomic Gamma prior estimated with shape " + << nonGenomicPrior.GetShape() << " and rate " + << nonGenomicPrior.GetRate()); + + return ParametricClusterModel( + GammaMixture(genomicPrior, nonGenomicPrior, + countsStats.GetWeight() / totalStats.GetWeight()), + qualityFunc, threshold_, alphas); + } + + static GammaDistribution EstimatePrior(const std::vector<size_t>& counts) { + const size_t observations = counts.size(); + double sum = 0; + double sum2 = 0; + for (auto count : counts) { + sum += (double)count; + sum2 += (double)count * (double)count; + } + + GammaDistribution prior = + TClusterModelEstimator::MomentMethodEstimator(sum, sum2, (double)observations); + + for (uint i = 0, steps = 0; i < 10; ++i, ++steps) { + double digammaSum = 0; + double trigammaSum = 0; + for (auto count : counts) { + digammaSum += boost::math::digamma((double)count + prior.GetShape()); + trigammaSum += boost::math::trigamma((double)count + prior.GetShape()); + } + + auto direction = MoveDirection(prior.GetShape(), sum, (double)observations, + digammaSum, trigammaSum); + + const double shapeDiff = std::abs(direction.Direction); + if (shapeDiff < 1e-3 || (direction.GradientNorm < 1e-4)) { + break; + } + prior = Update(prior, direction, 1e-2); + } + return prior; + } +}; + +} // namespace n_gamma_poisson_model + +#endif // PROJECT_GAMMA_POISSON_MODEL_HPP diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/hamcluster_1.cpp spades-3.11.1+dfsg/src/projects/ionhammer/hamcluster_1.cpp --- spades-3.10.1+dfsg/src/projects/ionhammer/hamcluster_1.cpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/hamcluster_1.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,5 @@ +// +// Created by Vasiliy Ershov on 25/09/16.
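// A compact sketch of the two Newton-type updates used by the estimator
// above, standalone and illustrative only. newtonShapeStep() mirrors
// MoveDirection()/EstimatePrior(): the Gamma shape moves along
// -grad/(hessian + regularizer), with the rate tied to the mean through
// mu = weight / weightedSum and digamma/trigamma sums taken over the counts.
// newtonLogitStep() mirrors the 2x2 system solved for the quality model
// p(genomic | q) = sigmoid(alpha * q + beta), inverting the Hessian via its
// determinant.
#include <cmath>
#include <boost/math/special_functions/digamma.hpp>
#include <boost/math/special_functions/trigamma.hpp>

struct ShapeStep {
  double step;           // -grad / (hessian + regularizer)
  double gradient_norm;  // |grad|, used as a stopping criterion
};

inline ShapeStep newtonShapeStep(double shape, double weightedSum,
                                 double weight, double digammaSum,
                                 double trigammaSum,
                                 double regularizer = 1e-4) {
  const double mu = weight / weightedSum;
  const double b = mu * shape;  // implied rate
  const double der = weight * (std::log(b) - std::log(b + 1) -
                               boost::math::digamma(shape)) +
                     digammaSum;
  const double der2 = trigammaSum + weight * (1.0 / shape - mu / (b + 1) -
                                              boost::math::trigamma(shape));
  return {-der / (der2 + regularizer), std::abs(der)};
}

struct LogitStep {
  double dalpha, dbeta;
};

inline LogitStep newtonLogitStep(double dera, double derb, double daa,
                                 double dbb, double dab) {
  const double det = daa * dbb - dab * dab;
  return {(dbb * dera - dab * derb) / det, (daa * derb - dab * dera) / det};
}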
+// + +#include "hamcluster_1.h" diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/hamcluster_1.h spades-3.11.1+dfsg/src/projects/ionhammer/hamcluster_1.h --- spades-3.10.1+dfsg/src/projects/ionhammer/hamcluster_1.h 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/hamcluster_1.h 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,81 @@ +// +// Created by Vasiliy Ershov on 25/09/16. +// + +#ifndef PROJECT_HAMCLUSTER_1_H +#define PROJECT_HAMCLUSTER_1_H + +#include +#include +#include "HSeq.hpp" +#include "kmer_data.hpp" +#include "utils/logger/logger.hpp" +#include "valid_hkmer_generator.hpp" + +namespace hammer { + +using HRun = HomopolymerRun; + +class TOneErrorClustering { + private: + const KMerData& data_; + dsu::ConcurrentDSU clusters_; + + bool TryMergeClusters(const HKMer& source, + const size_t source_idx, + const HKMer& fixed) { + auto fixed_idx = data_.checking_seq_idx(fixed); + if (fixed_idx == (-1ULL)) { + return false; + } + if (data_[fixed_idx].count > 0) { + clusters_.unite(source_idx, fixed_idx); + auto rSource = !source; + auto rFixed = !fixed; + clusters_.unite(data_.seq_idx(rSource), data_.seq_idx(rFixed)); + return true; + } else { + return false; + } + } + + void TryCorrection(const KMerStat& source_stat, size_t source_idx) { + const auto& source = source_stat.kmer; + auto fixed = source; + for (uint k = 0; k < K; ++k) { + for (uint i = (uint)std::max(source[k].len - 1, 1); + i <= (uint)(source[k].len + 1); ++i) { + if (i == source[k].len) { + continue; + } + fixed[k].len = i & 0x3F; + + TryMergeClusters(source, source_idx, fixed); + } + fixed[k].len = source[k].len; + } + } + + public: + + TOneErrorClustering(const KMerData& data, + const uint num_threads = 16) + : data_(data), clusters_(data.size()) { + + (void)num_threads; // stupid compiler +#pragma omp parallel for num_threads(num_threads) + for (size_t idx = 0; idx < data_.size(); ++idx) { + if (data_[idx].count > 0) { + TryCorrection(data_[idx], idx); + } + } + } + + void FillClasses(std::vector >& clusters) { + clusters_.get_sets(clusters); + } +}; + +} // namespace hammer + +#endif // PROJECT_HAMCLUSTER_1_H diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/hamcluster.cpp spades-3.11.1+dfsg/src/projects/ionhammer/hamcluster.cpp --- spades-3.10.1+dfsg/src/projects/ionhammer/hamcluster.cpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/hamcluster.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,219 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
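// A sketch of the union-find pattern TOneErrorClustering above is built on:
// for every solid k-mer, each homopolymer run length is perturbed by one,
// and if the perturbed k-mer also occurs, the two k-mers (and their reverse
// complements) are united in a disjoint-set structure. TinyDSU is a
// sequential stand-in for the concurrent DSU the real code uses.
#include <cstddef>
#include <numeric>
#include <vector>

class TinyDSU {
  std::vector<size_t> parent_;

 public:
  explicit TinyDSU(size_t n) : parent_(n) {
    std::iota(parent_.begin(), parent_.end(), 0);
  }
  size_t find(size_t x) {
    while (parent_[x] != x) x = parent_[x] = parent_[parent_[x]];  // halving
    return x;
  }
  void unite(size_t a, size_t b) { parent_[find(a)] = find(b); }
};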
-//*************************************************************************** - -#include "hamcluster.hpp" - -#include "hkmer_distance.hpp" -#include "common/adt/concurrent_dsu.hpp" -#include "io/kmers/mmapped_reader.hpp" - -#include -#include - -#ifdef USE_GLIBCXX_PARALLEL -#include -#endif - -struct SubKMerComparator { - bool operator()(const SubKMerData &lhs, const SubKMerData &rhs) { - return SubKMer::less2_fast()(lhs.data, rhs.data); - } -}; - -std::pair SubKMerSplitter::split() { - std::vector data; - - MMappedReader ifs(ifname_, /* unlink */ true); - std::ofstream ofs(ofname_, std::ios::out | std::ios::binary); - VERIFY(ofs.good()); - size_t icnt = 0, ocnt = 0; - while (ifs.good()) { - SubKMerComparator comp; - - deserialize(data, ifs); - -#ifdef USE_GLIBCXX_PARALLEL - // Explicitly force a call to parallel sort routine. - __gnu_parallel::sort(data.begin(), data.end(), comp); -#else - std::sort(data.begin(), data.end(), comp); -#endif - for (auto start = data.begin(), end = data.end(); start != end;) { - auto chunk_end = std::upper_bound(start + 1, data.end(), *start, comp); - serialize(ofs, start, chunk_end); - start = chunk_end; - ocnt += 1; - } - icnt += 1; - } - VERIFY(!ofs.fail()); - - ofs.close(); - - return std::make_pair(icnt, ocnt); -} - -#if 1 -static bool canMerge(const ConcurrentDSU &uf, unsigned x, unsigned y) { - size_t szx = uf.set_size(x), szy = uf.set_size(y); - const size_t hardthr = 2500; - - // Global threshold - no cluster larger than hard threshold - if (szx + szy > hardthr) - return false; - - // If one of the clusters is moderately large, than attach "almost" singletons - // only. - if ((szx > hardthr * 3 / 4 && szy > 50) || - (szy > hardthr * 3 / 4 && szx > 50)) - return false; - - return true; -} -#else -static bool canMerge(const ConcurrentDSU &uf, unsigned x, unsigned y) { - return (uf.set_size(x) + uf.set_size(y)) < 10000; -} -#endif - - -static void processBlockQuadratic(ConcurrentDSU &uf, - const std::vector &block, - const KMerData &data, - unsigned tau) { - size_t blockSize = block.size(); - for (size_t i = 0; i < blockSize; ++i) { - auto x = static_cast(block[i]); - hammer::HKMer kmerx = data[x].kmer; - hammer::HKMer rkmerx = !kmerx; - auto rcx = static_cast(data.seq_idx(rkmerx)); - - for (size_t j = i + 1; j < blockSize; j++) { - auto y = static_cast(block[j]); - hammer::HKMer kmery = data[y].kmer; - hammer::HKMer rkmery = !kmery; - auto rcy = static_cast(data.seq_idx(rkmery)); - if ((uf.find_set(x) != uf.find_set(y) || uf.find_set(rcx) != - uf.find_set(rcy)) && - (canMerge(uf, x, y) || canMerge(uf, rcx, rcy)) && - (hammer::distanceHKMer(kmerx.begin(), kmerx.end(), - kmery.begin(), kmery.end(), tau) <= tau || - hammer::distanceHKMer(rkmerx.begin(), rkmerx.end(), - rkmery.begin(), rkmery.end(), tau) <= tau)) { - uf.unite(x, y); - uf.unite(rcx, rcy); - } - } - } -} - -void KMerHamClusterer::cluster(const std::string &prefix, - const KMerData &data, - ConcurrentDSU &uf) { - // First pass - split & sort the k-mers - std::ostringstream tmp; - tmp << prefix << ".first"; - std::string fname(tmp.str()); - std::ofstream ofs(fname, std::ios::out | std::ios::binary); - VERIFY(ofs.good()); - - INFO("Serializing sub-kmers."); - for (unsigned i = 0; i < tau_ + 1; ++i) { - // size_t from = (*Globals::subKMerPositions)[i]; - // size_t to = (*Globals::subKMerPositions)[i+1]; - size_t from = 0 + i*hammer::K / (tau_ + 1); - size_t to = 0 + (i+1)*hammer::K / (tau_ + 1); - - INFO("Serializing: [" << from << ", " << to << ")"); - serialize(ofs, data, NULL, - 
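// The removed canMerge() above gates cluster growth with size heuristics
// rather than distance alone. A distilled version of the same rule set, with
// the thresholds copied from the removed code:
#include <cstddef>

inline bool canMergeClusters(size_t szx, size_t szy) {
  const size_t hardthr = 2500;
  // Global cap: no merged cluster may exceed the hard threshold.
  if (szx + szy > hardthr) return false;
  // A moderately large cluster may only absorb near-singletons.
  if ((szx > hardthr * 3 / 4 && szy > 50) ||
      (szy > hardthr * 3 / 4 && szx > 50))
    return false;
  return true;
}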
SubKMerPartSerializer(from, to)); - } - VERIFY(!ofs.fail()); - ofs.close(); - - size_t big_blocks1 = 0; - { - INFO("Splitting sub-kmers, pass 1."); - SubKMerSplitter Splitter(fname, fname + ".blocks"); - std::pair stat = Splitter.split(); - INFO("Splitting done." - " Processed " << stat.first << " blocks." - " Produced " << stat.second << " blocks."); - - // Sanity check - there cannot be more blocks than tau + 1 times of total - // kmer number. And on the first pass we have only tau + 1 input blocks! - VERIFY(stat.first == tau_ + 1); - VERIFY(stat.second <= (tau_ + 1) * data.size()); - - // Ok, now in the files we have everything grouped in blocks in the output files. - - std::vector block; - - INFO("Merge sub-kmers, pass 1"); - SubKMerBlockFile blocks(fname + ".blocks", /* unlink */ true); - - std::ostringstream tmp; - tmp << prefix << ".second"; - fname = tmp.str(); - - ofs.open(fname, std::ios::out | std::ios::binary); - VERIFY(ofs.good()); - while (blocks.get_block(block)) { - // unsigned block_thr = cfg::get().hamming_blocksize_quadratic_threshold; - unsigned block_thr = 50; - if (block.size() < block_thr) { - // Merge small blocks. - processBlockQuadratic(uf, block, data, tau_); - } else { - big_blocks1 += 1; - // Otherwise - dump for next iteration. - for (unsigned i = 0; i < tau_ + 1; ++i) { - serialize(ofs, data, &block, - SubKMerStridedSerializer(i, tau_ + 1)); - } - } - } - VERIFY(!ofs.fail()); - ofs.close(); - INFO("Merge done, total " << big_blocks1 << " new blocks generated."); - } - - size_t big_blocks2 = 0; - { - INFO("Spliting sub-kmers, pass 2."); - SubKMerSplitter Splitter(fname, fname + ".blocks"); - std::pair stat = Splitter.split(); - INFO("Splitting done." - " Processed " << stat.first << " blocks." - " Produced " << stat.second << " blocks."); - - // Sanity check - there cannot be more blocks than tau + 1 times of total - // kmer number. And there should be tau + 1 times big_blocks input blocks. - VERIFY(stat.first == (tau_ + 1)*big_blocks1); - VERIFY(stat.second <= (tau_ + 1) * (tau_ + 1) * data.size()); - - INFO("Merge sub-kmers, pass 2"); - SubKMerBlockFile blocks(fname + ".blocks", /* unlink */ true); - std::vector block; - - size_t nblocks = 0; - while (blocks.get_block(block)) { - if (block.size() > 50) { - big_blocks2 += 1; -#if 0 - for (size_t i = 0; i < block.size(); ++i) { - std::string s(Globals::blob + data[block[i]], K); - INFO("" << block[i] << ": " << s); - } -#endif - } - processBlockQuadratic(uf, block, data, tau_); - nblocks += 1; - } - INFO("Merge done, saw " << big_blocks2 << " big blocks out of " << nblocks << " processed."); - } -} diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/hamcluster.hpp spades-3.11.1+dfsg/src/projects/ionhammer/hamcluster.hpp --- spades-3.10.1+dfsg/src/projects/ionhammer/hamcluster.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/hamcluster.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,191 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
-//*************************************************************************** - -#ifndef HAMMER_SUBKMER_SORTER_HPP -#define HAMMER_SUBKMER_SORTER_HPP - -#include "kmer_data.hpp" -#include "io/kmers/mmapped_reader.hpp" - -#include "utils/logger/logger.hpp" -#include "HSeq.hpp" - -#include -#include - -class ConcurrentDSU; - -typedef hammer::HSeq<(hammer::K + 1) / 2> SubKMer; - -struct SubKMerData { - uint64_t idx; - SubKMer data; -}; - -template -inline void binary_read(Reader &is, SubKMerData &s) { - SubKMer::DataType seq_data[SubKMer::DataSize]; - - is.read((char*)&s.idx, sizeof(s.idx)); - is.read((char*)seq_data, sizeof(seq_data)); - - s.data = SubKMer(seq_data, seq_data + SubKMer::DataSize); -} - -template -inline Writer &binary_write(Writer &os, const SubKMerData &s) { - os.write((char*)&s.idx, sizeof(s.idx)); - os.write((char*)s.data.data(), SubKMer::TotalBytes); - - return os; -} - -static_assert(sizeof(SubKMerData) == 16, "Too big SubKMer"); - -class SubKMerPartSerializer{ - size_t from_; - size_t to_; - -public: - SubKMerPartSerializer(size_t from, size_t to) - :from_(from), to_(to) { VERIFY(to_ - from_ <= hammer::K); } - - SubKMerData serialize(hammer::HKMer k, size_t fidx) const { - SubKMerData s; - - s.idx = fidx; - s.data = SubKMer(k.data() + from_, k.data() + to_); - - // Yay for NRVO! - return s; - } -}; - -class SubKMerStridedSerializer{ - size_t from_; - size_t stride_; - -public: - SubKMerStridedSerializer(size_t from, size_t stride) - :from_(from), stride_(stride) { VERIFY(from_ + stride_ <= hammer::K); } - - SubKMerData serialize(hammer::HKMer k, size_t fidx) const { - SubKMerData s; - - s.idx = fidx; - - size_t sz = (hammer::K - from_ + stride_ - 1) / stride_; - - std::vector v(sz); - for (size_t i = from_, j = 0; i < hammer::K; i+= stride_, ++j) - v[j] = k[i]; - - s.data = SubKMer(&v[0], &v[0] + sz); - - // Yay for NRVO! - return s; - } -}; - -class SubKMerBlockFile { - MMappedReader ifs_; - - public: - SubKMerBlockFile(const std::string &fname, bool unlink = false) - : ifs_(fname, unlink) { } - - bool get_block(std::vector &block) { - block.clear(); -#if 0 - block.shrink_to_fit(); -#else - std::vector().swap(block); -#endif - - if (!ifs_.good()) - return false; - - size_t sz; - ifs_.read((char*)&sz, sizeof(sz)); - block.resize(sz); - for (size_t i = 0; i < sz; ++i) { - SubKMerData s; - binary_read(ifs_, s); - block[i] = s.idx; - } - - return true; - } -}; - -template -void serialize(Writer &os, - const KMerData &data, const std::vector *block = NULL, - const SubKMerSerializer &serializer = SubKMerSerializer()) { - size_t sz = (block == NULL ? data.size() : block->size()); - os.write((char*)&sz, sizeof(sz)); - for (size_t i = 0, e = sz; i != e; ++i) { - size_t idx = (block == NULL ? 
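// The removed KMerHamClusterer relies on the pigeonhole principle: if two
// k-mers are within Hamming distance tau, splitting both into tau + 1
// contiguous stripes guarantees at least one stripe matches exactly, so only
// k-mers sharing a stripe need the quadratic pairwise check. A sketch of the
// pass-1 stripe boundaries used above:
#include <cstddef>
#include <utility>
#include <vector>

inline std::vector<std::pair<size_t, size_t>> stripeBounds(size_t k,
                                                           unsigned tau) {
  std::vector<std::pair<size_t, size_t>> bounds;
  for (unsigned i = 0; i < tau + 1; ++i)
    bounds.emplace_back(i * k / (tau + 1), (i + 1) * k / (tau + 1));
  return bounds;  // [from, to) for each of the tau + 1 sub-k-mers
}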
i : (*block)[i]); - SubKMerData s = serializer.serialize(data[idx].kmer, idx); - binary_write(os, s); - } -} - -class SubKMerSplitter { - const std::string ifname_; - const std::string ofname_; - - public: - SubKMerSplitter(const std::string &ifname, const std::string &ofname) - : ifname_(ifname), ofname_(ofname) {} - - template - void serialize(Writer &os, - const std::vector::iterator &start, - const std::vector::iterator &end) { - size_t sz = end - start; - - os.write((char*)&sz, sizeof(sz)); - for (auto I = start, E = end; I != E; ++I) - binary_write(os, *I); - } - - template - void deserialize(std::vector &res, - Reader &is) { - res.clear(); -#if 0 - res.shrink_to_fit(); -#else - std::vector().swap(res); -#endif - - size_t sz; - is.read((char*)&sz, sizeof(sz)); - res.resize(sz); - - for (size_t i = 0, e = sz; i != e; ++i) - binary_read(is, res[i]); - } - - std::pair split(); -}; - -class KMerHamClusterer { - unsigned tau_; - - public: - KMerHamClusterer(unsigned tau) - : tau_(tau) {} - - void cluster(const std::string &prefix, const KMerData &data, ConcurrentDSU &uf); - private: - DECL_LOGGER("Hamming Clustering"); -}; - -#endif // HAMMER_SUBKMER_SORTER_HPP diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/hkmer_distance.hpp spades-3.11.1+dfsg/src/projects/ionhammer/hkmer_distance.hpp --- spades-3.10.1+dfsg/src/projects/ionhammer/hkmer_distance.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/hkmer_distance.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -8,8 +8,8 @@ #ifndef __HAMMER_HKMER_DISTANCE_HPP__ #define __HAMMER_HKMER_DISTANCE_HPP__ -#include "hkmer.hpp" #include "err_helper_table.hpp" +#include "hkmer.hpp" namespace hammer { @@ -29,9 +29,8 @@ unsigned length; }; -template +template class IonPairAligner { - It1 x_it_; It1 x_end_; It2 y_it_; @@ -40,17 +39,16 @@ bool empty_; hammer::HomopolymerRun cx_, cy_; int end_diff_; - bool at_the_start_; // turned off once we find a pair of runs with same nucleotide + bool at_the_start_; // turned off once we find a pair of runs with same + // nucleotide IonPairAlignEvent front_; // true iff alignment process is not yet finished bool checkForZeroLengthRuns() { - if (x_it_ == x_end_ || y_it_ == y_end_) - return false; + if (x_it_ == x_end_ || y_it_ == y_end_) return false; - if (cx_.len > 0 && cy_.len > 0) - return true; + if (cx_.len > 0 && cy_.len > 0) return true; bool result = true; while (cx_.len == 0) { @@ -74,16 +72,14 @@ bool fetchNextX() { ++x_it_; - if (x_it_ == x_end_) - return false; + if (x_it_ == x_end_) return false; cx_ = *x_it_; return true; } bool fetchNextY() { ++y_it_; - if (y_it_ == y_end_) - return false; + if (y_it_ == y_end_) return false; cy_ = *y_it_; return true; } @@ -118,19 +114,25 @@ void finishAlignmentProcess() { empty_ = true; if (x_it_ != x_end_) { - end_diff_ += x_end_ - x_it_; + end_diff_ += int(x_end_ - x_it_); } if (y_it_ != y_end_) { - end_diff_ -= y_end_ - y_it_; + end_diff_ -= int(y_end_ - y_it_); } } public: - IonPairAligner(const It1 &x_begin, const It1 &x_end, - const It2 &y_begin, const It2 &y_end) - : x_it_(x_begin), x_end_(x_end), y_it_(y_begin), y_end_(y_end), - empty_(false), cx_(*x_it_), cy_(*y_it_), end_diff_(0), at_the_start_(true) - { + IonPairAligner(const It1 &x_begin, const It1 &x_end, const It2 &y_begin, + const It2 &y_end) + : x_it_(x_begin), + x_end_(x_end), + y_it_(y_begin), + y_end_(y_end), + empty_(false), + cx_(*x_it_), + cy_(*y_it_), + end_diff_(0), + at_the_start_(true) { popFront(); } @@ -178,8 +180,7 @@ at_the_start_ = false; } - if (!end) 
-        break;
+      if (!end) break;
     }
 
     if (!checkForZeroLengthRuns()) {
@@ -193,7 +194,6 @@
     VERIFY(y_it_ < y_end_);
 
     if (cx_.nucl == cy_.nucl) {
-
       if (cx_.len >= 4 && cy_.len >= 4) {
         if (cx_.len < cy_.len)
           yieldBaseInsertion();
@@ -217,7 +217,6 @@
       yieldMismatch();
       return;
     } else {
-
       using namespace hammer::errHelper;
       auto hint = getHint(x_it_, x_end_, y_it_, y_end_,
                           cx_.len, cy_.len);
@@ -239,20 +238,18 @@
   }
 };
 
-
 // returns distance between two homopolymer sequences;
 // optionally, fills *end_diff:
 //  [ --------- X ----------- ]
 //  [---------- Y -------]######
 //                        \____/
 //                       end_diff
-template<typename It1, typename It2>
+template <typename It1, typename It2>
 inline unsigned distanceHKMer(const It1 &x_begin, const It1 &x_end,
                               const It2 &y_begin, const It2 &y_end,
-                              unsigned tau = -1, int *end_diff=NULL) {
+                              unsigned tau = -1, int *end_diff = NULL) {
   unsigned dist = 0;
 
   IonPairAligner<It1, It2> aligner(x_begin, x_end, y_begin, y_end);
 
@@ -261,24 +258,28 @@
     auto event = aligner.front();
     switch (event.type) {
       case kIonEventMismatch:
-        dist += kMismatchCost * event.length; break;
+        dist += kMismatchCost * event.length;
+        break;
       case kIonEventBaseInsertion:
-        dist += kBaseInsertionCost * event.length; break;
+        dist += kBaseInsertionCost * event.length;
+        break;
      case kIonEventBaseDeletion:
-        dist += kBaseDeletionCost * event.length; break;
+        dist += kBaseDeletionCost * event.length;
+        break;
      case kIonEventRunInsertion:
-        dist += kRunInsertionCost * event.length; break;
+        dist += kRunInsertionCost * event.length;
+        break;
      case kIonEventRunDeletion:
-        dist += kRunDeletionCost * event.length; break;
-      default: break;
-    }
-    if (dist > tau && end_diff == NULL)
+        dist += kRunDeletionCost * event.length;
+        break;
+      default: break;
+    }
+    if (dist > tau && end_diff == NULL)
       break;
 
     aligner.popFront();
   }
 
-  if (end_diff != NULL)
-    *end_diff = aligner.endDiff();
+  if (end_diff != NULL) *end_diff = aligner.endDiff();
 
   return dist;
 }
@@ -287,92 +288,90 @@
 #include <cassert>
 
 namespace unittest {
-  namespace detail {
+namespace detail {
 
-    typedef hammer::HKMer::StorageType::const_iterator It;
+typedef hammer::HKMer::StorageType::const_iterator It;
 
-    inline unsigned distanceHKMer(It beg1, It end1, It beg2, It end2) {
-      unsigned dist = 0;
+inline unsigned distanceHKMer(It beg1, It end1, It beg2, It end2) {
+  unsigned dist = 0;
 
-      IonPairAligner<It, It> aligner(beg1, end1, beg2, end2);
+  IonPairAligner<It, It> aligner(beg1, end1, beg2, end2);
 
-      while (!aligner.empty()) {
-        auto event = aligner.front();
-        switch (event.type) {
-          case kIonEventMismatch:
-            std::cerr << event.length << 'X';
-            dist += event.length; break;
-          case kIonEventBaseInsertion:
-            std::cerr << event.length << 'I';
-            dist += event.length; break;
-          case kIonEventBaseDeletion:
-            std::cerr << event.length << 'D';
-            dist += event.length; break;
-          case kIonEventRunInsertion:
-            std::cerr << event.length << 'I';
-            dist += event.length; break;
-          case kIonEventRunDeletion:
-            std::cerr << event.length << 'D';
-            dist += event.length; break;
-          default: break;
-        }
-        aligner.popFront();
-      }
-
-      std::cerr << " (end. diff.
= " << aligner.endDiff() << ")" << std::endl; - return dist; + while (!aligner.empty()) { + auto event = aligner.front(); + switch (event.type) { + case kIonEventMismatch: + std::cerr << event.length << 'X'; + dist += event.length; + break; + case kIonEventBaseInsertion: + std::cerr << event.length << 'I'; + dist += event.length; + break; + case kIonEventBaseDeletion: + std::cerr << event.length << 'D'; + dist += event.length; + break; + case kIonEventRunInsertion: + std::cerr << event.length << 'I'; + dist += event.length; + break; + case kIonEventRunDeletion: + std::cerr << event.length << 'D'; + dist += event.length; + break; + default: + break; } + aligner.popFront(); + } - inline unsigned distance(const std::string &s, const std::string &t) { - using namespace hammer; - HKMer k1, k2; - for (size_t i = 0; i < s.size(); ++i) - k1 <<= s[i]; - for (size_t i = 0; i < t.size(); ++i) - k2 <<= t[i]; + std::cerr << " (end. diff. = " << aligner.endDiff() << ")" << std::endl; + return dist; +} - return distanceHKMer(k1.begin(), k1.end(), k2.begin(), k2.end()); - } - } +inline unsigned distance(const std::string &s, const std::string &t) { + using namespace hammer; + HKMer k1, k2; + for (size_t i = 0; i < s.size(); ++i) k1 <<= s[i]; + for (size_t i = 0; i < t.size(); ++i) k2 <<= t[i]; - inline void hkmer_distance() { - using namespace detail; + return distanceHKMer(k1.begin(), k1.end(), k2.begin(), k2.end()); +} +} // namespace detail - assert(distance("ACGTACGTACGTACGT", - "CGTACGTACGTACGTA") > 1); +inline void hkmer_distance() { + using namespace detail; - assert(distance("AACGTACGTACGTACGT", - "CGTACGTACGTACGTA") > 1); + assert(distance("ACGTACGTACGTACGT", "CGTACGTACGTACGTA") > 1); - assert(distance("GATAGCGATTTGTTCGGTTTAGGGGGGG", - "GATAGCGATTTGTTCGTTTAG") >= 7); + assert(distance("AACGTACGTACGTACGT", "CGTACGTACGTACGTA") > 1); - assert(distance("ATAGCGATTTGTTCGGTTTAGGGGGGGT", - "ATAGCGATTTGTTCGTTTAGA") >= 7); + assert(distance("GATAGCGATTTGTTCGGTTTAGGGGGGG", "GATAGCGATTTGTTCGTTTAG") >= + 7); - assert(distance("GATTTGTTCGGTTTAGGGGGGGTAGGGGGATTA", - "GATTTGTTCGTTTAGGGGGGGTAGGGGGATTA") == 1); + assert(distance("ATAGCGATTTGTTCGGTTTAGGGGGGGT", "ATAGCGATTTGTTCGTTTAGA") >= + 7); - assert(distance("TTAAGGCTTACAAAGACTGCGTTT", - "TTAAGGCTTACAAAGACTGCGTTTT") == 1); + assert(distance("GATTTGTTCGGTTTAGGGGGGGTAGGGGGATTA", + "GATTTGTTCGTTTAGGGGGGGTAGGGGGATTA") == 1); - assert(distance("AAGGCTTACAAAGACTGCGTTTAA", - "AAGGCTTACAAAGACTGCGTA") >= 2); + assert(distance("TTAAGGCTTACAAAGACTGCGTTT", "TTAAGGCTTACAAAGACTGCGTTTT") == + 1); - assert(distance("ACAAAGACTGCGTTTAAGAGC", - "ACAAAGACTGCGTTTTAAGAGC") == 1); + assert(distance("AAGGCTTACAAAGACTGCGTTTAA", "AAGGCTTACAAAGACTGCGTA") >= 2); - assert(distance("CTAGGAATGAAAAAGAGAACAAGAA", - "CTAGGAATGAAAAAGAGAAAAAAGAATG") == 2); + assert(distance("ACAAAGACTGCGTTTAAGAGC", "ACAAAGACTGCGTTTTAAGAGC") == 1); - assert(distance("ACACACAGGGTTTTTGAACTGGATT", - "ACACACAGGGTTTTGAACTGGATT") == 1); + assert(distance("CTAGGAATGAAAAAGAGAACAAGAA", + "CTAGGAATGAAAAAGAGAAAAAAGAATG") == 2); - assert(distance("ACATAAGCCTTTGTACTTAGC", - "ACATAAGCCTTTGACTTAGCA") == 1); - } -} + assert(distance("ACACACAGGGTTTTTGAACTGGATT", "ACACACAGGGTTTTGAACTGGATT") == + 1); + assert(distance("ACATAAGCCTTTGTACTTAGC", "ACATAAGCCTTTGACTTAGCA") == 1); +} +} // namespace unittest -}; -#endif // __HAMMER_HKMER_DISTANCE_HPP__ +}; // namespace hammer +#endif // __HAMMER_HKMER_DISTANCE_HPP__ diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/hkmer.hpp spades-3.11.1+dfsg/src/projects/ionhammer/hkmer.hpp --- 
spades-3.10.1+dfsg/src/projects/ionhammer/hkmer.hpp 2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/ionhammer/hkmer.hpp 2017-09-28 09:05:13.000000000 +0000
@@ -8,13 +8,43 @@
 #ifndef __HAMMER_HKMER_HPP__
 #define __HAMMER_HKMER_HPP__
 
+#include <cstdlib>
+#include <limits>
 #include "HSeq.hpp"
 
 namespace hammer {
 
 const uint32_t K = 16;
-typedef HSeq<K> HKMer;
+using HKMer = HSeq<K>;
 
+struct HKMerDistanceResult {
+  double hamming_ = 0;
+  double levenshtein_ = 0;
+
+  HKMerDistanceResult(double hamming = 0, double lev = 0)
+      : hamming_(hamming), levenshtein_(lev) {}
 };
 
-#endif // __HAMMER_HKMER_HPP__
+inline HKMerDistanceResult hkmerDistance(const HKMer& left,
+                                         const HKMer& right) {
+  HKMerDistanceResult dist = {0, 0};
+
+  for (uint32_t i = 0; i < K; ++i) {
+    if (left[i].nucl != right[i].nucl) {
+      return {std::numeric_limits<double>::infinity(),
+              std::numeric_limits<double>::infinity()};
+    }
+
+    if (left[i].len != right[i].len) {
+      dist.hamming_ += 1;
+      dist.levenshtein_ += std::abs(left[i].len - right[i].len);
+    }
+  }
+  return dist;
+}
+
+
+
+};  // namespace hammer
+
+#endif  // __HAMMER_HKMER_HPP__
diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/HSeq.hpp spades-3.11.1+dfsg/src/projects/ionhammer/HSeq.hpp
--- spades-3.10.1+dfsg/src/projects/ionhammer/HSeq.hpp 2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/ionhammer/HSeq.hpp 2017-09-28 09:05:13.000000000 +0000
@@ -8,13 +8,13 @@
 #ifndef __HAMMER_HSEQ_HPP__
 #define __HAMMER_HSEQ_HPP__
 
-#include "sequence/nucl.hpp"
 #include
+#include "sequence/nucl.hpp"
 #include
+#include
 #include
 #include
-#include
 #include
 
@@ -23,56 +23,50 @@
 union HomopolymerRun {
   uint8_t raw;
   struct {
-    uint8_t len : 6;
+    uint8_t len : 6;
     uint8_t nucl : 2;
   };
 
-  HomopolymerRun()
-    : raw(0) {}
-  HomopolymerRun(uint8_t nucl, uint8_t len)
+  HomopolymerRun() : raw(0) {}
+  HomopolymerRun(uint8_t nucl, uint8_t len = 1)
     : len(len & 63), nucl(nucl & 3) {}
 
-  bool operator==(const HomopolymerRun &that) const {
-    return raw == that.raw;
-  }
+  bool operator==(const HomopolymerRun &that) const { return raw == that.raw; }
 
-  bool operator!=(const HomopolymerRun &that) const {
-    return raw != that.raw;
-  }
+  bool operator!=(const HomopolymerRun &that) const { return raw != that.raw; }
 
-  bool operator<(const HomopolymerRun &that) const {
-    return raw < that.raw;
-  }
+  bool operator<(const HomopolymerRun &that) const { return raw < that.raw; }
 
-  std::string str() const {
-    return std::string(len, ::nucl(nucl));
-  }
+  inline char Nucl() const { return nucl; }
+
+  inline char Len() const { return len; }
+
+  std::string str() const { return std::string(len, ::nucl(nucl)); }
 };
 
 namespace iontorrent {
-  // Container shall have push_back method
-  template <typename Container>
-  void toHomopolymerRuns(const std::string &seq, Container& runs) {
-    if (seq.empty())
-      return;
-
-    char nucl = seq[0];
-    uint8_t len = 1;
-    for (size_t i = 1; i < seq.size(); ++i) {
-      if (seq[i] != nucl) {
-        runs.push_back(HomopolymerRun(dignucl(nucl), len));
-        len = 1;
-        nucl = seq[i];
-      } else {
-        ++len;
-      }
-    }
-    if (len > 0) {
+// Container shall have push_back method
+template <typename Container>
+void toHomopolymerRuns(const std::string &seq, Container &runs) {
+  if (seq.empty()) return;
+
+  char nucl = seq[0];
+  uint8_t len = 1;
+  for (size_t i = 1; i < seq.size(); ++i) {
+    if (seq[i] != nucl) {
       runs.push_back(HomopolymerRun(dignucl(nucl), len));
+      len = 1;
+      nucl = seq[i];
+    } else {
+      ++len;
     }
   }
+  if (len > 0) {
+    runs.push_back(HomopolymerRun(dignucl(nucl), len));
+  }
+}
 
-};
+};  // namespace iontorrent
 
 template <size_t N>
 class HSeq {
@@ -82,14 +76,12 @@
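A minimal, self-contained sketch of the run-length packing implemented by HomopolymerRun above; the 2-bit codes (A=0, C=1, G=2, T=3) match dignucl(), while the Run struct and the main() driver are illustrative stand-ins rather than SPAdes code:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// One homopolymer run packed into a single byte: 6 bits of length and
// 2 bits of nucleotide, mirroring the bit-field layout of HomopolymerRun.
struct Run {
  uint8_t len : 6;
  uint8_t nucl : 2;
};
static_assert(sizeof(Run) == 1, "a run occupies exactly one byte");

int main() {
  const std::string seq = "AAACGGT";  // compresses to the runs A3 C1 G2 T1
  const std::string codes = "ACGT";   // index == 2-bit code, as in dignucl()
  std::vector<Run> runs;
  char cur = seq[0];
  uint8_t len = 1;
  for (size_t i = 1; i < seq.size(); ++i) {
    if (seq[i] == cur) {
      ++len;  // extend the current homopolymer run
    } else {  // run boundary: emit the finished run, start a new one
      runs.push_back(Run{uint8_t(len & 63), uint8_t(codes.find(cur) & 3)});
      cur = seq[i];
      len = 1;
    }
  }
  runs.push_back(Run{uint8_t(len & 63), uint8_t(codes.find(cur) & 3)});
  for (Run r : runs) std::cout << codes[r.nucl] << unsigned(r.len) << ' ';
  std::cout << '\n';  // prints: A3 C1 G2 T1
}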
 private:
  StorageType data_;
 
-  const static size_t PrimeNum = 239;
-
 public:
  HSeq() {}
 
-  HSeq(typename StorageType::const_iterator Start,
-       typename StorageType::const_iterator End) {
-    std::copy(Start, End, data_.begin());
+  template <class Iterator>
+  HSeq(Iterator start, Iterator end) {
+    std::copy(start, end, data_.begin());
  }
 
  typedef HomopolymerRun DataType;
@@ -101,13 +93,9 @@
     return N * sizeof(HomopolymerRun);
   }
 
-  typename StorageType::const_iterator begin() const {
-    return data_.begin();
-  }
+  typename StorageType::const_iterator begin() const { return data_.begin(); }
 
-  typename StorageType::const_iterator end() const {
-    return data_.end();
-  }
+  typename StorageType::const_iterator end() const { return data_.end(); }
 
   typename StorageType::const_reverse_iterator rbegin() const {
     return data_.rbegin();
@@ -117,21 +105,13 @@
     return data_.rend();
   }
 
-  const HomopolymerRun *data() const {
-    return data_.data();
-  }
+  const HomopolymerRun *data() const { return data_.data(); }
 
-  size_t data_size() const {
-    return DataSize;
-  }
+  size_t data_size() const { return DataSize; }
 
-  HomopolymerRun &operator[](size_t idx) {
-    return data_[idx];
-  }
+  HomopolymerRun &operator[](size_t idx) { return data_[idx]; }
 
-  const HomopolymerRun &operator[](size_t idx) const {
-    return data_[idx];
-  }
+  const HomopolymerRun &operator[](size_t idx) const { return data_[idx]; }
 
   HSeq operator!() const {
     HSeq res(*this);
@@ -144,56 +124,97 @@
       res[N - i - 1] = front;
     }
 
-    if (N & 1)
-      res[N/2].nucl = complement(res[N/2].nucl) & 3;
+    if (N & 1) res[N / 2].nucl = complement(res[N / 2].nucl) & 3;
 
     return res;
   }
 
   HSeq operator<<(char nucl) const {
-    if (is_nucl(nucl))
-      nucl = dignucl(nucl);
+    if (is_nucl(nucl)) nucl = dignucl(nucl);
 
     HSeq res(*this);
     // Easy case - just add to run
-    HomopolymerRun &last = res[N-1];
+    HomopolymerRun &last = res[N - 1];
     if (last.nucl == nucl) {
       last.len += 1;
       return res;
     }
 
     // Hard case - have to shift the stuff
-    for (size_t i = 0; i < N - 1; ++i)
-      res[i] = res[i + 1];
+    for (size_t i = 0; i < N - 1; ++i) res[i] = res[i + 1];
     res[N - 1].nucl = nucl;
     res[N - 1].len = 1;
 
     return res;
   }
 
-  HSeq& operator<<=(char nucl) {
-    if (is_nucl(nucl))
-      nucl = dignucl(nucl);
+  HSeq operator<<(HomopolymerRun run) const {
+    HSeq res(*this);
+    // Easy case - just add to run
+    HomopolymerRun &last = res[N - 1];
+    if (last.nucl == run.nucl) {
+      last.len += run.len;
+      return res;
+    }
+
+    // Hard case - have to shift the stuff
+    for (size_t i = 0; i < N - 1; ++i) res[i] = res[i + 1];
+    res[N - 1] = run;
+
+    return res;
+  }
+
+  HSeq &operator<<=(char nucl) {
+    if (is_nucl(nucl)) nucl = dignucl(nucl);
 
     // Easy case - just add to run
-    HomopolymerRun &last = data_[N-1];
+    HomopolymerRun &last = data_[N - 1];
    if (last.nucl == nucl) {
       last.len = (last.len + 1) & 63;
       return *this;
     }
 
     // Hard case - have to shift the stuff
-    for (size_t i = 0; i < N - 1; ++i)
-      data_[i] = data_[i + 1];
+    for (size_t i = 0; i < N - 1; ++i) data_[i] = data_[i + 1];
     data_[N - 1].nucl = nucl & 3;
     data_[N - 1].len = 1;
     return *this;
   }
 
+  HSeq &operator<<=(HomopolymerRun run) {
+    // Easy case - just add to run
+    HomopolymerRun &last = data_[N - 1];
+    if (last.nucl == run.nucl) {
+      last.len = (last.len + run.len) & 63;
+      return *this;
+    }
+
+    // Hard case - have to shift the stuff
+    for (size_t i = 0; i < N - 1; ++i) data_[i] = data_[i + 1];
+    data_[N - 1] = run;
+    return *this;
+  }
+
+  HSeq operator>>(HomopolymerRun run) const {
+    HSeq res(*this);
+    // Easy case - just add to run
+    HomopolymerRun &first = res[0];
+    if (first.nucl == run.nucl) {
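+      // Same-nucleotide case: the incoming run merges into the first run and
+      // only its length grows. Note the sum is not masked to the 6-bit field
+      // (unlike operator<<=), so callers must keep merged runs below 64 bases
+      // to avoid overflowing HomopolymerRun::len.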
+      first.len += run.len;
+      return res;
+    }
+
+    // Hard case - have to shift the stuff
+    for (size_t i = 0; i < N - 1; ++i) res[i + 1] = res[i];
+    res[0].nucl = run.nucl;
+    res[0].len = run.len;
+
+    return res;
+  }
+
   HSeq operator>>(char nucl) const {
-    if (is_nucl(nucl))
-      nucl = dignucl(nucl);
+    if (is_nucl(nucl)) nucl = dignucl(nucl);
 
     HSeq res(*this);
     // Easy case - just add to run
@@ -204,39 +225,40 @@
     }
 
     // Hard case - have to shift the stuff
-    for (size_t i = 0; i < N - 1; ++i)
-      res[i + 1] = res[i];
+    for (size_t i = 0; i < N - 1; ++i) res[i + 1] = res[i];
     res[0].nucl = nucl;
     res[0].len = 1;
 
     return res;
   }
 
-  bool operator==(const HSeq &that) const {
-    return (data_ == that.data_);
-  }
-  bool operator!=(const HSeq &that) const {
-    return (data_ != that.data_);
-  }
+  bool operator==(const HSeq &that) const { return (data_ == that.data_); }
+  bool operator!=(const HSeq &that) const { return (data_ != that.data_); }
 
   size_t size() const {
     size_t res = 0;
-    for (size_t i = 0; i < N; ++i)
-      res += data_[i].len;
+    for (size_t i = 0; i < N; ++i) res += data_[i].len;
+
+    return res;
+  }
+
+  size_t max_run_length() const {
+    size_t res = 0;
+    for (size_t i = 0; i < N; ++i) res = std::max((size_t)(data_[i].len), res);
 
     return res;
   }
 
   std::string str() const {
     std::string res;
-    for (size_t i = 0; i < N; ++i)
-      res += data_[i].str();
+    for (size_t i = 0; i < N; ++i) res += data_[i].str();
     return res;
   }
 
-  static size_t GetHash(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) {
-    return CityHash64WithSeed((const char*)data, sz * sizeof(DataType), 0x9E3779B9 ^ seed);
+  static size_t GetHash(const DataType *data, size_t sz = DataSize,
+                        uint32_t seed = 0) {
+    return CityHash64WithSeed((const char *)data, sz * sizeof(DataType),
+                              0x9E3779B9 ^ seed);
   }
 
   size_t GetHash(uint32_t seed = 0) const {
@@ -248,7 +270,8 @@
     return seq.GetHash(seed);
   }
 
-  size_t operator()(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) const {
+  size_t operator()(const DataType *data, size_t sz = DataSize,
+                    uint32_t seed = 0) const {
     return GetHash(data, sz, seed);
   }
 };
@@ -257,8 +280,7 @@
   bool operator()(const HSeq &l, const HSeq &r) const {
     for (size_t i = 0; i < N; ++i) {
       const uint8_t lr = l[i].raw, rr = r[i].raw;
-      if (lr != rr)
-        return lr < rr;
+      if (lr != rr) return lr < rr;
     }
 
     return false;
@@ -266,24 +288,24 @@
   };
 };
 
-template<size_t N>
-std::ostream& operator<<(std::ostream& os, const HSeq<N> &seq) {
+template <size_t N>
+std::ostream &operator<<(std::ostream &os, const HSeq<N> &seq) {
   os << seq.str();
   return os;
 }
 
 namespace internal {
-  template<size_t N>
-  inline size_t getSize(const hammer::HSeq<N> &) {
-    return N;
-  }
+template <size_t N>
+inline size_t getSize(const hammer::HSeq<N> &) {
+  return N;
+}
 
-  template<class T>
-  inline size_t getSize(const T& a) {
-    return a.size();
-  }
+template <class T>
+inline size_t getSize(const T &a) {
+  return a.size();
 }
+}  // namespace internal
 
-};
+};  // namespace hammer
 
-#endif // __HAMMER_HSEQ_HPP__
+#endif  // __HAMMER_HSEQ_HPP__
diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/io_read_corrector.hpp spades-3.11.1+dfsg/src/projects/ionhammer/io_read_corrector.hpp
--- spades-3.10.1+dfsg/src/projects/ionhammer/io_read_corrector.hpp 1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/ionhammer/io_read_corrector.hpp 2017-09-28 09:05:13.000000000 +0000
@@ -0,0 +1,230 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __HAMMER_IT_IO_READ_CORRECTOR_HPP__
+#define __HAMMER_IT_IO_READ_CORRECTOR_HPP__
+
+#include "HSeq.hpp"
+#include "config_struct.hpp"
+#include "consensus.hpp"
+#include "flow_space_read.hpp"
+#include "hkmer_distance.hpp"
+#include "valid_hkmer_generator.hpp"
+
+#include
+#include
+#include
+
+#include
+#include
+#include "kmer_data.hpp"
+#include "seqeval/BaseHypothesisEvaluator.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#if 1
+#include
+#include
+#endif
+
+namespace hammer {
+namespace correction {
+
+template <class ReadCorrector>
+class SingleReadCorrector {
+  const KMerData& hkmer_data_;
+  using PenaltyCalcer = typename ReadCorrector::PenaltyCalcer;
+  using Factory = typename PenaltyCalcer::PenaltyCalcerFactory;
+
+ public:
+  struct ReadSelectionPredicate {
+    virtual bool operator()(const io::SingleRead& read) = 0;
+  };
+
+  struct DebugOutputPredicate : public ReadSelectionPredicate {};
+
+  struct NoDebug : public DebugOutputPredicate {
+    virtual bool operator()(const io::SingleRead&) { return false; }
+  };
+
+  struct FullDebug : public DebugOutputPredicate {
+    virtual bool operator()(const io::SingleRead&) { return true; }
+  };
+
+  class DebugIfContains : public DebugOutputPredicate {
+    Sequence Needle;
+    Sequence NeedleRc;
+
+   public:
+    DebugIfContains(const Sequence& seq) : Needle(seq), NeedleRc(!seq) {}
+
+    virtual bool operator()(const io::SingleRead& read) {
+      auto readSeq = read.sequence();
+      if (readSeq.size() < Needle.size()) return false;
+      if (readSeq.find(Needle, 0) != -1ULL) return true;
+      return readSeq.find(NeedleRc, 0) != -1ULL ? true : false;
+    }
+  };
+
+  struct SelectPredicate : public ReadSelectionPredicate {};
+
+  struct SelectAll : public SelectPredicate {
+    virtual bool operator()(const io::SingleRead&) { return true; }
+  };
+
+  class SelectByName : public SelectPredicate {
+    std::set<std::string> Names;
+
+   public:
+    SelectByName(const std::set<std::string>& names) : Names(names) {}
+
+    virtual bool operator()(const io::SingleRead& r) {
+      return Names.find(r.name()) != Names.end();
+    }
+  };
+
+ private:
+  BamTools::SamHeader* sam_header_ptr_;
+  DebugOutputPredicate& debug_predicate_;
+  SelectPredicate& select_predicate_;
+  const Factory& penalty_factory_;
+  ReadCorrector read_corrector_;
+
+ public:
+  SingleReadCorrector(const KMerData& kmer_data,
+                      const Factory& penalty_factory,
+                      BamTools::SamHeader* sam_header,
+                      DebugOutputPredicate& debug, SelectPredicate& select)
+      : hkmer_data_(kmer_data),
+        sam_header_ptr_(sam_header),
+        debug_predicate_(debug),
+        select_predicate_(select),
+        penalty_factory_(penalty_factory),
+        read_corrector_(hkmer_data_, penalty_factory_) {}
+
+  SingleReadCorrector(const KMerData& kmer_data,
+                      const Factory& penalty_factory,
+                      DebugOutputPredicate& debug, SelectPredicate& select)
+      : hkmer_data_(kmer_data),
+        sam_header_ptr_(NULL),
+        debug_predicate_(debug),
+        select_predicate_(select),
+        penalty_factory_(penalty_factory),
+        read_corrector_(hkmer_data_, penalty_factory_) {}
+
+  std::unique_ptr<io::SingleRead> operator()(std::unique_ptr<io::SingleRead> r) {
+    return SingleReadCorrector::operator()(*r);
+  }
+
+  std::unique_ptr<io::SingleRead> operator()(const io::SingleRead& read) {
+    if (!select_predicate_(read)) {
+      return nullptr;
+    }
+
+    bool debug_mode = debug_predicate_(read);
+    if (debug_mode) {
+      std::cerr << "=============================================" << std::endl;
+      std::cerr << '>' << read.name() << '\n'
+                << read.GetSequenceString() << std::endl;
+    }
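+    // Run the actual correction. An empty corrected sequence means the
+    // corrector could not produce a usable read, which operator() signals
+    // with nullptr so the read is dropped from the corrected output.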
+    auto corrected_seq = read_corrector_.Correct(
+        read, cfg::get().keep_uncorrected_ends, debug_mode);
+
+    if (corrected_seq.empty()) {
+      return nullptr;
+    }
+
+    auto result = std::unique_ptr<io::SingleRead>(
+        new io::SingleRead(read.name(), corrected_seq));
+    return result;
+  }
+
+  std::unique_ptr<io::BamRead> operator()(
+      std::unique_ptr<BamTools::BamAlignment> alignment) {
+    VERIFY(sam_header_ptr_);
+    io::SingleRead r(alignment->Name, alignment->QueryBases);
+    // reverse strand means we're working with a mapped BAM, might be
+    // the case for datasets downloaded from IonCommunity
+    if (alignment->IsReverseStrand()) r = !r;
+    auto corrected_r = SingleReadCorrector::operator()(r);
+    std::string rg;
+    if (!alignment->GetTag("RG", rg) || !corrected_r) return nullptr;
+    auto flow_order = sam_header_ptr_->ReadGroups[rg].FlowOrder;
+
+    float delta_score, fit_score;
+    auto seq = corrected_r->GetSequenceString();
+    if (alignment->IsReverseStrand()) {
+      std::reverse(seq.begin(), seq.end());
+      for (auto it = seq.begin(); it != seq.end(); ++it) {
+        switch (*it) {
+          case 'A':
+            *it = 'T';
+            break;
+          case 'C':
+            *it = 'G';
+            break;
+          case 'G':
+            *it = 'C';
+            break;
+          case 'T':
+            *it = 'A';
+            break;
+          default:
+            break;
+        }
+      }
+    }
+
+    BaseHypothesisEvaluator(*alignment, flow_order, seq, delta_score, fit_score,
+                            0);
+    std::stringstream ss;
+    ss << alignment->Name << "_" << delta_score << "_" << fit_score;
+    alignment->Name = ss.str();
+    if (delta_score >= cfg::get().delta_score_threshold)
+      return std::unique_ptr<io::BamRead>(new io::BamRead(*alignment));
+
+    BamTools::BamAlignment corrected(*alignment);
+    corrected.QueryBases = corrected_r->GetSequenceString();
+    return std::unique_ptr<io::BamRead>(new io::BamRead(corrected));
+  }
+};
+
+template <class ReadCorrector>
+class PairedReadCorrector : public SingleReadCorrector<ReadCorrector> {
+public:
+
+  using PenaltyCalcer = typename ReadCorrector::PenaltyCalcer;
+  using Factory = typename PenaltyCalcer::PenaltyCalcerFactory;
+
+ public:
+  PairedReadCorrector(
+      const KMerData& kmerData, const Factory& penaltyFactory,
+      typename SingleReadCorrector::DebugOutputPredicate& debug,
+      typename SingleReadCorrector::SelectPredicate& select)
+      : SingleReadCorrector<ReadCorrector>(kmerData, penaltyFactory, debug,
+                                           select) {}
+
+  std::unique_ptr<io::PairedRead> operator()(std::unique_ptr<io::PairedRead> r) {
+    auto corrected_r = SingleReadCorrector::operator()(r->first());
+    auto corrected_l = SingleReadCorrector::operator()(r->second());
+
+    if (!corrected_r || !corrected_l) return nullptr;
+
+    return std::unique_ptr<io::PairedRead>(
+        new io::PairedRead(*corrected_r, *corrected_l, 0));
+  }
+};
+
+};  // namespace correction
+};  // namespace hammer
+#endif  // __HAMMER_IT_IO_READ_CORRECTOR_HPP__
diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/kmer_data.cpp spades-3.11.1+dfsg/src/projects/ionhammer/kmer_data.cpp
--- spades-3.10.1+dfsg/src/projects/ionhammer/kmer_data.cpp 2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/ionhammer/kmer_data.cpp 2017-09-28 09:05:13.000000000 +0000
@@ -9,8 +9,10 @@
 #include "config_struct.hpp"
 #include "valid_hkmer_generator.hpp"
 
-#include "utils/mph_index/kmer_index_builder.hpp"
+#include "utils/kmer_mph/kmer_index_builder.hpp"
 
+#include <mutex>
+#include <random>
 #include "io/kmers/mmapped_writer.hpp"
 #include "io/reads/file_reader.hpp"
 #include "io/reads/read_processor.hpp"
@@ -19,12 +21,12 @@
 
 class BufferFiller;
 
-class HammerKMerSplitter : public KMerSortingSplitter<hammer::HKMer> {
+class HammerKMerSplitter : public utils::KMerSortingSplitter<hammer::HKMer> {
  public:
   HammerKMerSplitter(const std::string &work_dir)
       : KMerSortingSplitter(work_dir, hammer::K) {}
 
-  path::files_t Split(size_t num_files) override;
+  fs::files_t
Split(size_t num_files, unsigned nthreads) override; friend class BufferFiller; }; @@ -35,22 +37,22 @@ public: BufferFiller(HammerKMerSplitter &splitter) - : processed_(0), splitter_(splitter) {} + : processed_(0), splitter_(splitter) {} size_t processed() const { return processed_; } - bool operator()(std::unique_ptr r) { + bool operator()(std::unique_ptr r) { ValidHKMerGenerator gen(*r); unsigned thread_id = omp_get_thread_num(); -# pragma omp atomic +#pragma omp atomic processed_ += 1; bool stop = false; while (gen.HasMore()) { HKMer seq = gen.kmer(); - stop |= splitter_.push_back_internal( seq, thread_id); + stop |= splitter_.push_back_internal(seq, thread_id); stop |= splitter_.push_back_internal(!seq, thread_id); gen.Next(); @@ -60,13 +62,10 @@ } }; -path::files_t HammerKMerSplitter::Split(size_t num_files) { - unsigned nthreads = cfg::get().max_nthreads; +fs::files_t HammerKMerSplitter::Split(size_t num_files, unsigned nthreads) { size_t reads_buffer_size = cfg::get().count_split_buffer; - INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while."); - - path::files_t out = PrepareBuffers(num_files, nthreads, reads_buffer_size); + fs::files_t out = PrepareBuffers(num_files, nthreads, reads_buffer_size); size_t n = 15; BufferFiller filler(*this); @@ -93,48 +92,71 @@ } static inline void Merge(KMerStat &lhs, const KMerStat &rhs) { - if (lhs.count == 0) - lhs.kmer = rhs.kmer; + if (lhs.count == 0) lhs.kmer = rhs.kmer; lhs.count += rhs.count; - lhs.qual *= rhs.qual; + lhs.qual += rhs.qual; } static void PushKMer(KMerData &data, HKMer kmer, double qual) { KMerStat &kmc = data[kmer]; kmc.lock(); - Merge(kmc, KMerStat(1, kmer, qual)); + Merge(kmc, KMerStat(1, kmer, (float)qual)); kmc.unlock(); } static void PushKMerRC(KMerData &data, HKMer kmer, double qual) { - kmer = !kmer; - - KMerStat &kmc = data[kmer]; - kmc.lock(); - Merge(kmc, KMerStat(1, kmer, qual)); - kmc.unlock(); + PushKMer(data, !kmer, qual); } class KMerDataFiller { - KMerData &data_; + KMerData &Data; + mutable std::default_random_engine RandomEngine; + mutable std::uniform_real_distribution UniformRandGenerator; + mutable std::mutex Lock; + double SampleRate; public: - KMerDataFiller(KMerData &data) - : data_(data) {} + KMerDataFiller(KMerData &data, double sampleRate = 1.0) + : Data(data), + RandomEngine(42), + UniformRandGenerator(0, 1), + SampleRate(sampleRate) {} + + double NextUniform() const { + std::lock_guard guard(Lock); + return UniformRandGenerator(RandomEngine); + } - bool operator()(std::unique_ptr r) const { + bool operator()(std::unique_ptr &&r) const { ValidHKMerGenerator gen(*r); - while (gen.HasMore()) { - HKMer kmer = gen.kmer(); - double correct = gen.correct_probability(); - PushKMer(data_, kmer, 1 - correct); - PushKMerRC(data_, kmer, 1 - correct); + // tiny quality regularization + const double decay = 0.9999; + double prior = 1.0; - gen.Next(); + bool skipRead = SampleRate < 1.0 && (NextUniform() > SampleRate); + + if (skipRead) { + return false; } + while (gen.HasMore()) { + const HKMer kmer = gen.kmer(); + const double p = gen.correct_probability(); + gen.Next(); + + assert(p < 1.0); + assert(p >= 0); + const double correct = p * prior; + + prior *= decay; + { + PushKMer(Data, kmer, log(1 - correct)); + + PushKMerRC(Data, kmer, log(1 - correct)); + } + } // Do not stop return false; } @@ -142,31 +164,44 @@ void KMerDataCounter::FillKMerData(KMerData &data) { HammerKMerSplitter splitter(cfg::get().working_dir); - KMerDiskCounter counter(cfg::get().working_dir, 
splitter); - size_t sz = KMerIndexBuilder(cfg::get().working_dir, num_files_, cfg::get().max_nthreads).BuildIndex(data.index_, counter); + utils::KMerDiskCounter counter(cfg::get().working_dir, splitter); + + size_t sz = utils::KMerIndexBuilder(cfg::get().working_dir, num_files_, cfg::get().max_nthreads).BuildIndex(data.index_, counter); + // Now use the index to fill the kmer quality information. INFO("Collecting K-mer information, this takes a while."); data.data_.resize(sz); - const auto& dataset = cfg::get().dataset; - for (auto it = dataset.reads_begin(), et = dataset.reads_end(); it != et; ++it) { + const auto &dataset = cfg::get().dataset; + for (auto it = dataset.reads_begin(), et = dataset.reads_end(); it != et; + ++it) { INFO("Processing " << *it); io::FileReadStream irs(*it, io::PhredOffset); - KMerDataFiller filler(data); + KMerDataFiller filler(data, cfg::get().sample_rate); hammer::ReadProcessor(cfg::get().max_nthreads).Run(irs, filler); } INFO("Collection done, postprocessing."); size_t singletons = 0; + size_t skipped = 0; for (size_t i = 0; i < data.size(); ++i) { - VERIFY(data[i].count); - - if (data[i].count == 1) + if (data[i].count == 1) { singletons += 1; + } + if (data[i].count == 0) { + skipped += 1; + } } - INFO("Merge done. There are " << data.size() << " kmers in total. " - "Among them " << singletons << " (" << 100.0 * double(singletons) / double(data.size()) << "%) are singletons."); + INFO("Merge done. There are " + << data.size() + << " kmers in total. " + "Among them " + << singletons << " (" << 100.0 * double(singletons) / double(data.size()) + << "%) are singletons." + << "Among them " << skipped << " (" + << 100.0 * double(skipped) / double(data.size()) + << "%) are skipped during sampling."); } diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/kmer_data.hpp spades-3.11.1+dfsg/src/projects/ionhammer/kmer_data.hpp --- spades-3.10.1+dfsg/src/projects/ionhammer/kmer_data.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/kmer_data.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -8,7 +8,11 @@ #ifndef __HAMMER_KMER_DATA_HPP__ #define __HAMMER_KMER_DATA_HPP__ -#include "utils/mph_index/kmer_index.hpp" + +#include "config_struct.hpp" +#include "utils/kmer_mph/kmer_index.hpp" +#include "utils/logger/logger.hpp" + #include "hkmer.hpp" #include @@ -18,28 +22,39 @@ namespace hammer { struct KMerStat { - size_t count; + int count; HKMer kmer; - double qual; - unsigned changeto; + float qual; + float posterior_genomic_ll = -10000; + bool dist_one_subcluster = false; uint8_t lock_; - KMerStat(size_t count = 0, HKMer kmer = HKMer(), double qual = 1.0, unsigned changeto = -1) - : count(count), kmer(kmer), qual(qual), changeto(changeto), lock_(0) { } + KMerStat(int count = 0, HKMer kmer = HKMer(), float qual = 0.0) + : count(count), kmer(kmer), qual(qual), lock_(0) {} void lock() { - while (__sync_val_compare_and_swap(&lock_, 0, 1) == 1) - sched_yield(); + while (__sync_val_compare_and_swap(&lock_, 0, 1) == 1) sched_yield(); } void unlock() { lock_ = 0; __sync_synchronize(); } + + bool good() const { + return posterior_genomic_ll > goodThreshold(); // log(0.5) + } + + static double goodThreshold() { return cfg::get().good_threshold; } + + bool skip() const { + return posterior_genomic_ll > cfg::get().skip_threshold && !dist_one_subcluster; // log(0.9) + } + }; - -}; -typedef KMerIndex > HammerKMerIndex; +}; // namespace hammer + +typedef utils::KMerIndex > HammerKMerIndex; class KMerData { typedef std::vector KMerDataStorageType; @@ -55,7 
+70,7 @@ KMerDataStorageType().swap(data_); KMerDataStorageType().swap(push_back_buffer_); } - size_t push_back(const hammer::KMerStat &k) { + size_t push_back(const hammer::KMerStat& k) { push_back_buffer_.push_back(k); return data_.size() + push_back_buffer_.size() - 1; @@ -69,24 +84,35 @@ size_t dsz = data_.size(); return (idx < dsz ? data_[idx] : push_back_buffer_[idx - dsz]); } - hammer::KMerStat& operator[](hammer::HKMer s) { return operator[](index_.seq_idx(s)); } - const hammer::KMerStat& operator[](hammer::HKMer s) const { return operator[](index_.seq_idx(s)); } - size_t seq_idx(hammer::HKMer s) const { return index_.seq_idx(s); } + hammer::KMerStat& operator[](const hammer::HKMer& s) { + return operator[](index_.seq_idx(s)); + } + const hammer::KMerStat& operator[](const hammer::HKMer& s) const { + return operator[](index_.seq_idx(s)); + } + size_t seq_idx(const hammer::HKMer& s) const { return index_.seq_idx(s); } + + size_t checking_seq_idx(const hammer::HKMer& s) const { + size_t idx = seq_idx(s); + if (idx >= size()) return -1ULL; + + return (s == operator[](idx).kmer ? idx : -1ULL); + } template - void binary_write(Writer &os) { + void binary_write(Writer& os) { size_t sz = data_.size(); os.write((char*)&sz, sizeof(sz)); - os.write((char*)&data_[0], sz*sizeof(data_[0])); + os.write((char*)&data_[0], sz * sizeof(data_[0])); index_.serialize(os); } template - void binary_read(Reader &is) { + void binary_read(Reader& is) { size_t sz = 0; is.read((char*)&sz, sizeof(sz)); data_.resize(sz); - is.read((char*)&data_[0], sz*sizeof(data_[0])); + is.read((char*)&data_[0], sz * sizeof(data_[0])); index_.deserialize(is); } @@ -99,13 +125,14 @@ }; struct CountCmp { - const KMerData &kmer_data_; + const KMerData& kmer_data_; - CountCmp(const KMerData &kmer_data) - : kmer_data_(kmer_data) {} + CountCmp(const KMerData& kmer_data) : kmer_data_(kmer_data) {} bool operator()(unsigned lhs, unsigned rhs) { - return kmer_data_[lhs].count > kmer_data_[rhs].count; + return (kmer_data_[lhs].count != kmer_data_[rhs].count) + ? kmer_data_[lhs].count > kmer_data_[rhs].count + : kmer_data_[lhs].kmer.size() < kmer_data_[rhs].kmer.size(); } }; @@ -115,10 +142,10 @@ public: KMerDataCounter(unsigned num_files) : num_files_(num_files) {} - void FillKMerData(KMerData &data); + void FillKMerData(KMerData& data); private: DECL_LOGGER("K-mer Counting"); }; -#endif // __HAMMER_KMER_DATA_HPP__ +#endif // __HAMMER_KMER_DATA_HPP__ diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/kmer_evaluator.cpp spades-3.11.1+dfsg/src/projects/ionhammer/kmer_evaluator.cpp --- spades-3.10.1+dfsg/src/projects/ionhammer/kmer_evaluator.cpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/kmer_evaluator.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,53 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#include +#include "hkmer.hpp" +#include "io/reads/read_processor.hpp" +#include "kmer_helpers.h" + +void printUsage() { + std::cerr << "usage: ./kmer_evaluator " + << std::endl; +} + +void runComparison(const HKMerSet& reference_kmers, + const HKMerSet& contig_kmers) { + size_t total_genomic = reference_kmers.size(); + size_t total_contig = contig_kmers.size(); + + size_t contig_genomic = 0; + + for (auto it = reference_kmers.cbegin(), et = reference_kmers.cend(); + it != et; ++it) + if (contig_kmers.find(*it) != contig_kmers.end()) ++contig_genomic; + + long contig_non_genomic = total_contig - contig_genomic; + + std::cout << "Reference kmers: " << total_genomic << std::endl; + std::cout << "Contig kmers: " << total_contig << std::endl; + std::cout << " Genomic: " << contig_genomic << " (" + << ((double)contig_genomic * 100.0 / (double)total_genomic) << "%)" << std::endl; + std::cout << " Non-genomic: " << contig_non_genomic << std::endl; +} + +int main(int argc, char** argv) { + if (argc < 3) { + printUsage(); + return 0; + } + + HKMerSet reference, contigs; + reference.reserve(10000000); + contigs.reserve(200000000); + std::cout << "Filling set of reference kmers..." << std::endl; + FillSet(reference, argv[1]); + std::cout << "Filling set of contig kmers..." << std::endl; + FillSet(contigs, argv[2]); + std::cout << "Running comparison " << std::endl; + runComparison(reference, contigs); +} diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/kmer_helpers.cpp spades-3.11.1+dfsg/src/projects/ionhammer/kmer_helpers.cpp --- spades-3.10.1+dfsg/src/projects/ionhammer/kmer_helpers.cpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/kmer_helpers.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,5 @@ +// +// Created by Vasiliy Ershov on 10/07/16. +// + +#include "kmer_helpers.h" diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/kmer_helpers.h spades-3.11.1+dfsg/src/projects/ionhammer/kmer_helpers.h --- spades-3.10.1+dfsg/src/projects/ionhammer/kmer_helpers.h 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/kmer_helpers.h 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,69 @@ +// +// Created by Vasiliy Ershov on 10/07/16. 
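+//
+// Usage sketch (hypothetical file name): FillSet() below reads a FASTA/FASTQ
+// file with a fixed pool of 16 threads and inserts every valid homopolymer
+// k-mer together with its reverse complement, e.g.
+//
+//   HKMerSet kmers;
+//   kmers.reserve(10000000);
+//   FillSet(kmers, "reference.fasta");
+//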
+// + +#ifndef PROJECT_KMER_HELPERS_H +#define PROJECT_KMER_HELPERS_H + +#include +#include +#include "hkmer.hpp" +#include "io/reads/file_reader.hpp" +#include "io/reads/read_processor.hpp" +#include "valid_hkmer_generator.hpp" + +using HKMerSet = std::unordered_set; + +namespace std { +template <> +struct hash > { + size_t operator()(hammer::HSeq seq) const { return seq.GetHash(); } +}; +} // namespace std + +class SetFiller { + private: + std::unordered_set& kmers_; + std::mutex mutex_; + + private: + void ProcessString(const std::string& seq) { + if (seq.empty()) { + return; + } + std::vector kmers; + kmers.reserve(seq.size()); + ValidHKMerGenerator generator(seq.data(), nullptr, seq.size()); + while (generator.HasMore()) { + kmers.push_back(generator.kmer()); + kmers.push_back(!generator.kmer()); + generator.Next(); + } + PushKMers(kmers); + } + + void PushKMers(const std::vector& hkmers) { + std::lock_guard lock(mutex_); + for (auto it = hkmers.begin(); it != hkmers.end(); ++it) { + auto& hkmer = *it; + kmers_.insert(hkmer); + } + } + + public: + SetFiller(std::unordered_set& kmers) : kmers_(kmers) {} + + bool operator()(std::unique_ptr&& read) { + ProcessString(read->GetSequenceString()); + return false; + } +}; + +inline void FillSet(HKMerSet& kmers, const char* filename) { + const unsigned num_threads = 16; + SetFiller filler(kmers); + io::FileReadStream irs(filename, io::PhredOffset); + hammer::ReadProcessor(num_threads).Run(irs, filler); +} + +#endif // PROJECT_KMER_HELPERS_H diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/main.cpp spades-3.11.1+dfsg/src/projects/ionhammer/main.cpp --- spades-3.10.1+dfsg/src/projects/ionhammer/main.cpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/main.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -13,22 +13,24 @@ #include "io/reads/osequencestream.hpp" #include "io/reads/read_processor.hpp" -#include "common/adt/concurrent_dsu.hpp" +#include "adt/concurrent_dsu.hpp" #include "utils/segfault_handler.hpp" -#include "utils/memory_limit.hpp" +#include "utils/perf/memory_limit.hpp" #include "HSeq.hpp" +#include "config_struct.hpp" +#include "err_helper_table.hpp" +#include "io_read_corrector.hpp" #include "kmer_data.hpp" -#include "hamcluster.hpp" +#include "penalty_estimator.hpp" +#include "read_corrector_new.hpp" #include "subcluster.hpp" -#include "err_helper_table.hpp" -#include "read_corrector.hpp" -#include "expander.hpp" -#include "config_struct.hpp" -#include "utils/openmp_wrapper.h" +#include "utils/parallel/openmp_wrapper.h" +#include "hamcluster_1.h" +#include "quality_metrics.h" #include "version.hpp" #include @@ -37,266 +39,373 @@ #include #include +#include "gamma_poisson_model.hpp" +#include "normal_quality_model.hpp" + void create_console_logger() { using namespace logging; - logger *lg = create_logger(""); + logger* lg = create_logger(""); lg->add_writer(std::make_shared()); attach_logger(lg); } struct UfCmp { - bool operator()(const std::vector &lhs, - const std::vector &rhs) { + bool operator()(const std::vector& lhs, + const std::vector& rhs) { return lhs.size() > rhs.size(); } }; -// This is weird workaround for bug in gcc 4.4.7 -static bool stage(hammer_config::HammerStage start, hammer_config::HammerStage current) { - switch (start) { - case hammer_config::HammerStage::KMerCounting: - return true; - case hammer_config::HammerStage::HammingClustering: - return current != hammer_config::HammerStage::KMerCounting; - case hammer_config::HammerStage::SubClustering: - return (current != 
hammer_config::HammerStage::KMerCounting && - current != hammer_config::HammerStage::HammingClustering); - case hammer_config::HammerStage::ReadCorrection: - return current == hammer_config::HammerStage::ReadCorrection; +using namespace n_gamma_poisson_model; + +namespace hammer { +namespace correction { + +using namespace n_gamma_poisson_model; +using namespace n_normal_model; + +class TKMerDataEstimator { + KMerData& Data; + const uint NumFiles; + const hammer_config::hammer_config& Config; + std::vector > Classes; + NormalClusterModel ClusterModel; + + // This is weird workaround for bug in gcc 4.4.7 + static bool stage(hammer_config::HammerStage start, + hammer_config::HammerStage current) { + switch (start) { + case hammer_config::HammerStage::KMerCounting: + return true; + case hammer_config::HammerStage::HammingClustering: + return current != hammer_config::HammerStage::KMerCounting; + case hammer_config::HammerStage::SubClustering: + return (current != hammer_config::HammerStage::KMerCounting && + current != hammer_config::HammerStage::HammingClustering); + case hammer_config::HammerStage::ReadCorrection: + return current == hammer_config::HammerStage::ReadCorrection; + } + assert(0); } - assert(0); -} -int main(int argc, char** argv) { - segfault_handler sh; + void SaveKMerData(const std::string &filename = "count.kmdata") { + INFO("Debug mode on. Saving K-mer index."); + std::ofstream ofs(fs::append_path(cfg::get().working_dir, filename), std::ios::binary); + Data.binary_write(ofs); + } - srand(42); - srandom(42); + void SaveClusters() { + INFO("Debug mode on. Writing down clusters."); + std::ofstream ofs(fs::append_path(Config.working_dir, "hamming.cls"), + std::ios::binary); + const size_t num_classes = Classes.size(); + ofs.write((char*)&num_classes, sizeof(num_classes)); + for (size_t i = 0; i < Classes.size(); ++i) { + size_t sz = Classes[i].size(); + ofs.write((char*)&sz, sizeof(sz)); + ofs.write((char*)&Classes[i][0], sz * sizeof(Classes[i][0])); + } + } - try { - create_console_logger(); + void LoadKMerData(std::string filename) { + INFO("Loading K-mer index."); + std::ifstream ifs(fs::append_path(Config.working_dir, filename), + std::ios::binary); + VERIFY(ifs.good()); + Data.binary_read(ifs); + INFO("Total " << Data.size() << " entries were loader"); + } - std::string config_file = "hammer-it.cfg"; - if (argc > 1) config_file = argv[1]; - INFO("Starting IonHammer, built from " SPADES_GIT_REFSPEC ", git revision " SPADES_GIT_SHA1); - INFO("Loading config from " << config_file.c_str()); - cfg::create_instance(config_file); + void CountKMers() { KMerDataCounter(NumFiles).FillKMerData(Data); } - // hard memory limit - const size_t GB = 1 << 30; - limit_memory(cfg::get().hard_memory_limit * GB); + void ClusterHammingGraph() { + INFO("Clustering Hamming graph."); + { + const auto num_threads = cfg::get().max_nthreads; + TOneErrorClustering oneErrorClustering(Data, num_threads); + oneErrorClustering.FillClasses(Classes); + } + const size_t num_classes = Classes.size(); + INFO("Clustering done. 
Total clusters: " << num_classes); + } + + void LoadClusters() { + INFO("Loading clusters."); + std::ifstream ifs(fs::append_path(Config.working_dir, "hamming.cls"), + std::ios::binary); + VERIFY(ifs.good()); + + size_t num_classes = 0; + ifs.read((char*)&num_classes, sizeof(num_classes)); + Classes.resize(num_classes); + + for (size_t i = 0; i < num_classes; ++i) { + size_t sz = 0; + ifs.read((char*)&sz, sizeof(sz)); + Classes[i].resize(sz); + ifs.read((char*)&Classes[i][0], sz * sizeof(Classes[i][0])); + } + } + + void EstimateGenomicCenters() { + const auto num_threads = cfg::get().max_nthreads; + QualityTransform trans; + n_normal_model::ModelEstimator priorEstimator(Data, cfg::get().max_nthreads, + 50, false); + + ClusterModel = priorEstimator.Estimate(Classes); + + INFO("Subclustering."); + TGenomicHKMersEstimator genomicHKMersEstimator(Data, ClusterModel, cfg::get().center_type); + +#pragma omp parallel for num_threads(num_threads) + for (size_t i = 0; i < Classes.size(); ++i) { + auto& cluster = Classes[i]; + genomicHKMersEstimator.ProceedCluster(cluster); + } + } - KMerData kmer_data; - if (stage(cfg::get().start_stage, hammer_config::HammerStage::KMerCounting)) { - // FIXME: Actually it's num_files here - KMerDataCounter(32).FillKMerData(kmer_data); - if (cfg::get().debug_mode) { - INFO("Debug mode on. Saving K-mer index."); - std::ofstream ofs(path::append_path(cfg::get().working_dir, "count.kmdata"), std::ios::binary); - kmer_data.binary_write(ofs); + void CalcGenomicEstimationQuality(ClusteringQuality& quality) { + const auto num_threads = cfg::get().max_nthreads; + (void)num_threads; +#pragma omp parallel for num_threads(num_threads) + for (size_t idx = 0; idx < Data.size(); ++idx) { + if (Data[idx].count > 3) { + quality.AddKMer(idx); + } + } + } + + public: + TKMerDataEstimator(KMerData& kmerData, + const hammer_config::hammer_config& config, + const uint numFiles = 32) + : Data(kmerData), NumFiles(numFiles), Config(config) {} + + void Estimate() { + if (stage(Config.start_stage, hammer_config::HammerStage::KMerCounting)) { + CountKMers(); + if (Config.debug_mode) { + SaveKMerData("count.kmdata"); } } else { - INFO("Loading K-mer index."); - std::ifstream ifs(path::append_path(cfg::get().working_dir, "count.kmdata"), std::ios::binary); - VERIFY(ifs.good()); - kmer_data.binary_read(ifs); - INFO("Total " << kmer_data.size() << " entries were loader"); - } - - std::vector > classes; - if (stage(cfg::get().start_stage, hammer_config::HammerStage::HammingClustering)) { - ConcurrentDSU uf(kmer_data.size()); - KMerHamClusterer clusterer(cfg::get().tau); - INFO("Clustering Hamming graph."); - clusterer.cluster(path::append_path(cfg::get().working_dir, "kmers.hamcls"), kmer_data, uf); - uf.get_sets(classes); - size_t num_classes = classes.size(); - INFO("Clustering done. Total clusters: " << num_classes); - - if (cfg::get().debug_mode) { - INFO("Debug mode on. 
Writing down clusters."); - std::ofstream ofs(path::append_path(cfg::get().working_dir, "hamming.cls"), std::ios::binary); - - ofs.write((char*)&num_classes, sizeof(num_classes)); - for (size_t i=0; i < classes.size(); ++i) { - size_t sz = classes[i].size(); - ofs.write((char*)&sz, sizeof(sz)); - ofs.write((char*)&classes[i][0], sz * sizeof(classes[i][0])); - } + LoadKMerData("count.kmdata"); + } + + if (stage(Config.start_stage, + hammer_config::HammerStage::HammingClustering)) { + ClusterHammingGraph(); + if (Config.debug_mode) { + SaveClusters(); } } else { - INFO("Loading clusters."); - std::ifstream ifs(path::append_path(cfg::get().working_dir, "hamming.cls"), std::ios::binary); - VERIFY(ifs.good()); - - size_t num_classes = 0; - ifs.read((char*)&num_classes, sizeof(num_classes)); - classes.resize(num_classes); - - for (size_t i = 0; i < num_classes; ++i) { - size_t sz = 0; - ifs.read((char*)&sz, sizeof(sz)); - classes[i].resize(sz); - ifs.read((char*)&classes[i][0], sz * sizeof(classes[i][0])); + LoadClusters(); + } + + std::unique_ptr oracle; + std::unique_ptr clusteringQuality; + std::string oraclePath = cfg::get().oracle_path; + + if (oraclePath.length()) { + oracle.reset(new TGenomReferenceOracle(oraclePath)); + clusteringQuality.reset(new ClusteringQuality(*oracle, Data)); + for (size_t i = 0; i < Classes.size(); ++i) { + clusteringQuality->AddCluster(Classes[i]); } } - size_t singletons = 0; - for (size_t i = 0; i < classes.size(); ++i) - if (classes[i].size() == 1) - singletons += 1; - INFO("Singleton clusters: " << singletons); - - if (stage(cfg::get().start_stage, hammer_config::HammerStage::SubClustering)) { - size_t nonread = 0; -#if 1 - INFO("Subclustering."); -# pragma omp parallel for shared(nonread, classes, kmer_data) - for (size_t i = 0; i < classes.size(); ++i) { - auto& cluster = classes[i]; + if (stage(Config.start_stage, hammer_config::HammerStage::SubClustering)) { + EstimateGenomicCenters(); -# pragma omp atomic - nonread += subcluster(kmer_data, cluster); + if (clusteringQuality) { + CalcGenomicEstimationQuality(*clusteringQuality); + clusteringQuality->Info(); } -#else - INFO("Assigning centers"); -# pragma omp parallel for shared(nonread, classes, kmer_data) - for (size_t i = 0; i < classes.size(); ++i) { - const auto& cluster = classes[i]; -# pragma omp atomic - nonread += assign(kmer_data, cluster); - } -#endif - INFO("Total " << nonread << " nonread kmers were generated"); - if (cfg::get().debug_mode) { - INFO("Debug mode on. 
Saving K-mer index."); - std::ofstream ofs(path::append_path(cfg::get().working_dir, "cluster.kmdata"), std::ios::binary); - kmer_data.binary_write(ofs); + if (Config.debug_mode) { + SaveKMerData("cluster.kmdata"); } } else { - INFO("Loading K-mer index."); - std::ifstream ifs(path::append_path(cfg::get().working_dir, "cluster.kmdata"), std::ios::binary); - VERIFY(ifs.good()); - kmer_data.binary_read(ifs); - INFO("Total " << kmer_data.size() << " entries were loader"); + LoadKMerData("cluster.kmdata"); } + } -#if 0 - INFO("Starting solid k-mers expansion in " << cfg::get().max_nthreads << " threads."); - while (true) { - Expander expander(kmer_data); - const io::DataSet<> &dataset = cfg::get().dataset; - for (auto I = dataset.reads_begin(), E = dataset.reads_end(); I != E; ++I) { - io::FileReadStream irs(*I, io::PhredOffset); - hammer::ReadProcessor rp(cfg::get().max_nthreads); - rp.Run(irs, expander); - VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced"); - } - INFO("" << expander.changed() << " solid k-mers were generated"); - if (expander.changed() == 0) - break; - } -#endif + NormalClusterModel GetClusterModel() const { return ClusterModel; } -#if 0 + void SaveCenters() { std::ofstream fasta_ofs("centers.fasta"); fasta_ofs << std::fixed << std::setprecision(6) << std::setfill('0'); - std::sort(classes.begin(), classes.end(), UfCmp()); - for (size_t i = 0; i < classes.size(); ++i) { - auto& cluster = classes[i]; - std::sort(cluster.begin(), cluster.end(), CountCmp(kmer_data)); - hammer::HKMer c = center(kmer_data, cluster); - size_t idx = kmer_data.seq_idx(c); - if (kmer_data[idx].kmer == c) { - fasta_ofs << '>' << std::setw(6) << i - << "-cov_" << std::setw(0) << kmer_data[idx].count - << "-qual_" << 1.0 - kmer_data[idx].qual; + std::sort(Classes.begin(), Classes.end(), UfCmp()); + for (size_t i = 0; i < Classes.size(); ++i) { + auto& cluster = Classes[i]; + std::sort(cluster.begin(), cluster.end(), CountCmp(Data)); + hammer::HKMer c = TGenomicHKMersEstimator::Center(Data, cluster); + size_t idx = Data.seq_idx(c); + if (Data[idx].kmer == c) { + fasta_ofs << '>' << std::setw(6) << i << "-cov_" << std::setw(0) + << Data[idx].count << "-qual_" << std::setw(14) + << 1.0 - Data[idx].qual; - if (cluster.size() == 1) + if (cluster.size() == 1) { fasta_ofs << "_singleton"; + } fasta_ofs << '\n' << c << '\n'; } } + } + +#if 0 + void SolidKMerExpansion() { + INFO("Starting solid k-mers expansion in " << Config.max_nthreads << " threads."); + while (true) { + Expander expander(Data); + const io::DataSet<> &dataset = Config.dataset; + for (auto I = dataset.reads_begin(), E = dataset.reads_end(); I != E; ++I) { + io::FileReadStream irs(*I, io::PhredOffset); + hammer::ReadProcessor rp(Config.max_nthreads); + rp.Run(irs, expander); + VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced"); + } + INFO("" << expander.changed() << " solid k-mers were generated"); + if (expander.changed() == 0) + break; + } + } #endif +}; + +}; // namespace correction +}; // namespace hammer + +int main(int argc, char** argv) { + using namespace hammer::correction; + using TCorrector = ReadCorrector; + using SingleReadsCorrector = SingleReadCorrector; + using PairedReadsCorrector = PairedReadCorrector; + + utils::segfault_handler sh; + srand(42); + srandom(42); + + try { + create_console_logger(); + std::string config_file = "hammer-it.cfg"; + if (argc > 1) config_file = argv[1]; + INFO("Starting IonHammer, built from " SPADES_GIT_REFSPEC + ", git revision " SPADES_GIT_SHA1); + INFO("Loading config from " << 
config_file.c_str()); + cfg::create_instance(config_file); + + // hard memory limit + const size_t GB = 1 << 30; + utils::limit_memory(cfg::get().hard_memory_limit * GB); + + KMerData kmerData; + NormalClusterModel clusterModel; + + { + TKMerDataEstimator estimator(kmerData, cfg::get()); + estimator.Estimate(); + clusterModel = estimator.GetClusterModel(); + } + + GammaPoissonLikelihoodCalcer::Factory calcerFactory(kmerData); INFO("Correcting reads."); using namespace hammer::correction; - SingleReadCorrector::NoDebug debug_pred; - SingleReadCorrector::SelectAll select_pred; + typename SingleReadsCorrector::NoDebug debug_pred; + typename SingleReadsCorrector::SelectAll select_pred; const auto& dataset = cfg::get().dataset; io::DataSet<> outdataset; size_t ilib = 0; - for (auto it = dataset.library_begin(), et = dataset.library_end(); it != et; ++it, ++ilib) { + for (auto it = dataset.library_begin(), et = dataset.library_end(); + it != et; ++it, ++ilib) { const auto& lib = *it; auto outlib = lib; outlib.clear(); size_t iread = 0; // First, correct all the paired FASTQ files - for (auto I = lib.paired_begin(), E = lib.paired_end(); I != E; ++I, ++iread) { - if (path::extension(I->first) == ".bam" || path::extension(I->second) == ".bam") - continue; + for (auto I = lib.paired_begin(), E = lib.paired_end(); I != E; + ++I, ++iread) { + if (fs::extension(I->first) == ".bam" || + fs::extension(I->second) == ".bam") { + continue; + } - INFO("Correcting pair of reads: " << I->first << " and " << I->second); + INFO("Correcting pair of reads: " << I->first << " and " << I->second); - std::string usuffix = std::to_string(ilib) + "_" + - std::to_string(iread) + ".cor.fasta"; + std::string usuffix = + std::to_string(ilib) + "_" + std::to_string(iread) + ".cor.fasta"; - std::string outcorl = path::append_path(cfg::get().output_dir, path::basename(I->first) + usuffix); - std::string outcorr = path::append_path(cfg::get().output_dir, path::basename(I->second) + usuffix); + std::string outcorl = fs::append_path( + cfg::get().output_dir, fs::basename(I->first) + usuffix); + std::string outcorr = fs::append_path( + cfg::get().output_dir, fs::basename(I->second) + usuffix); - io::PairedOutputSequenceStream ors(outcorl, outcorr); + io::PairedOutputSequenceStream ors(outcorl, outcorr); - io::SeparatePairedReadStream irs(I->first, I->second, 0, false, false); - PairedReadCorrector read_corrector(kmer_data, debug_pred, select_pred); - hammer::ReadProcessor(cfg::get().max_nthreads).Run(irs, read_corrector, ors); + io::SeparatePairedReadStream irs(I->first, I->second, 0, false, false); + PairedReadsCorrector read_corrector(kmerData, calcerFactory, debug_pred, + select_pred); + hammer::ReadProcessor(cfg::get().max_nthreads) + .Run(irs, read_corrector, ors); - outlib.push_back_paired(outcorl, outcorr); + outlib.push_back_paired(outcorl, outcorr); } // Second, correct all the single FASTQ files - for (auto I = lib.single_begin(), E = lib.single_end(); I != E; ++I, ++iread) { - if (path::extension(*I) == ".bam") - continue; - - INFO("Correcting " << *I); + for (auto I = lib.single_begin(), E = lib.single_end(); I != E; + ++I, ++iread) { + if (fs::extension(*I) == ".bam") { + continue; + } - std::string usuffix = std::to_string(ilib) + "_" + - std::to_string(iread) + ".cor.fasta"; + INFO("Correcting " << *I); - std::string outcor = path::append_path(cfg::get().output_dir, path::basename(*I) + usuffix); - io::osequencestream ors(outcor); + std::string usuffix = + std::to_string(ilib) + "_" + std::to_string(iread) + 
".cor.fasta"; - io::FileReadStream irs(*I, io::PhredOffset); - SingleReadCorrector read_corrector(kmer_data, debug_pred, select_pred); - hammer::ReadProcessor(cfg::get().max_nthreads).Run(irs, read_corrector, ors); + std::string outcor = fs::append_path(cfg::get().output_dir, + fs::basename(*I) + usuffix); + io::OutputSequenceStream ors(outcor); + + io::FileReadStream irs(*I, io::PhredOffset); + SingleReadsCorrector read_corrector(kmerData, calcerFactory, debug_pred, + select_pred); + hammer::ReadProcessor(cfg::get().max_nthreads) + .Run(irs, read_corrector, ors); - outlib.push_back_single(outcor); + outlib.push_back_single(outcor); } // Finally, correct all the BAM stuff in a row - for (auto I = lib.reads_begin(), E = lib.reads_end(); I != E; ++I, ++iread) { - if (path::extension(*I) != ".bam") - continue; + for (auto I = lib.reads_begin(), E = lib.reads_end(); I != E; + ++I, ++iread) { + if (fs::extension(*I) != ".bam") { + continue; + } INFO("Correcting " << *I); - std::string usuffix = std::to_string(ilib) + "_" + - std::to_string(iread) + ".cor.fasta"; + std::string usuffix = + std::to_string(ilib) + "_" + std::to_string(iread) + ".cor.fasta"; - std::string outcor = path::append_path(cfg::get().output_dir, path::basename(*I) + usuffix); - io::osequencestream ors(outcor); + std::string outcor = fs::append_path(cfg::get().output_dir, + fs::basename(*I) + usuffix); + io::OutputSequenceStream ors(outcor); BamTools::BamReader bam_reader; bam_reader.Open(*I); auto header = bam_reader.GetHeader(); bam_reader.Close(); - SingleReadCorrector read_corrector(kmer_data, &header, debug_pred, select_pred); + SingleReadsCorrector read_corrector(kmerData, calcerFactory, &header, + debug_pred, select_pred); io::UnmappedBamStream irs(*I); - hammer::ReadProcessor(cfg::get().max_nthreads).Run(irs, read_corrector, ors); + hammer::ReadProcessor(cfg::get().max_nthreads) + .Run(irs, read_corrector, ors); outlib.push_back_single(outcor); } @@ -305,20 +414,12 @@ } cfg::get_writable().dataset = outdataset; - std::string fname = path::append_path(cfg::get().output_dir, "corrected.yaml"); + std::string fname = fs::append_path(cfg::get().output_dir, "corrected.yaml"); INFO("Saving corrected dataset description to " << fname); cfg::get_writable().dataset.save(fname); - -#if 0 - std::sort(classes.begin(), classes.end(), UfCmp()); - for (size_t i = 0; i < classes.size(); ++i) { - auto& cluster = classes[i]; - std::sort(cluster.begin(), cluster.end(), CountCmp(kmer_data)); - dump(kmer_data, cluster); - } -#endif } catch (std::bad_alloc const& e) { - std::cerr << "Not enough memory to run IonHammer. " << e.what() << std::endl; + std::cerr << "Not enough memory to run IonHammer. " << e.what() + << std::endl; return EINTR; } catch (std::exception const& e) { std::cerr << "Exception caught " << e.what() << std::endl; diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/normal_quality_model.cpp spades-3.11.1+dfsg/src/projects/ionhammer/normal_quality_model.cpp --- spades-3.10.1+dfsg/src/projects/ionhammer/normal_quality_model.cpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/normal_quality_model.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,14 @@ +// +// Created by Vasiliy Ershov on 27/03/2017. 
+//
+
+#include "normal_quality_model.hpp"
+
+using namespace n_normal_model;
+
+std::vector<double> NormalClusterModel::left_likelihoods_ = {
+    -9.98, -4.95, -3.95, -3.5, -3, -2.5, -2.2, -2};
+std::vector<double> NormalClusterModel::equal_likelihoods_ = {
+    -0.001, -0.001, -0.019, -0.05, -0.07, -0.15, -0.2, -0.25};
+std::vector<double> NormalClusterModel::right_likelihoods_ = {
+    -5.99, -5.95, -5, -4.35, -3.8, -3, -2.8, -2.5};
\ No newline at end of file
diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/normal_quality_model.hpp spades-3.11.1+dfsg/src/projects/ionhammer/normal_quality_model.hpp
--- spades-3.10.1+dfsg/src/projects/ionhammer/normal_quality_model.hpp 1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/ionhammer/normal_quality_model.hpp 2017-09-28 09:05:13.000000000 +0000
@@ -0,0 +1,490 @@
+//
+// Created by Vasiliy Ershov on 08/11/2016.
+//
+
+#ifndef PROJECT_NORMAL_QUALITY_MODEL_HPP
+#define PROJECT_NORMAL_QUALITY_MODEL_HPP
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "config_struct.hpp"
+#include "kmer_data.hpp"
+#include "quality_thresholds_estimator.h"
+#include "thread_utils.h"
+#include "valid_hkmer_generator.hpp"
+//
+
+namespace n_normal_model {
+
+struct QualityTransform {
+  double bias_;
+
+  QualityTransform(double bias = 60.0) : bias_(bias) {}
+
+  double Apply(double quality, double count) const {
+    return quality / (count + 60);
+  }
+};
+
+class NormalDistribution {
+ private:
+  double mean_;
+  double sigma_sqr_;
+
+ public:
+  NormalDistribution(const NormalDistribution&) = default;
+
+  NormalDistribution& operator=(const NormalDistribution&) = default;
+
+  NormalDistribution(const double mean = 0, const double sigma = 1)
+      : mean_(mean), sigma_sqr_(sigma) {}
+
+  inline double GetMean() const { return mean_; }
+
+  inline double GetSigmaSqr() const { return sigma_sqr_; }
+
+  double LogLikelihood(double x) const {
+    return -0.5 *
+           ((x - mean_) * (x - mean_) / sigma_sqr_ + log(2 * M_PI * sigma_sqr_));
+  }
+
+  double LogLikelihoodFromStats(const double sum,
+                                const double sum2,
+                                const double weight) const {
+    return -0.5 * ((sum2 - 2 * sum * mean_ + weight * mean_ * mean_) / sigma_sqr_ +
+                   weight * log(2 * M_PI * sigma_sqr_));
+  }
+
+  static NormalDistribution FromStats(const double sum,
+                                      const double sum2,
+                                      const double weight) {
+    const double mu = sum / weight;
+    const double var = sum2 / weight - mu * mu;
+    return NormalDistribution(mu, var);
+  }
+};
+
+class NormalMixture {
+ private:
+  NormalDistribution first_;
+  NormalDistribution second_;
+  double first_weight_;
+
+ public:
+  NormalMixture() : first_weight_(0) {}
+
+  NormalMixture(const NormalDistribution& first,
+                const NormalDistribution& second,
+                double weight)
+      : first_(first), second_(second), first_weight_(weight) {}
+
+  const NormalDistribution& GetFirst() const { return first_; }
+
+  const NormalDistribution& GetSecond() const { return second_; }
+
+  double GetFirstWeight() const { return first_weight_; }
+
+  double LogLikelihood(double x) const {
+    return log(first_weight_ * exp(first_.LogLikelihood(x)) +
+               (1 - first_weight_) * exp(second_.LogLikelihood(x)));
+  }
+
+  double FirstComponentPosterior(double x) const {
+    double firstLL = first_.LogLikelihood(x) + log(first_weight_);
+    double secondLL = second_.LogLikelihood(x) + log(1.0 - first_weight_);
+    const double expDiff = exp(secondLL - firstLL);
+
+    return std::isfinite(expDiff) ?
-log(1.0 + exp(secondLL - firstLL)) + : firstLL - secondLL; + } +}; + +class Binarizer { + private: + std::vector borders_; + + public: + Binarizer() { + for (int i = 17; i < 30; ++i) { + borders_.push_back(i); + } + } + + Binarizer(const vector& borders) : borders_(borders) {} + + int GetBin(double value) const { + uint index = 0; + while (index < borders_.size() && value > borders_[index]) { + ++index; + } + return index; + } + + size_t GetBinCount() const { return borders_.size() + 1; } + + double GetBorder(int bin) { + --bin; + bin = std::min(bin, (const int)(borders_.size() - 1)); + if (bin < 0) { + return 0; + } + return borders_[bin]; + } +}; + +class NormalClusterModel { + private: + std::vector mixtures_; + Binarizer binarizer_; + std::vector median_qualities_; + QualityTransform trans_; + double lower_quality_threshold_; + + static std::vector left_likelihoods_; + static std::vector equal_likelihoods_; + static std::vector right_likelihoods_; + + public: + NormalClusterModel() {} + + NormalClusterModel(const std::vector& mixtures, + const Binarizer& binarizer, + const std::vector& medianQualities, + const QualityTransform& trans) + : mixtures_(mixtures), + binarizer_(binarizer), + median_qualities_(medianQualities), + trans_(trans) { + lower_quality_threshold_ = cfg::get().noise_filter_count_threshold; // threshold >= 10 ? 1 : 0; + } + + NormalClusterModel(const NormalClusterModel& other) = default; + + NormalClusterModel& operator=(const NormalClusterModel&) = default; + + bool NeedSubcluster(const hammer::KMerStat& stat) const { + return stat.count > 15 && GenomicLogLikelihood(stat) > -0.0001; + } + + double StatTransform(const hammer::KMerStat& stat) const { + return trans_.Apply(stat.qual, stat.count); + } + + double GenomicLogLikelihood(const hammer::KMerStat& stat) const { + return GenomicLogLikelihood(binarizer_.GetBin((double)GetKmerBinIdx(stat.kmer)), + stat.qual, stat.count); + } + + bool IsHighQuality(const hammer::KMerStat& stat) const { + const auto bin = binarizer_.GetBin((double)GetKmerBinIdx(stat.kmer)); + return trans_.Apply(stat.qual, stat.count) <= median_qualities_[bin]; + } + + double GenomicLogLikelihood(int bin, double quality, double count) const { + if (count <= lower_quality_threshold_) { + return -1e5; + } + const double x = trans_.Apply(quality, count); + return mixtures_[bin].FirstComponentPosterior(x); + } + + static size_t GetKmerBinIdx(const hammer::HKMer& kmer) { + if (kmer.size() > 21) { + return 1 + kmer.max_run_length(); + } else { + return 0; + } + } + + static double ErrorLogLikelihood(int from, int to) { + int diff = std::abs(from - to); + from = std::max(from, 0); + --from; + int sign = from > to ? -1 : 1; + from = std::min((int)equal_likelihoods_.size() - 1, from); + if (diff == 0) { + return equal_likelihoods_[from]; + } + if (sign == -1) { + return left_likelihoods_[from] * diff; + } + return right_likelihoods_[from] * diff; + } +}; + +class NormalMixtureEstimator { + private: + uint num_threads_; + size_t max_iterations_; + bool calc_likelihoods_; + + private: + std::vector BuildPriors(const std::vector& observations) const { + double threshold = SimpleTwoClassClustering::SimpleThresholdEstimation( + observations.begin(), observations.end()) + .split_; + + std::vector priors(observations.size()); + +#pragma omp parallel for num_threads(num_threads_) + for (size_t i = 0; i < observations.size(); ++i) { + priors[i] = observations[i] <= threshold ? 
1 : 0; + } + + return priors; + } + + struct Stats { + double sum_left_ = 0; + double sum2_left_ = 0; + double weight_left_ = 0; + double sum_right_ = 0; + double sum2_right_ = 0; + + Stats& operator+=(const Stats& other) { + if (this != &other) { + sum_left_ += other.sum_left_; + sum2_left_ += other.sum2_left_; + sum_right_ += other.sum_right_; + sum2_right_ += other.sum2_right_; + weight_left_ += other.weight_left_; + } + return *this; + } + }; + + public: + NormalMixtureEstimator(uint num_threads, + size_t max_iterations, + bool calc_likelihood) + : num_threads_(num_threads), + max_iterations_(max_iterations), + calc_likelihoods_(calc_likelihood) {} + + NormalMixture Estimate(std::vector& observations) const { + std::sort(observations.begin(), observations.end(), std::greater()); + observations.resize(observations.size()); + std::reverse(observations.begin(), observations.end()); + + std::vector priors = BuildPriors(observations); + + NormalMixture mixture; + + for (size_t iter = 0; iter < max_iterations_; ++iter) { + auto stats = + n_computation_utils::ParallelStatisticsCalcer(num_threads_) + .Calculate(observations.size(), + [&]() -> Stats { return Stats(); }, + [&](Stats& stat, size_t k) { + const double x = observations[k]; + const double w = priors[k]; + stat.sum2_left_ += w * x * x; + stat.sum_left_ += w * x; + stat.weight_left_ += w; + stat.sum2_right_ += (1 - w) * x * x; + stat.sum_right_ += (1 - w) * x; + }); + + mixture = + NormalMixture(NormalDistribution::FromStats( + stats.sum_left_, stats.sum2_left_, stats.weight_left_), + NormalDistribution::FromStats( + stats.sum_right_, stats.sum2_right_, + (double)observations.size() - stats.weight_left_), + stats.weight_left_ / (double)observations.size()); + +// expectation +#pragma omp parallel for num_threads(num_threads_) + for (size_t i = 0; i < observations.size(); ++i) { + priors[i] = exp(mixture.FirstComponentPosterior(observations[i])); + } + + if (calc_likelihoods_) { + double ll = 0; + for (size_t i = 0; i < observations.size(); ++i) { + const double x = observations[i]; + ll += mixture.LogLikelihood(x); + } + INFO("LogLikelihood: " << ll); + } + + if (iter == 0 || iter == (max_iterations_ - 1)) { + const double llFirst = mixture.GetFirst().LogLikelihoodFromStats( + stats.sum_left_, stats.sum2_left_, stats.weight_left_); + INFO("Likelihood first: " << llFirst); + const double llSecond = mixture.GetSecond().LogLikelihoodFromStats( + stats.sum_right_, stats.sum2_right_, + (double)observations.size() - stats.weight_left_); + INFO("Likelihood second: " << llSecond); + INFO("First weights: " << mixture.GetFirstWeight()); + } + } + return mixture; + }; +}; + +// this class estimate prior distribution. 
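NormalMixtureEstimator::Estimate above is a textbook EM fit of a two-component one-dimensional normal mixture: responsibilities are initialized from SimpleThresholdEstimation's split, the M-step refits both components from weighted moment sums (as NormalDistribution::FromStats does), and the E-step recomputes each observation's posterior of belonging to the first, genomic component. A self-contained sketch of the same loop on toy data, with OpenMP and the SPAdes types stripped out; all names here are illustrative only.

#include <cmath>
#include <cstdio>
#include <vector>

struct Normal {
  double mean, var;
  double LogLikelihood(double x) const {
    return -0.5 * ((x - mean) * (x - mean) / var + std::log(2 * M_PI * var));
  }
};

int main() {
  std::vector<double> xs = {0.05, 0.1, 0.15, 0.2, 0.9, 0.95, 1.0, 1.1};
  // Hard initial responsibilities from a crude threshold, as BuildPriors does.
  std::vector<double> prior(xs.size());
  for (size_t i = 0; i < xs.size(); ++i) prior[i] = xs[i] <= 0.5 ? 1.0 : 0.0;

  Normal first{0, 1}, second{0, 1};
  double w1 = 0.5;
  for (int iter = 0; iter < 20; ++iter) {
    // M-step: weighted moment sums, then a FromStats-style refit.
    double s1 = 0, ss1 = 0, n1 = 0, s2 = 0, ss2 = 0;
    for (size_t i = 0; i < xs.size(); ++i) {
      s1 += prior[i] * xs[i];
      ss1 += prior[i] * xs[i] * xs[i];
      n1 += prior[i];
      s2 += (1 - prior[i]) * xs[i];
      ss2 += (1 - prior[i]) * xs[i] * xs[i];
    }
    const double n2 = (double)xs.size() - n1;
    first = {s1 / n1, ss1 / n1 - (s1 / n1) * (s1 / n1)};
    second = {s2 / n2, ss2 / n2 - (s2 / n2) * (s2 / n2)};
    w1 = n1 / (double)xs.size();
    // E-step: posterior of the first component for every observation.
    for (size_t i = 0; i < xs.size(); ++i) {
      const double ll1 = first.LogLikelihood(xs[i]) + std::log(w1);
      const double ll2 = second.LogLikelihood(xs[i]) + std::log(1 - w1);
      prior[i] = 1.0 / (1.0 + std::exp(ll2 - ll1));
    }
  }
  std::printf("first %.3f/%.4f  second %.3f/%.4f  weight %.2f\n",
              first.mean, first.var, second.mean, second.var, w1);
}

The posterior update is the plain-probability form of NormalMixture::FirstComponentPosterior, which the code above keeps in log space and exponentiates when storing priors.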
+class ModelEstimator { + private: + const KMerData& data_; + uint num_threads_; + size_t max_iterations_; + bool is_calc_likelihood_; + + public: + ModelEstimator(const KMerData& data, + uint num_threads = 16, + size_t maxIterations = 40, + bool calc_likelihood = false) + : data_(data), + num_threads_(num_threads), + max_iterations_(maxIterations), + is_calc_likelihood_(calc_likelihood) {} + + NormalClusterModel Estimate( + const std::vector >& clusters) { + QualityTransform trans; + + std::vector cluster_center; + { + cluster_center.resize(clusters.size()); +#pragma omp parallel for num_threads(num_threads_) + for (uint i = 0; i < clusters.size(); ++i) { + auto& cluster = clusters[i]; + + double best_qual = + trans.Apply(data_[cluster[0]].qual, data_[cluster[0]].count); + size_t bestIdx = cluster[0]; + + for (auto idx : cluster) { + const auto qual = trans.Apply(data_[idx].qual, data_[idx].count); + if (qual < best_qual || + (qual == best_qual && + data_[idx].kmer.size() < data_[bestIdx].kmer.size())) { + best_qual = qual; + bestIdx = idx; + } + cluster_center[i] = bestIdx; + } + } + } + + std::vector > qualities; + qualities.reserve(16); + const size_t sampleMaxThreshold = (size_t)1e9; + const size_t min_sample_size = (size_t)1e4; + + { + double skip_threshold = cfg::get().noise_filter_count_threshold; // threshold >= 10 ? 1 : 0; + + for (size_t i = 0; i < cluster_center.size(); ++i) { + const auto& stat = data_[cluster_center[i]]; + + if (stat.count <= skip_threshold) { + continue; + } + const size_t bin = NormalClusterModel::GetKmerBinIdx(stat.kmer); + + if (bin >= qualities.size()) { + qualities.resize(bin + 1); + } + + if (qualities[bin].size() > sampleMaxThreshold) { + continue; + } + auto trans_qual = trans.Apply(stat.qual, stat.count); + qualities[bin].push_back(trans_qual); + } + } + + std::vector models; + std::vector borders; + std::vector median_qualities; + + size_t total_count = 0; + for (const auto& qual : qualities) { + total_count += qual.size(); + } + assert(qualities[1].size() == 0); + + { + auto model = NormalMixtureEstimator(num_threads_, max_iterations_, is_calc_likelihood_).Estimate(qualities[0]); + + const double median_quality = FindHighQualityThreshold(qualities[0], model); + INFO("For kmer length <= 21"); + INFO("Median quality " << median_quality); + INFO("Sample size " << qualities[0].size()); + INFO("Genomic dist: " << model.GetFirst().GetMean() << " " + << model.GetFirst().GetSigmaSqr()); + INFO("NonGenomic dist: " << model.GetSecond().GetMean() << " " + << model.GetSecond().GetSigmaSqr()); + models.push_back(model); + median_qualities.push_back(median_quality); + borders.push_back(0); + total_count -= qualities[0].size(); + } + + const auto len_limit = std::min(qualities.size(), 7UL); + for (uint max_run_len = 2; max_run_len < len_limit; ++max_run_len) { + if (total_count < min_sample_size) { + break; + } + + const size_t bin = max_run_len + 1; + auto bin_qualities = qualities[bin]; + total_count -= bin_qualities.size(); + + if (bin_qualities.size() < min_sample_size) { + if (bin + 1 < qualities.size()) { + qualities[bin + 1].insert(qualities[bin + 1].end(), + bin_qualities.begin(), + bin_qualities.end()); + } + continue; + } + + auto model = NormalMixtureEstimator(num_threads_, max_iterations_, is_calc_likelihood_).Estimate(bin_qualities); + + const double median_quality = FindHighQualityThreshold(bin_qualities, model); + + INFO("Sample size " << bin_qualities.size()); + INFO("Median quality " << median_quality); + INFO("For max run length >= " << 
max_run_len); + INFO("Genomic dist: " << model.GetFirst().GetMean() << " " + << model.GetFirst().GetSigmaSqr()); + INFO("NonGenomic dist: " << model.GetSecond().GetMean() << " " + << model.GetSecond().GetSigmaSqr()); + median_qualities.push_back(median_quality); + models.push_back(model); + borders.push_back((double)bin); + } + borders.resize(borders.size() - 1); + + return NormalClusterModel(models, Binarizer(borders), median_qualities, + trans); + } + + double FindHighQualityThreshold(const std::vector& bin_quality, + const NormalMixture& model) const { + std::vector good_samples; + good_samples.reserve(bin_quality.size()); + for (size_t i = 0; i < bin_quality.size(); ++i) { + if (model.FirstComponentPosterior(bin_quality[i]) > -0.69) { + good_samples.push_back(bin_quality[i]); + } + } + + const size_t quantile = (size_t)((double)good_samples.size() * cfg::get().dist_one_subcluster_alpha); + std::nth_element(good_samples.begin(), good_samples.begin() + quantile, + good_samples.end()); + return good_samples[quantile]; + } +}; + +} // namespace NNormalModel + +#endif // PROJECT_NORMAL_QUALITY_MODEL_HPP diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/penalty_estimator.hpp spades-3.11.1+dfsg/src/projects/ionhammer/penalty_estimator.hpp --- spades-3.10.1+dfsg/src/projects/ionhammer/penalty_estimator.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/penalty_estimator.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,326 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#ifndef __HAMMER_IT_PENALTY_ESTIMATOR_HPP__ +#define __HAMMER_IT_PENALTY_ESTIMATOR_HPP__ + +#include "HSeq.hpp" +#include "config_struct.hpp" +#include "consensus.hpp" +#include "flow_space_read.hpp" +#include "hkmer_distance.hpp" +#include "valid_hkmer_generator.hpp" + +#include +#include +#include + +#include +#include +#include "seqeval/BaseHypothesisEvaluator.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 1 +#include +#include +#endif + +#include +#include +#include "gamma_poisson_model.hpp" +#include "read_corrector_structs_new.h" + +namespace hammer { +namespace correction { + +struct Interval { + size_t left_; + size_t right_; +}; + +class GammaPoissonLikelihoodCalcer { + private: + const n_gamma_poisson_model::GammaDistribution prior_; + n_gamma_poisson_model::PoissonGammaDistribution count_distribution_; + double upper_quantile_; + double lower_quantile_; + size_t noise_quantiles_lower_; + size_t noise_quantile_upper_; + double correction_penalty_; + double bad_kmer_penalty_; + const KMerData& data_; + + public: + class PenaltyState { + friend class GammaPoissonLikelihoodCalcer; + + private: + double likelihood_ = 0; + size_t last_correction_distances_ = 0; + size_t hkmer_distance_to_read_ = 0; + HKMer read_kmer_; + + public: + double Penalty() const { return likelihood_; } + }; + + public: + class Factory { + private: + KMerData& data_; + + public: + Factory(KMerData& data) : data_(data) {} + + GammaPoissonLikelihoodCalcer operator()(const std::string& read) const { + ValidHKMerGenerator generator(read.data(), nullptr, + read.length()); + + std::vector counts; + double sum_count = 0; + double sum_weight = 0; + + while 
(generator.HasMore()) { + size_t idx = data_.checking_seq_idx(generator.kmer()); + + if (idx != -1ULL) { + const auto& kmer_stat = data_[idx]; + if (kmer_stat.skip()) { + counts.push_back(data_[idx].count); + } + const double p = exp(kmer_stat.posterior_genomic_ll); + sum_count += p * data_[idx].count; + sum_weight += p; + } + generator.Next(); + } + + n_gamma_poisson_model::GammaDistribution read_prior = + [&]() -> n_gamma_poisson_model::GammaDistribution { + if (counts.size() < 10) { + return n_gamma_poisson_model::GammaDistribution(sum_count + 0.1, + sum_weight + 0.1); + } else { + return n_gamma_poisson_model::TClusterModelEstimator::EstimatePrior( + counts); + } + }(); + + return GammaPoissonLikelihoodCalcer(read_prior, data_); + } + }; + + using PenaltyCalcerFactory = Factory; + + GammaPoissonLikelihoodCalcer( + const n_gamma_poisson_model::GammaDistribution& prior, const KMerData& data) + : prior_(prior), count_distribution_(prior_), data_(data) { + upper_quantile_ = count_distribution_.Quantile(1.0 - cfg::get().count_dist_skip_quantile); + lower_quantile_ = count_distribution_.Quantile(cfg::get().count_dist_skip_quantile); + + const double eps = cfg::get().count_dist_eps; + noise_quantiles_lower_ = (size_t)max(count_distribution_.Quantile(eps), 1.0); + noise_quantile_upper_ = (size_t)count_distribution_.Quantile(1.0 - eps); + + correction_penalty_ = cfg::get().correction_penalty; + bad_kmer_penalty_ = cfg::get().bad_kmer_penalty; + assert(lower_quantile_ < upper_quantile_); + } + + inline void UpdateInitial(PenaltyState& state, const IonEvent& event, + const hammer::KMerStat* const) const { + state.read_kmer_ <<= event.FixedHRun(); + } + + inline void Update(PenaltyState& state, const IonEvent& event, + const hammer::KMerStat* const last_kmer_stats) const { + assert(event.fixed_size_ >= 0); + + if (std::isinf(state.likelihood_)) { + return; + } + + const size_t last_kmer_count = last_kmer_stats ? last_kmer_stats->count : 0; + + const int bits = 4; + const uint dist = + min((const uint)std::abs(event.fixed_size_ - event.overserved_size_), + (uint)(1 << bits) - 1); + + { + state.hkmer_distance_to_read_ += dist; + state.hkmer_distance_to_read_ -= + (state.last_correction_distances_ >> (bits * (hammer::K - 1))) & + ((1 << bits) - 1); + state.last_correction_distances_ = + ((state.last_correction_distances_ << bits) | (dist)); + state.read_kmer_ <<= event.ObservedHRun(); + } + + if (state.hkmer_distance_to_read_ > hammer::K / 2) { + state.likelihood_ = -std::numeric_limits::infinity(); + return; + } + + const bool is_good = last_kmer_stats ? 
last_kmer_stats->good() : false; + + if (!is_good || (dist)) { + const size_t cnt = min(max(noise_quantiles_lower_, last_kmer_count), noise_quantile_upper_); + + // state.Likelihood += dist * log(Model.ErrorRate(event.FixedSize)); + state.likelihood_ += (double)state.hkmer_distance_to_read_ * correction_penalty_; + state.likelihood_ += count_distribution_.LogLikelihood(cnt); + } + + if (!is_good) { + state.likelihood_ += bad_kmer_penalty_; + } else { + state.likelihood_ += std::max((double)(last_kmer_stats->posterior_genomic_ll), bad_kmer_penalty_); + } + } + + inline bool Skip(const HKMer& kmer) const { + size_t idx = data_.checking_seq_idx(kmer); + if (idx == -1ULL) { + return false; + } + const auto& stat = data_[idx]; + + return stat.good() && (stat.count <= upper_quantile_) && (stat.count >= lower_quantile_) && !stat.dist_one_subcluster; + } + + inline bool IsGood(const HKMer& kmer) const { + size_t idx = data_.checking_seq_idx(kmer); + if (idx == -1ULL) { + return false; + } + + return data_[idx].good(); + } + + inline std::function Good() const { + return [this](const HKMer& hkMer) { return this->IsGood(hkMer); }; + } + + static PenaltyState CreateState(const bool, const uint) { + return PenaltyState(); + } + + std::string TrimLeft(const std::string& read) const { + + ValidHKMerGenerator generator(read.data(), nullptr, read.size()); + size_t offset = 0; + while (generator.HasMore()) { + const auto& hkmer = generator.kmer(); + if (IsGood(hkmer)) { + break; + } + offset += hkmer[0].len; + generator.Next(); + } + const auto from = offset;//generator.pos() - generator.kmer().size(); + if (from > 0) { + if (read[from - 1] == read[from]) + { + assert(read[from - 1] != read[from]); + } + } + return read.substr(from); + } + + std::string TrimBadQuality(const std::string& read) const { + return TrimLeft(ReverseComplement(TrimLeft(ReverseComplement(read)))); + } + + inline Interval SolidIsland( + ValidHKMerGenerator& generator, + std::function is_good_predicate) const { + size_t bestLeft = (size_t)-1ULL; + size_t bestRight = (size_t)-1ULL; + size_t solidLength = 0; + + size_t leftPos = 0; + size_t rightPos = 0; + + while (generator.HasMore()) { + const auto& hkmer = generator.kmer(); + bool isGood = is_good_predicate(hkmer); + + if (isGood) { + const auto lastHRunSize = hkmer[K - 1].len; + const auto hkmerSize = hkmer.size(); + const auto hkmerStartPosition = generator.pos() - hkmerSize; + const auto prevEndPosition = generator.pos() - lastHRunSize; + + if (prevEndPosition != rightPos) { + leftPos = hkmerStartPosition; + } + rightPos = generator.pos(); + + if (rightPos - leftPos > solidLength) { + bestLeft = leftPos; + bestRight = rightPos; + solidLength = rightPos - leftPos; + } + } + generator.Next(); + } + return {bestLeft, bestRight}; + } + + inline Interval SolidIslandGood(ValidHKMerGenerator& generator) const { + return SolidIsland(generator, + [&](const HKMer& kmer) -> bool { return IsGood(kmer); }); + } + + inline Interval SolidIslandConservative( + ValidHKMerGenerator& generator) const { + return SolidIsland(generator, + [&](const HKMer& kmer) -> bool { return Skip(kmer); }); + } + + inline Interval SolidIsland(const std::string& read) const { + { + ValidHKMerGenerator generator(&read[0], nullptr, read.size()); + auto conservative = SolidIslandConservative(generator); + if (conservative.left_ != conservative.right_) { + return conservative; + } + } + { + ValidHKMerGenerator generator(&read[0], nullptr, read.size()); + return SolidIslandGood(generator); + } + } + + inline Interval 
SolidIsland(const io::SingleRead& read) const { + { + ValidHKMerGenerator generator(read); + auto conservative = SolidIslandConservative(generator); + if (conservative.left_ != conservative.right_) { + return conservative; + } + } + { + ValidHKMerGenerator generator(read); + return SolidIslandGood(generator); + } + } +}; + +}; // namespace correction +}; // namespace hammer +#endif diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/quality_metrics.cpp spades-3.11.1+dfsg/src/projects/ionhammer/quality_metrics.cpp --- spades-3.10.1+dfsg/src/projects/ionhammer/quality_metrics.cpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/quality_metrics.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,5 @@ +// +// Created by Vasiliy Ershov on 10/07/16. +// + +#include "quality_metrics.h" diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/quality_metrics.h spades-3.11.1+dfsg/src/projects/ionhammer/quality_metrics.h --- spades-3.10.1+dfsg/src/projects/ionhammer/quality_metrics.h 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/quality_metrics.h 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,189 @@ +// +// Created by Vasiliy Ershov on 10/07/16. +// + +#ifndef PROJECT_QUALITY_METRICS_H +#define PROJECT_QUALITY_METRICS_H + +#include "kmer_data.hpp" +#include "reference.h" +#include "subcluster.hpp" + +namespace hammer { + +struct TKmerQualitySample { + double quality_ = 0; + double posterior_ = 0; + size_t count_ = 0; + size_t idx_ = 0; + + TKmerQualitySample(double quality, double posterior, size_t count, size_t idx) + : quality_(quality), posterior_(posterior), count_(count), idx_(idx) {} +}; + +class TKmerQualitySamples { + private: + std::vector Samples; + + public: + void Add(const TKmerQualitySample& sample) { Samples.push_back(sample); } + + void PrintInfo(const std::string& message) const { + if (Samples.size() == 0) { + return; + } + + std::vector quality; + for (const auto& sample : Samples) { + quality.push_back(sample.quality_); + } + + double sum = 0; + double sum2 = 0; + for (double q : quality) { + sum += q; + sum2 += q * q; + } + double mean = sum / ((double)quality.size()); + double sd = sum2 / ((double)quality.size()) - mean * mean; + + std::sort(quality.begin(), quality.end()); + + const size_t quantile99 = (size_t)((double)quality.size() * 0.99); + const size_t quantile001 = (size_t)((double)quality.size() * 0.001); + const auto quantile01 = (size_t)((double)quality.size() * 0.01); + const auto quantile999 = (size_t)((double)quality.size() * 0.999); + INFO(message << "\nmean\tmedian\tsd\t0.01\t0.99\t0.001\t0.999\n" + << mean << "\t" << quality[quality.size() / 2] << "\t" << sd + << "\t" << quality[quantile01] << "\t" + << quality[quantile99] << "\t" + << quality[quantile001] << "\t" + << quality[quantile999]); + } + + std::vector::const_iterator begin() { + return Samples.begin(); + } + + std::vector::const_iterator end() { + return Samples.end(); + } +}; + +class ClusteringQuality { + private: + const TGenomReferenceOracle& oracle_; + const KMerData& data_; + + HKMerSet singleton_kmers_; + HKMerSet non_singleton_kmers_; + HKMerSet center_cluster_kmers_; + + HKMerSet good_kmers_; + HKMerSet bad_kmers_; + + TKmerQualitySamples genomic_centers_; + TKmerQualitySamples non_genomic_centers_; + + private: + static inline void AddKMer(const HKMer& kmer, HKMerSet& set) { + set.insert(kmer); + // set.insert(!kmer); + } + + void AddSingleton(const std::vector& indices) { + assert(indices.size() == 1); + const auto& kmer = 
data_[indices[0]].kmer; + AddKMer(kmer, singleton_kmers_); + } + + void AddNonSingleton(const std::vector& indices) { + for (auto idx : indices) { + AddKMer(data_[idx].kmer, non_singleton_kmers_); + } + } + + public: + ClusteringQuality(const TGenomReferenceOracle& oracle, + const KMerData& kMerData) + : oracle_(oracle), data_(kMerData) {} + + void AddCluster(const std::vector& indices) { + HKMer center; + if (indices.size() == 1) { + AddSingleton(indices); + center = data_[indices[0]].kmer; + } else { + AddNonSingleton(indices); + center = TGenomicHKMersEstimator::Center(data_, indices); + } + AddKMer(center, center_cluster_kmers_); + } + + void AddKMer(size_t idx) { + const KMerStat& kmerStat = data_[idx]; + const auto& kmer = kmerStat.kmer; + bool isGood = kmerStat.good(); + +#pragma omp critical + { + if (isGood) { + AddKMer(kmer, good_kmers_); + } else { + AddKMer(kmer, bad_kmers_); + } + + TKmerQualitySample qualitySample = {kmerStat.qual, + exp(kmerStat.posterior_genomic_ll), + (size_t)kmerStat.count, idx}; + + if (oracle_.IsGenomic(kmer)) { + genomic_centers_.Add(qualitySample); + } else { + non_genomic_centers_.Add(qualitySample); + } + } + } + + void Info() { + { oracle_.KMerSetStats(singleton_kmers_, "Singletons"); } + { oracle_.KMerSetStats(non_singleton_kmers_, "NonSingletons"); } + { oracle_.KMerSetStats(center_cluster_kmers_, "Center cluster kmers"); } + + { oracle_.KMerSetStats(good_kmers_, "Good kmers"); } + + { oracle_.KMerSetStats(bad_kmers_, "Bad not-filtered by clustering kmers"); } + + { + // GenomicCenters.PrintInfo("Genomic centers"); + // NonGenomicCenters.PrintInfo("Non genomic centers"); + + std::ofstream out("quality_samples.tsv"); + out << "is_genomic\tlength\tmax_run_length\tquality\tposterior\tcount" + << std::endl; + + for (const auto& entry : genomic_centers_) { + out << "1\t" << data_[entry.idx_].kmer.size() << "\t" + << MaxRunLength(data_[entry.idx_].kmer) << "\t" << entry.quality_ + << "\t" << entry.posterior_ << "\t" << entry.count_ << "\n"; + } + + for (const auto& entry : non_genomic_centers_) { + out << "0\t" << data_[entry.idx_].kmer.size() << "\t" + << MaxRunLength(data_[entry.idx_].kmer) << "\t" << entry.quality_ + << "\t" << entry.posterior_ << "\t" << entry.count_ << "\n"; + } + } + } + + int MaxRunLength(const HKMer& kmer) const { + int max_len = kmer[0].len; + for (uint i = 0; i < hammer::K; ++i) { + max_len = std::max(max_len, (int)kmer[i].len); + } + return max_len; + } +}; + +} // namespace hammer +#endif // PROJECT_QUALITY_METRICS_H \ No newline at end of file diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/quality_thresholds_estimator.cpp spades-3.11.1+dfsg/src/projects/ionhammer/quality_thresholds_estimator.cpp --- spades-3.10.1+dfsg/src/projects/ionhammer/quality_thresholds_estimator.cpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/quality_thresholds_estimator.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,5 @@ +// +// Created by Vasiliy Ershov on 16/07/16. +// + +#include "quality_thresholds_estimator.h" diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/quality_thresholds_estimator.h spades-3.11.1+dfsg/src/projects/ionhammer/quality_thresholds_estimator.h --- spades-3.10.1+dfsg/src/projects/ionhammer/quality_thresholds_estimator.h 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/quality_thresholds_estimator.h 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,101 @@ +// +// Created by Vasiliy Ershov on 16/07/16. 
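A note ahead of the next header: SimpleTwoClassClustering::SimpleThresholdEstimation below is a one-feature decision stump. Since the squared-deviation sum of a leaf equals sum(x^2) - (sum x)^2 / n, and sum(x^2) is constant across splits of a sorted sample, minimizing total within-leaf variance is equivalent to minimizing -(sum_left)^2/n_left - (sum_right)^2/n_right; the Score() used below additionally scales each leaf by (1 + 2*log(n + 1)) and zeroes leaves with three or fewer samples. Here is a sketch of the unregularized criterion; BestSplitThreshold is a hypothetical name and the input is assumed non-empty.

#include <algorithm>
#include <vector>

// Returns the threshold t such that splitting a sample into {x <= t} and
// {x > t} minimizes total within-leaf sum of squared deviations.
double BestSplitThreshold(std::vector<double> xs) {
  std::sort(xs.begin(), xs.end());
  double total = 0;
  for (double x : xs) total += x;

  // Unregularized leaf score: -(sum)^2 / n; empty leaves contribute 0.
  auto score = [](double s, double n) { return n > 0 ? -s * s / n : 0.0; };

  double best_score = 0, best_split = xs.front();
  double left_sum = 0, left_n = 0;
  for (double x : xs) {
    left_sum += x;
    left_n += 1;
    const double right_sum = total - left_sum;
    const double right_n = (double)xs.size() - left_n;
    const double split_score = score(left_sum, left_n) + score(right_sum, right_n);
    if (split_score <= best_score) {
      best_score = split_score;
      best_split = x;  // samples <= x go to the left leaf
    }
  }
  return best_split;
}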
+//
+
+#ifndef PROJECT_QUALITY_THRESHOLDS_ESTIMATOR_H
+#define PROJECT_QUALITY_THRESHOLDS_ESTIMATOR_H
+
+#include
+#include
+#include
+
+class SimpleTwoClassClustering {
+ private:
+  std::vector<double> samples_;
+
+  static inline double Score(double sum, double weight) {
+    return weight > 3 ? -sum * sum * (1 + 2 * log(weight + 1)) / weight : 0;
+  }
+
+  struct BestSplit {
+    double score_;
+    double split_;
+    double left_sum_;
+    double left_weight_;
+    double right_sum_;
+    double right_weight_;
+  };
+
+ public:
+  SimpleTwoClassClustering(size_t maxSize = 100) { samples_.reserve(maxSize); }
+
+  void Add(double sample) { samples_.push_back(sample); }
+
+  double FindBestSpit() {
+    std::sort(samples_.begin(), samples_.end());
+    auto bestSplit = SimpleThresholdEstimation(samples_.begin(), samples_.end());
+    return bestSplit.split_;
+  }
+
+  double EstimateAlpha() {
+    double minSample = 0;
+    std::sort(samples_.begin(), samples_.end());
+
+    for (auto sampl : samples_) {
+      minSample = std::min(sampl, minSample);
+    }
+    auto bestSplit = SimpleThresholdEstimation(samples_.begin(), samples_.end());
+    const double p = 0.5;
+    double alpha = log(1.0 - p) / bestSplit.split_;
+    return alpha;
+  }
+
+  // it's simple decision tree with quality target
+  static BestSplit SimpleThresholdEstimation(std::vector<double>::const_iterator from,
+                                             std::vector<double>::const_iterator to) {
+    const double total_sum = [&]() -> double {
+      double sum = 0;
+      for (auto sorted_samples_iterator = from; sorted_samples_iterator != to; ++sorted_samples_iterator) {
+        const auto sample = *sorted_samples_iterator;
+        sum += sample;
+      }
+      return sum;
+    }();
+    const double total_weight = (double)(to - from);
+
+    double best_score = 0;
+    double best_left_sum = 0;
+    double best_left_weight = 0;
+    double best_split = 0;
+
+    double sum = 0;
+    double weight = 0;
+
+    for (auto sorted_samples_iterator = from; sorted_samples_iterator != to; ++sorted_samples_iterator) {
+      const auto sample = *sorted_samples_iterator;
+      sum += sample;
+      ++weight;
+
+      const double right_leaf_sum = total_sum - sum;
+      const double right_leaf_weight = total_weight - weight;
+      const double split_score =
+          Score(sum, weight) + Score(right_leaf_sum, right_leaf_weight);
+
+      if (split_score <= best_score) {
+        best_score = split_score;
+        best_left_weight = weight;
+        best_left_sum = sum;
+        best_split = sample;
+      }
+    }
+
+    return {best_score,
+            best_split,
+            best_left_sum,
+            best_left_weight,
+            total_sum - best_left_sum,
+            total_weight - best_left_weight};
+  }
+};
+
+#endif  // PROJECT_QUALITY_THRESHOLDS_ESTIMATOR_H
diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/read_corrector.hpp spades-3.11.1+dfsg/src/projects/ionhammer/read_corrector.hpp
--- spades-3.10.1+dfsg/src/projects/ionhammer/read_corrector.hpp 2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/ionhammer/read_corrector.hpp 2017-09-28 09:05:13.000000000 +0000
@@ -9,35 +9,35 @@
 #define __HAMMER_IT_READ_CORRECTOR_HPP__
 #include "HSeq.hpp"
+#include "config_struct.hpp"
+#include "consensus.hpp"
 #include "flow_space_read.hpp"
 #include "hkmer_distance.hpp"
-#include "consensus.hpp"
-#include "valid_hkmer_generator.hpp"
-#include "config_struct.hpp"
 #include "io/reads/single_read.hpp"
+#include "valid_hkmer_generator.hpp"
-#include
 #include
 #include
+#include
 #include
 #include
 #include "seqeval/BaseHypothesisEvaluator.h"
+#include
+#include
 #include
-#include
 #include
 #include
-#include
 #include
 #include
-#include
-#include
+#include
 #if 1
-#include "sequence/nucl.hpp"
-#include
 #include
+#include
+#include
"sequence/nucl.hpp" #endif namespace hammer { @@ -50,16 +50,14 @@ template static bool exactAlignH(It1 a_begin, It1 a_initial_pos, It1 a_end, - It2 b_initial_pos, It2 /*b_end*/, - uint8_t max_offset, uint8_t n_cmp, int* p_offset) -{ + It2 b_initial_pos, It2 /*b_end*/, uint8_t max_offset, + uint8_t n_cmp, int *p_offset) { int M = max_offset * 2 + 1; for (int i = 0; i < M; i++) { - int offset = (i / 2) * ((i & 1) ? 1 : -1); // 0, -1, 1, -2, 2, ... + int offset = (i / 2) * ((i & 1) ? 1 : -1); // 0, -1, 1, -2, 2, ... auto a_it = a_initial_pos + offset; auto b_it = b_initial_pos; - if (a_it < a_begin || a_it + n_cmp > a_end) - continue; + if (a_it < a_begin || a_it + n_cmp > a_end) continue; bool match = true; for (size_t j = 0; j < n_cmp; j++) if ((a_it + j)->raw != (b_it + j)->raw) { @@ -76,22 +74,19 @@ template static int overlapAlignH(It1 a_begin, It1 a_end, It2 b_begin, It2 b_end, - uint8_t max_offset) -{ + uint8_t max_offset) { // TODO: use dynamic programming int M = max_offset * 2 + 1; int best_offset = 0; int best_score = 0; for (int i = 0; i < M; i++) { - int offset = (i / 2) * ((i & 1) ? 1 : -1); // 0, -1, 1, -2, 2, ... + int offset = (i / 2) * ((i & 1) ? 1 : -1); // 0, -1, 1, -2, 2, ... auto a_it = offset < 0 ? a_begin : a_begin + offset; auto b_it = offset < 0 ? b_begin - offset : b_begin; - if (b_it < b_begin || a_it >= a_end) - continue; + if (b_it < b_begin || a_it >= a_end) continue; int score = 0; - for ( ; a_it != a_end && b_it != b_end; ++a_it, ++b_it) - if (a_it->nucl == b_it->nucl) - score += std::min(a_it->len, b_it->len); + for (; a_it != a_end && b_it != b_end; ++a_it, ++b_it) + if (a_it->nucl == b_it->nucl) score += std::min(a_it->len, b_it->len); score -= i / 4; if (score > best_score) { best_offset = offset; @@ -101,7 +96,6 @@ return best_offset; } - struct Score { short value; short dir; @@ -110,16 +104,15 @@ #if 1 template -static void dump(boost::numeric::ublas::matrix &scores, - It1 x_begin, It1 x_end, It2 y_begin, It2 y_end) { +static void dump(boost::numeric::ublas::matrix &scores, It1 x_begin, + It1 x_end, It2 y_begin, It2 y_end) { std::cerr << " "; for (auto it = x_begin; it != x_end; ++it) std::cerr << std::setw(3) << int(it->len) << nucl(it->nucl); std::cerr << "\n "; auto m = x_end - x_begin; auto n = y_end - y_begin; - for (int i = 0; i <= m; i++) - std::cerr << std::setw(4) << scores(i, 0).value; + for (int i = 0; i <= m; i++) std::cerr << std::setw(4) << scores(i, 0).value; std::cerr << '\n'; for (int i = 1; i <= n; i++) { auto run = *(y_begin + i - 1); @@ -132,33 +125,29 @@ #endif template -static int alignH(It1 read_begin, It1 read_end, - It2 consensus_begin, It2 consensus_end, - int approx_read_offset, size_t n_skip_consensus, - uint8_t n_side = 5, uint8_t n_cmp = 8) { - +static int alignH(It1 read_begin, It1 read_end, It2 consensus_begin, + It2 consensus_end, int approx_read_offset, + size_t n_skip_consensus, uint8_t n_side = 5, + uint8_t n_cmp = 8) { int left_offset = n_side; int read_len = int(read_end - read_begin); int consensus_len = int(consensus_end - consensus_begin); It1 x_begin = read_begin + std::max(approx_read_offset - n_side, 0); - if (x_begin == read_begin) - left_offset = approx_read_offset; + if (x_begin == read_begin) left_offset = approx_read_offset; if (approx_read_offset - n_side + n_cmp >= read_len) { x_begin = read_end - std::min(n_cmp + 2 * n_side, read_len); left_offset = int(read_begin + approx_read_offset - x_begin); } - auto x_end = x_begin + std::min(int(2 * n_side + n_cmp), - int(read_end - x_begin)); + auto x_end = + 
x_begin + std::min(int(2 * n_side + n_cmp), int(read_end - x_begin)); - auto y_begin = consensus_begin + - std::min(int(n_skip_consensus), consensus_len); - if (y_begin == consensus_end) - return 0; // weird situation - auto y_end = y_begin + std::min(int(n_cmp), - int(consensus_end - y_begin)); + auto y_begin = + consensus_begin + std::min(int(n_skip_consensus), consensus_len); + if (y_begin == consensus_end) return 0; // weird situation + auto y_end = y_begin + std::min(int(n_cmp), int(consensus_end - y_begin)); // glocal alignment of homopolymer runs const short kDirUpLeft = 0; @@ -256,10 +245,9 @@ int min_acceptable_score = ((kNuclMatch + kFullMatch) * n_cmp * 4) / 5; if (scores(highest_x, highest_y).value < min_acceptable_score && n_cmp < 16U) - return alignH(read_begin, read_end, - consensus_begin, consensus_end, - approx_read_offset, n_skip_consensus, - n_side, uint8_t(n_cmp * 2)); + return alignH(read_begin, read_end, consensus_begin, consensus_end, + approx_read_offset, n_skip_consensus, n_side, + uint8_t(n_cmp * 2)); int x = int(highest_x); int y = int(highest_y); @@ -267,11 +255,14 @@ int dir = scores(x, y).dir; switch (dir) { case kDirUp: - --x; break; + --x; + break; case kDirLeft: - --y; break; + --y; + break; case kDirUpLeft: - --x, --y; break; + --x, --y; + break; default: break; } @@ -287,43 +278,43 @@ // Not used now class HKMerProlonger { - const KMerData& kmer_data_; + const KMerData &kmer_data_; public: struct RightSide { static size_t changingPosition() { return hammer::K - 1; } static hammer::HKMer shift(const hammer::HKMer &kmer) { hammer::HKMer res; - for (size_t i = 1; i < hammer::K; ++i) - res[i - 1] = kmer[i]; + for (size_t i = 1; i < hammer::K; ++i) res[i - 1] = kmer[i]; return res; } template - static void append(T& cont, U obj) { cont.push_back(obj); } + static void append(T &cont, U obj) { + cont.push_back(obj); + } }; struct LeftSide { static size_t changingPosition() { return 0; } static hammer::HKMer shift(const hammer::HKMer &kmer) { hammer::HKMer res; - for (size_t i = 1; i < hammer::K; ++i) - res[i] = kmer[i - 1]; + for (size_t i = 1; i < hammer::K; ++i) res[i] = kmer[i - 1]; return res; } template - static void append(T& cont, U obj) { cont.push_front(obj); } + static void append(T &cont, U obj) { + cont.push_front(obj); + } }; public: - /// @param[in] seed kmer to prolong /// @param[in] bases_to_recover maximum number of bases to recover template std::deque prolong(const hammer::HKMer &seed, size_t bases_to_recover) { std::deque good_runs(hammer::K); - for (size_t i = 0; i < hammer::K; ++i) - good_runs[i] = seed[i]; + for (size_t i = 0; i < hammer::K; ++i) good_runs[i] = seed[i]; auto good_kmer = seed; auto changing_pos = Side::changingPosition(); @@ -338,8 +329,7 @@ auto kmer = Side::shift(good_kmer); for (size_t nucl = 0; nucl < 4; ++nucl) { - if (nucl == good_kmer[changing_pos].nucl) - continue; + if (nucl == good_kmer[changing_pos].nucl) continue; for (size_t len = 1; len <= 4; ++len) { kmer[changing_pos] = hammer::HomopolymerRun(nucl, len); auto &k = kmer_data_[kmer]; @@ -366,14 +356,14 @@ } public: - HKMerProlonger(const KMerData& kmer_data) : kmer_data_(kmer_data) {} + HKMerProlonger(const KMerData &kmer_data) : kmer_data_(kmer_data) {} }; static const double kLowScoreThreshold = 1.0; class CorrectedRead { - FlowSpaceRead raw_read_; // Uncorrected read - const KMerData& kmer_data_; + FlowSpaceRead raw_read_; // Uncorrected read + const KMerData &kmer_data_; bool debug_mode_; // Stores runs after joining chunks @@ -381,19 +371,15 @@ // 
Contiguous part of read with strong consensus struct ConsensusChunk { - int approx_read_offset; // in the vector of raw read runs + int approx_read_offset; // in the vector of raw read runs int approx_end_read_offset_; unsigned rollback_end; // remove if don't align well int initial_read_offset_; - enum { - kChunkLeftAligned, - kChunkRightAligned, - kChunkNotAligned - } alignment; + enum { kChunkLeftAligned, kChunkRightAligned, kChunkNotAligned } alignment; - const FlowSpaceRead& raw_read; + const FlowSpaceRead &raw_read; size_t trimmed_left; size_t trimmed_right; bool debug_mode; @@ -401,24 +387,21 @@ std::vector consensus; std::vector consensus_scores; - int raw_start_offset() const { - return initial_read_offset_; - } + int raw_start_offset() const { return initial_read_offset_; } - ConsensusChunk(int initial_read_offset, - int approximate_read_offset, - int approximate_end_read_offset, - const ScoreStorage &scores, - unsigned rollback_end, - const FlowSpaceRead &read, + ConsensusChunk(int initial_read_offset, int approximate_read_offset, + int approximate_end_read_offset, const ScoreStorage &scores, + unsigned rollback_end, const FlowSpaceRead &read, bool debug_mode) - : approx_read_offset(approximate_read_offset), + : approx_read_offset(approximate_read_offset), approx_end_read_offset_(approximate_end_read_offset), rollback_end(rollback_end), initial_read_offset_(initial_read_offset), - alignment(kChunkNotAligned), raw_read(read), - trimmed_left(0), trimmed_right(0), debug_mode(debug_mode) - { + alignment(kChunkNotAligned), + raw_read(read), + trimmed_left(0), + trimmed_right(0), + debug_mode(debug_mode) { bool left_trim = true; for (size_t i = 0; i < scores.size(); ++i) { auto run = hammer::iontorrent::consensus(scores[i]); @@ -431,8 +414,10 @@ } if (debug_mode && left_trim) { - std::cerr << "[ConsensusChunk] trimmed from left: " << trimmed_left << std::endl; - std::cerr << "[ConsensusChunk] approx. read offset: " << approx_read_offset << std::endl; + std::cerr << "[ConsensusChunk] trimmed from left: " << trimmed_left + << std::endl; + std::cerr << "[ConsensusChunk] approx. read offset: " + << approx_read_offset << std::endl; } left_trim = false; @@ -442,13 +427,11 @@ } size_t right_end = consensus_scores.size(); - if (right_end == 0) - return; + if (right_end == 0) return; while (consensus_scores[right_end - 1] <= kLowScoreThreshold) { --right_end; - if (right_end == 0) - break; + if (right_end == 0) break; } trimmed_right = consensus.size() - right_end; @@ -456,39 +439,38 @@ consensus_scores.resize(right_end); } - void AlignLeftEndAgainstRead(size_t skip=0) { - const auto& data = raw_read.data(); + void AlignLeftEndAgainstRead(size_t skip = 0) { + const auto &data = raw_read.data(); - int offset = alignH(data.begin(), data.end(), - consensus.begin(), consensus.end(), - approx_read_offset, skip); + int offset = alignH(data.begin(), data.end(), consensus.begin(), + consensus.end(), approx_read_offset, skip); if (debug_mode) { - std::cerr << "[approx. read offset (left)] before: " << approx_read_offset << "; after: " - << approx_read_offset + offset << std::endl; + std::cerr << "[approx. 
read offset (left)] before: " + << approx_read_offset + << "; after: " << approx_read_offset + offset << std::endl; } approx_read_offset += offset; alignment = kChunkLeftAligned; } - void AlignRightEndAgainstRead(size_t skip=0) { - const auto& data = raw_read.data(); + void AlignRightEndAgainstRead(size_t skip = 0) { + const auto &data = raw_read.data(); int position_on_read = approx_end_read_offset_ - 1; - int offset = alignH(data.rbegin(), data.rend(), - consensus.rbegin(), consensus.rend(), + int offset = alignH(data.rbegin(), data.rend(), consensus.rbegin(), + consensus.rend(), int(data.size()) - 1 - position_on_read, skip); if (debug_mode) { - std::cerr << "[approx. read offset (right)] before: " << approx_read_offset << "; after: " - << approx_read_offset - offset << std::endl; + std::cerr << "[approx. read offset (right)] before: " + << approx_read_offset + << "; after: " << approx_read_offset - offset << std::endl; } approx_read_offset -= offset; alignment = kChunkRightAligned; } - int approx_end_read_offset() const { - return approx_end_read_offset_; - } + int approx_end_read_offset() const { return approx_end_read_offset_; } int approx_end_read_offset_untrimmed() const { return approx_end_read_offset() + int(trimmed_right); @@ -505,38 +487,37 @@ rollback_end = 0; } - bool DoMerge(ConsensusChunk& chunk) { + bool DoMerge(ConsensusChunk &chunk) { int right_end_offset = approx_end_read_offset(); if (debug_mode) { - std::cerr << "============== Merging chunks ===============" << std::endl; - std::cerr << "(" << approx_read_offset << " .. " << right_end_offset << ")"; - std::cerr << " -- (" << chunk.approx_read_offset << " .. " << chunk.approx_end_read_offset() << ")" << std::endl; + std::cerr << "============== Merging chunks ===============" + << std::endl; + std::cerr << "(" << approx_read_offset << " .. " << right_end_offset + << ")"; + std::cerr << " -- (" << chunk.approx_read_offset << " .. 
" + << chunk.approx_end_read_offset() << ")" << std::endl; int white_l = 0; for (int i = right_end_offset - 1; i >= 0; --i) white_l += raw_read[i].len; for (size_t i = 0; i < consensus.size(); ++i) white_l -= consensus[i].len; - for (int i = 0; i < white_l; ++i) - std::cerr << ' '; + for (int i = 0; i < white_l; ++i) std::cerr << ' '; for (size_t i = std::max(-white_l, 0); i < consensus.size(); ++i) std::cerr << consensus[i].str(); std::cerr << std::endl; for (int i = 0; i < chunk.approx_read_offset; ++i) - for (int j = 0; j < raw_read[i].len; ++j) - std::cerr << ' '; + for (int j = 0; j < raw_read[i].len; ++j) std::cerr << ' '; for (size_t i = 0; i < chunk.consensus.size(); ++i) std::cerr << chunk.consensus[i].str(); std::cerr << std::endl; } if (right_end_offset <= chunk.approx_read_offset) { - for (int i = right_end_offset; i < chunk.approx_read_offset; ++i) { - if (i >= static_cast(raw_read.size())) - return false; + if (i >= static_cast(raw_read.size())) return false; consensus.push_back(raw_read[i]); alignment = kChunkNotAligned; @@ -544,8 +525,8 @@ consensus_scores.push_back(0); } - consensus.insert(consensus.end(), - chunk.consensus.begin(), chunk.consensus.end()); + consensus.insert(consensus.end(), chunk.consensus.begin(), + chunk.consensus.end()); consensus_scores.insert(consensus_scores.end(), chunk.consensus_scores.begin(), @@ -553,14 +534,11 @@ } else { int overlap = right_end_offset - chunk.approx_read_offset; - overlap -= overlapAlignH(consensus.end() - overlap, - consensus.end(), + overlap -= overlapAlignH(consensus.end() - overlap, consensus.end(), chunk.consensus.begin(), - chunk.consensus.begin() + overlap, - 5); + chunk.consensus.begin() + overlap, 5); - if (overlap > static_cast(chunk.consensus.size())) - return false; + if (overlap > static_cast(chunk.consensus.size())) return false; if (overlap < 0) { chunk.approx_read_offset = right_end_offset - overlap; @@ -573,7 +551,8 @@ // FIXME if (overlap > 0 && rollback_end > 0) { for (int i = 0; i < overlap; i++) { - if (n_runs - overlap + i < 0 || n_runs - overlap + i >= consensus.size()) + if (n_runs - overlap + i < 0 || + n_runs - overlap + i >= consensus.size()) continue; auto left_run = consensus[n_runs - overlap + i]; auto right_run = chunk.consensus[i]; @@ -586,11 +565,10 @@ } if (overlap >= 3 && n_runs > overlap) { - for ( ; n_trim < overlap / 3; ++n_trim) { + for (; n_trim < overlap / 3; ++n_trim) { auto score1 = consensus_scores[n_runs - n_trim - 1]; auto score2 = chunk.consensus_scores[overlap - n_trim - 1]; - if (score1 > score2) - break; + if (score1 > score2) break; } consensus.resize(consensus.size() - n_trim); @@ -601,43 +579,37 @@ chunk.consensus.begin() + overlap - n_trim, chunk.consensus.end()); - consensus_scores.insert(consensus_scores.end(), - chunk.consensus_scores.begin() + overlap - n_trim, - chunk.consensus_scores.end()); + consensus_scores.insert( + consensus_scores.end(), + chunk.consensus_scores.begin() + overlap - n_trim, + chunk.consensus_scores.end()); } approx_end_read_offset_ = chunk.approx_end_read_offset(); return true; } - bool MergeWithDisjointChunk(ConsensusChunk& chunk) { - if (debug_mode) - std::cerr << "[MergeWithDisjointChunk]" << std::endl; + bool MergeWithDisjointChunk(ConsensusChunk &chunk) { + if (debug_mode) std::cerr << "[MergeWithDisjointChunk]" << std::endl; AlignRightEndAgainstRead(); - if (chunk.alignment != kChunkLeftAligned) - chunk.AlignLeftEndAgainstRead(); + if (chunk.alignment != kChunkLeftAligned) chunk.AlignLeftEndAgainstRead(); return DoMerge(chunk); } - bool 
MergeWithOverlappingChunk(ConsensusChunk& chunk) { - if (debug_mode) - std::cerr << "[MergeWithOverlappingChunk]" << std::endl; + bool MergeWithOverlappingChunk(ConsensusChunk &chunk) { + if (debug_mode) std::cerr << "[MergeWithOverlappingChunk]" << std::endl; int right_end_offset = approx_end_read_offset_; size_t overlap = right_end_offset - chunk.approx_read_offset; - if (overlap > chunk.consensus_scores.size()) - return false; + if (overlap > chunk.consensus_scores.size()) return false; AlignRightEndAgainstRead(); - if (chunk.alignment != kChunkLeftAligned) - chunk.AlignLeftEndAgainstRead(); + if (chunk.alignment != kChunkLeftAligned) chunk.AlignLeftEndAgainstRead(); return DoMerge(chunk); } public: - - bool TryMergeWith(ConsensusChunk& chunk) { - if (chunk.consensus.empty()) - return true; + bool TryMergeWith(ConsensusChunk &chunk) { + if (chunk.consensus.empty()) return true; alignment = kChunkNotAligned; int right_end_offset = approx_end_read_offset_; @@ -647,17 +619,14 @@ else return MergeWithOverlappingChunk(chunk); } - }; // Chunks where strong consensus was obtained std::list chunks_; int trimmed_by_gen_; - void PushChunk(const ScoreStorage &scores, - int initial_read_offset, - int approx_read_offset, - int approx_end_read_offset, + void PushChunk(const ScoreStorage &scores, int initial_read_offset, + int approx_read_offset, int approx_end_read_offset, unsigned rollback_end) { chunks_.push_back(ConsensusChunk(initial_read_offset, approx_read_offset, approx_end_read_offset, scores, @@ -666,8 +635,7 @@ auto &consensus = chunks_.back().consensus; size_t len = consensus.size(); size_t nucl_len = 0; - for (size_t i = 0; i < len; ++i) - nucl_len += consensus[i].len; + for (size_t i = 0; i < len; ++i) nucl_len += consensus[i].len; } chunks_.back().AlignLeftEndAgainstRead(); @@ -675,9 +643,7 @@ trimmed_by_gen_ = chunks_.back().raw_start_offset(); } - const ConsensusChunk& LastChunk() const { - return chunks_.back(); - } + const ConsensusChunk &LastChunk() const { return chunks_.back(); } class ChunkCollector { CorrectedRead &cread_; @@ -716,17 +682,16 @@ k[1] = kmer_data_[kmer_data_[!seq].changeto]; k[1].kmer = !k[1].kmer; - if (k[0].qual > k[1].qual) - std::swap(k[0], k[1]); + if (k[0].qual > k[1].qual) std::swap(k[0], k[1]); using namespace hammer; for (size_t i = 0; i < 2; ++i) { auto &kmer = k[i].kmer; int end_diff; - auto dist = distanceHKMer(kmer.begin(), kmer.end(), seq.begin(), seq.end(), 3, &end_diff); + auto dist = distanceHKMer(kmer.begin(), kmer.end(), seq.begin(), + seq.end(), 3, &end_diff); if (debug_mode_) { - std::cerr << "[GetCenterOfCluster] distance(" - << seq << ", " << kmer << ") = " << dist << std::endl; - + std::cerr << "[GetCenterOfCluster] distance(" << seq << ", " << kmer + << ") = " << dist << std::endl; } if (dist <= 2) { return Center{kmer, start_pos + int(hammer::K) + end_diff}; @@ -736,8 +701,7 @@ } bool IsInconsistent(const Center ¢er) const { - if (!last_good_center_is_defined) - return false; + if (!last_good_center_is_defined) return false; for (size_t i = 0; i < hammer::K - skipped - 1; ++i) if (last_good_center.seq[i + skipped + 1].nucl != center.seq[i].nucl) @@ -750,17 +714,16 @@ unsigned rollback_end = 0; if (replacing) { - if (rollback_size < 0) - rollback_size = 0; + if (rollback_size < 0) rollback_size = 0; if (rollback_size < int(scores.size())) - rollback_end = int(scores.size()) - rollback_size; + rollback_end = int(scores.size()) - rollback_size; replacing = false; rollback_size = 0; } if (scores.size() > hammer::K) { - 
cread_.PushChunk(scores, raw_chunk_start_pos, - approx_read_offset, approx_end_read_offset, rollback_end); + cread_.PushChunk(scores, raw_chunk_start_pos, approx_read_offset, + approx_end_read_offset, rollback_end); pos = cread_.LastChunk().approx_end_read_offset_untrimmed() - hammer::K; pos += skipped; } else { @@ -779,12 +742,12 @@ // side effect: changes chunk_pos, pos, and approx_n_insertions bool TryToAlignCurrentCenter(const Center ¢er) { - if (!last_good_center_is_defined) - return true; + if (!last_good_center_is_defined) return true; if (debug_mode_) { std::cerr << "[TryToAlignCurrentCenter] " << center.seq.str() - << " (previous good center is " << last_good_center.seq.str() << "," + << " (previous good center is " << last_good_center.seq.str() + << "," << " skipped " << skipped << " centers)" << std::endl; } @@ -793,15 +756,15 @@ int offset; bool aligned = exactAlignH(last_good_center.seq.begin(), last_good_center.seq.begin() + skipped + 1, - last_good_center.seq.end(), - center.seq.begin(), center.seq.end(), 3, 8, &offset); + last_good_center.seq.end(), center.seq.begin(), + center.seq.end(), 3, 8, &offset); bool result = aligned && chunk_pos + offset >= 0; if (result) { if (debug_mode_) - std::cerr << "[TryToAlignCurrentCenter] offset = " << offset << std::endl; - if (offset < 0) - approx_n_insertions -= offset; + std::cerr << "[TryToAlignCurrentCenter] offset = " << offset + << std::endl; + if (offset < 0) approx_n_insertions -= offset; pos += offset; chunk_pos += offset; } @@ -820,12 +783,12 @@ auto k = kmer_data_[center.seq]; for (size_t i = 0; i < hammer::K; ++i) - scores[chunk_pos + i](center.seq[i].nucl, center.seq[i].len) += double(k.count) * (1.0 - k.qual); + scores[chunk_pos + i](center.seq[i].nucl, center.seq[i].len) += + double(k.count) * (1.0 - k.qual); last_good_center = center; last_good_center_is_defined = true; - if (raw_chunk_start_pos == -1) - raw_chunk_start_pos = raw_pos; + if (raw_chunk_start_pos == -1) raw_chunk_start_pos = raw_pos; approx_end_read_offset = center.end_offset; if (debug_mode_) { std::cerr << "e.o. = " << approx_end_read_offset << std::endl; @@ -834,20 +797,27 @@ skipped = 0; } - public: - ChunkCollector(const io::SingleRead& r, CorrectedRead &cread, - const KMerData &kmer_data, bool debug_mode) : - cread_(cread), kmer_data_(kmer_data), debug_mode_(debug_mode), - gen(r), pos(int(gen.trimmed_left())), skipped(0), - last_good_center(), last_good_center_is_defined(false), - is_first_center(true), - replacing(false), rollback_size(0), - need_to_align(false), - approx_read_offset(0), approx_end_read_offset(0), - scores(), chunk_pos(0), - raw_chunk_start_pos(-1), - approx_n_insertions(0) - { + public: + ChunkCollector(const io::SingleRead &r, CorrectedRead &cread, + const KMerData &kmer_data, bool debug_mode) + : cread_(cread), + kmer_data_(kmer_data), + debug_mode_(debug_mode), + gen(r), + pos(int(gen.trimmed_left())), + skipped(0), + last_good_center(), + last_good_center_is_defined(false), + is_first_center(true), + replacing(false), + rollback_size(0), + need_to_align(false), + approx_read_offset(0), + approx_end_read_offset(0), + scores(), + chunk_pos(0), + raw_chunk_start_pos(-1), + approx_n_insertions(0) { --pos; --chunk_pos; } @@ -858,7 +828,7 @@ raw_pos = int(gen.trimmed_left()) - 1; if (debug_mode_) { - std::cerr << "gen. trimmed = " << gen.trimmed_left() << std::endl; + std::cerr << "gen. 
trimmed = " << gen.trimmed_left() << std::endl; } while (gen.HasMore()) { @@ -869,9 +839,10 @@ ++raw_pos; if (debug_mode_) { std::cerr << "=================================" << std::endl; - std::cerr << "pos = " << pos << ", raw_pos = " << raw_pos << - ", last_good_center_is_defined = " << last_good_center_is_defined << - ", skipped = " << skipped << std::endl; + std::cerr << "pos = " << pos << ", raw_pos = " << raw_pos + << ", last_good_center_is_defined = " + << last_good_center_is_defined << ", skipped = " << skipped + << std::endl; } ++chunk_pos; @@ -884,21 +855,25 @@ qual = kmer_data_[center.seq].qual; } - if (qual > lowQualThreshold && last_good_center_is_defined && skipped == 0) { + if (qual > lowQualThreshold && last_good_center_is_defined && + skipped == 0) { if (debug_mode_) { - std::cerr << "raw_pos + hammer::K = " << raw_pos + hammer::K << std::endl; - std::cerr << "last_good_center.end_offset + 1 = " << last_good_center.end_offset + 1 << std::endl; + std::cerr << "raw_pos + hammer::K = " << raw_pos + hammer::K + << std::endl; + std::cerr << "last_good_center.end_offset + 1 = " + << last_good_center.end_offset + 1 << std::endl; } // Finding a center by means of clustering failed. // Let's try the following: take last good center and make a new one - // from it by appending next homopolymer run; if its quality is high, we use it. + // from it by appending next homopolymer run; if its quality is high, + // we use it. if (raw_pos + hammer::K < last_good_center.end_offset + 1) { --pos; --chunk_pos; if (debug_mode_) { std::cerr << "skipping low-quality hk-mer" << std::endl; } - continue; // move to next hk-mer + continue; // move to next hk-mer } else if (raw_pos + hammer::K == last_good_center.end_offset + 1) { auto seq_corr = last_good_center.seq; for (size_t i = 0; i < hammer::K - 1; ++i) @@ -907,7 +882,8 @@ center = Center{seq_corr, last_good_center.end_offset + 1}; qual = kmer_data_[center.seq].qual; if (debug_mode_) { - std::cerr << "seq_corr = " << seq_corr.str() << " , qual = " << qual << std::endl; + std::cerr << "seq_corr = " << seq_corr.str() + << " , qual = " << qual << std::endl; } if (qual > lowQualThreshold && can_be_changed) { @@ -922,9 +898,9 @@ bool inconsistent = IsInconsistent(center); if (debug_mode_ && !low_qual && seq != center.seq) { - std::cerr << "replaced " << seq.str() - << " (quality " << kmer_data_[seq].qual - << ", count " << kmer_data_[seq].count << ")" + std::cerr << "replaced " << seq.str() << " (quality " + << kmer_data_[seq].qual << ", count " + << kmer_data_[seq].count << ")" << " with " << center.seq.str() << std::endl; } @@ -954,7 +930,8 @@ } if (debug_mode_) { - std::cerr << "[include into consensus] raw_pos = " << raw_pos << std::endl; + std::cerr << "[include into consensus] raw_pos = " << raw_pos + << std::endl; } IncludeIntoConsensus(center); } @@ -964,34 +941,29 @@ } }; - void CollectChunks(const io::SingleRead& r) { + void CollectChunks(const io::SingleRead &r) { ChunkCollector chunk_collector(r, *this, kmer_data_, debug_mode_); chunk_collector.Run(); } public: - CorrectedRead(const io::SingleRead& read, const KMerData& kmer_data, - bool debug_mode = false) : - raw_read_(read), - kmer_data_(kmer_data), - debug_mode_(debug_mode) - { + CorrectedRead(const io::SingleRead &read, const KMerData &kmer_data, + bool debug_mode = false) + : raw_read_(read), kmer_data_(kmer_data), debug_mode_(debug_mode) { CollectChunks(read); } void MergeChunks() { - if (chunks_.empty()) - return; + if (chunks_.empty()) return; auto iter = chunks_.begin(); - 
ConsensusChunk& merged = *iter; + ConsensusChunk &merged = *iter; if (debug_mode_) { if (chunks_.size() == 1) { iter->AlignLeftEndAgainstRead(); for (int i = 0; i < iter->approx_read_offset; ++i) - for (int j = 0; j < raw_read_[i].len; ++j) - std::cerr << ' '; + for (int j = 0; j < raw_read_[i].len; ++j) std::cerr << ' '; for (size_t i = 0; i < iter->consensus.size(); ++i) std::cerr << iter->consensus[i].str(); std::cerr << std::endl; @@ -1000,8 +972,7 @@ ++iter; while (iter != chunks_.end()) { - if (iter->consensus.size() > hammer::K) - merged.TryMergeWith(*iter); + if (iter->consensus.size() > hammer::K) merged.TryMergeWith(*iter); iter = chunks_.erase(iter); } @@ -1010,16 +981,16 @@ void AttachUncorrectedRuns() { // attach runs from the right - const auto& data = raw_read_.data(); + const auto &data = raw_read_.data(); int n_raw = int(raw_read_.size()); int end_read_offset = LastChunk().approx_end_read_offset(); if (end_read_offset < n_raw && end_read_offset >= 0) { corrected_runs_.insert(corrected_runs_.end(), - data.begin() + end_read_offset, - data.end()); + data.begin() + end_read_offset, data.end()); } if (debug_mode_) { - std::cerr << "n_raw = " << n_raw << ", end_read_offset = " << end_read_offset << std::endl; + std::cerr << "n_raw = " << n_raw + << ", end_read_offset = " << end_read_offset << std::endl; } // attach runs from the left @@ -1033,16 +1004,14 @@ } std::string GetSequenceString() const { - if (chunks_.empty() && corrected_runs_.empty()) - return ""; + if (chunks_.empty() && corrected_runs_.empty()) return ""; std::string res; if (!corrected_runs_.empty()) { for (auto it = corrected_runs_.begin(); it != corrected_runs_.end(); ++it) res += it->str(); } else { - auto& runs = chunks_.front().consensus; - for (auto it = runs.begin(); it != runs.end(); ++it) - res += it->str(); + auto &runs = chunks_.front().consensus; + for (auto it = runs.begin(); it != runs.end(); ++it) res += it->str(); } return res; } @@ -1052,7 +1021,6 @@ const KMerData &kmer_data_; public: - struct ReadSelectionPredicate { virtual bool operator()(const io::SingleRead &read) = 0; }; @@ -1060,76 +1028,70 @@ struct DebugOutputPredicate : public ReadSelectionPredicate {}; struct NoDebug : public DebugOutputPredicate { - virtual bool operator()(const io::SingleRead &) { - return false; - } + virtual bool operator()(const io::SingleRead &) { return false; } }; struct FullDebug : public DebugOutputPredicate { - virtual bool operator()(const io::SingleRead &) { - return true; - } + virtual bool operator()(const io::SingleRead &) { return true; } }; class DebugIfContains : public DebugOutputPredicate { Sequence needle_; Sequence needle_rc_; - public: - DebugIfContains(const Sequence &seq) : - needle_(seq), needle_rc_(!seq) {} + + public: + DebugIfContains(const Sequence &seq) : needle_(seq), needle_rc_(!seq) {} virtual bool operator()(const io::SingleRead &read) { auto read_seq = read.sequence(); - if (read_seq.size() < needle_.size()) - return false; - if (read_seq.find(needle_, 0) != -1ULL) - return true; - if (read_seq.find(needle_rc_, 0) != -1ULL) - return true; + if (read_seq.size() < needle_.size()) return false; + if (read_seq.find(needle_, 0) != -1ULL) return true; + if (read_seq.find(needle_rc_, 0) != -1ULL) return true; return false; } }; struct SelectPredicate : public ReadSelectionPredicate {}; struct SelectAll : public SelectPredicate { - virtual bool operator()(const io::SingleRead &) { - return true; - } + virtual bool operator()(const io::SingleRead &) { return true; } }; class SelectByName : 
public SelectPredicate { std::set names_; - public: - SelectByName(const std::set& names) : - names_(names) {} + + public: + SelectByName(const std::set &names) : names_(names) {} virtual bool operator()(const io::SingleRead &r) { return names_.find(r.name()) != names_.end(); } }; -private: - BamTools::SamHeader* sam_header_; + private: + BamTools::SamHeader *sam_header_; DebugOutputPredicate &debug_pred_; SelectPredicate &select_pred_; -public: + public: SingleReadCorrector(const KMerData &kmer_data, BamTools::SamHeader *sam_header, - DebugOutputPredicate &debug, - SelectPredicate &select) : - kmer_data_(kmer_data), sam_header_(sam_header), - debug_pred_(debug), select_pred_(select) {} + DebugOutputPredicate &debug, SelectPredicate &select) + : kmer_data_(kmer_data), + sam_header_(sam_header), + debug_pred_(debug), + select_pred_(select) {} - SingleReadCorrector(const KMerData &kmer_data, - DebugOutputPredicate &debug, - SelectPredicate &select) : - kmer_data_(kmer_data), sam_header_(NULL), - debug_pred_(debug), select_pred_(select) {} + SingleReadCorrector(const KMerData &kmer_data, DebugOutputPredicate &debug, + SelectPredicate &select) + : kmer_data_(kmer_data), + sam_header_(NULL), + debug_pred_(debug), + select_pred_(select) {} - std::unique_ptr operator()(std::unique_ptr r) { + std::unique_ptr operator()( + std::unique_ptr r) { return operator()(*r); } - + std::unique_ptr operator()(const io::SingleRead &r) { if (!select_pred_(r)) return nullptr; bool debug_mode = debug_pred_(r); @@ -1142,32 +1104,28 @@ CorrectedRead read(r, kmer_data_, debug_mode); read.MergeChunks(); - if (cfg::get().keep_uncorrected_ends) - read.AttachUncorrectedRuns(); + if (cfg::get().keep_uncorrected_ends) read.AttachUncorrectedRuns(); if (debug_mode) { std::cerr << "final result: " << read.GetSequenceString() << std::endl; } auto seq = read.GetSequenceString(); - if (seq.empty()) - return nullptr; + if (seq.empty()) return nullptr; return std::unique_ptr(new io::SingleRead(r.name(), seq)); } - std::unique_ptr - operator()(std::unique_ptr alignment) { + std::unique_ptr operator()( + std::unique_ptr alignment) { VERIFY(sam_header_); io::SingleRead r(alignment->Name, alignment->QueryBases); // reverse strand means we're working with a mapped BAM, might be // the case for datasets downloaded from IonCommunity - if (alignment->IsReverseStrand()) - r = !r; + if (alignment->IsReverseStrand()) r = !r; auto corrected_r = operator()(r); std::string rg; - if (!alignment->GetTag("RG", rg) || !corrected_r) - return nullptr; + if (!alignment->GetTag("RG", rg) || !corrected_r) return nullptr; auto flow_order = sam_header_->ReadGroups[rg].FlowOrder; float delta_score, fit_score; @@ -1176,17 +1134,26 @@ std::reverse(seq.begin(), seq.end()); for (auto it = seq.begin(); it != seq.end(); ++it) { switch (*it) { - case 'A': *it = 'T'; break; - case 'C': *it = 'G'; break; - case 'G': *it = 'C'; break; - case 'T': *it = 'A'; break; - default: break; + case 'A': + *it = 'T'; + break; + case 'C': + *it = 'G'; + break; + case 'G': + *it = 'C'; + break; + case 'T': + *it = 'A'; + break; + default: + break; } } } - BaseHypothesisEvaluator(*alignment, flow_order, seq, - delta_score, fit_score, 0); + BaseHypothesisEvaluator(*alignment, flow_order, seq, delta_score, fit_score, + 0); std::stringstream ss; ss << alignment->Name << "_" << delta_score << "_" << fit_score; alignment->Name = ss.str(); @@ -1201,22 +1168,22 @@ class PairedReadCorrector : public SingleReadCorrector { public: - PairedReadCorrector(const KMerData &kmer_data, - 
DebugOutputPredicate &debug, + PairedReadCorrector(const KMerData &kmer_data, DebugOutputPredicate &debug, SelectPredicate &select) - : SingleReadCorrector(kmer_data, debug, select) {} + : SingleReadCorrector(kmer_data, debug, select) {} - std::unique_ptr operator()(std::unique_ptr r) { + std::unique_ptr operator()( + std::unique_ptr r) { auto corrected_r = SingleReadCorrector::operator()(r->first()); auto corrected_l = SingleReadCorrector::operator()(r->second()); - if (!corrected_r || !corrected_l) - return nullptr; + if (!corrected_r || !corrected_l) return nullptr; - return std::unique_ptr(new io::PairedRead(*corrected_r, *corrected_l, 0)); + return std::unique_ptr( + new io::PairedRead(*corrected_r, *corrected_l, 0)); } }; -}; // namespace correction -}; // namespace hammer -#endif // __HAMMER_IT_READ_CORRECTOR_HPP__ +}; // namespace correction +}; // namespace hammer +#endif // __HAMMER_IT_READ_CORRECTOR_HPP__ diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/read_corrector_new.hpp spades-3.11.1+dfsg/src/projects/ionhammer/read_corrector_new.hpp --- spades-3.10.1+dfsg/src/projects/ionhammer/read_corrector_new.hpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/read_corrector_new.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,252 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#ifndef __HAMMER_IT_READ_CORRECTOR_HPP__ +#define __HAMMER_IT_READ_CORRECTOR_HPP__ + +#include "HSeq.hpp" +#include "config_struct.hpp" +#include "consensus.hpp" +#include "flow_space_read.hpp" +#include "hkmer_distance.hpp" +#include "valid_hkmer_generator.hpp" + +#include +#include +#include + +#include +#include +#include "seqeval/BaseHypothesisEvaluator.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 1 +#include +#include +#endif + +#include "read_corrector_structs_new.h" + +namespace hammer { +namespace correction { + +template +class ReadCorrector { + public: + using PenaltyCalcer = CorrectionsLikelihoodCalcer; + private: + using State = CorrectionState; + const KMerData& data; + using PenaltyCalcerFactory = typename CorrectionsLikelihoodCalcer::PenaltyCalcerFactory; + const PenaltyCalcerFactory& penalty_calcer_factory; + + mutable size_t skipped_reads = 0; + mutable size_t queue_overflow_reads = 0; + + inline bool Flush(std::priority_queue& candidates, + std::priority_queue& corrections, + size_t limit, + size_t readSize) const { + + if (corrections.size() > limit) { + auto top = pop_queue(candidates); + if (!std::isinf(top.Penalty())) { + corrections.emplace(std::move(top)); + } + std::priority_queue().swap(candidates); + return true; + } else { + while (!candidates.empty()) { + auto top = pop_queue(candidates); + if (top.TotalCorrections() > std::max(readSize / 10, (size_t)3)) { + continue; + } + if (!std::isinf(top.Penalty())) { + corrections.emplace(std::move(top)); + } + } + return false; + } + } + + std::string CorrectRight(const PenaltyCalcer& penalty_calcer, + const std::string& read, + const size_t offset, + bool reverse, + bool& is_too_many_corrections, + bool make_only_simple_corrections = false) const { + if (offset >= read.size()) { + return read; + } + + std::priority_queue corrections; + 
std::priority_queue candidates; + + CorrectionContext context(data, read, reverse); + { + corrections.emplace(StateBuilder::Initial( + context, penalty_calcer, (uint)offset)); + } + + std::map > visited; + const size_t queue_limit = (const size_t)(cfg::get().queue_limit_multiplier * log2(read.size() - offset + 1));//(const size_t)(100 * read.size()); + + bool queue_overflow = false; + + while (!corrections.empty()) { + + auto state = std::pop_queue(corrections); + assert(state.Position() <= read.size()); + + { + size_t hash = state.GetHKMer().GetHash(); + if (visited[state.Position()].count(hash) && corrections.size()) { + continue; + } + visited[state.Position()].insert(hash); + } + + if (state.Position() < read.size()) { + MoveToNextDivergence mover(state, + context, + penalty_calcer); + if (mover.FindNextDivergence()) { + mover.Move(); + } + } + + if (state.Position() == read.size()) { + return state.Read()->ToString(); + } + + // //don't correct last kmer + if ((state.Position() + context.GetHRun(state.Position()).len) == + read.size()) { + auto result = state.Read()->ToString(); + result += (context.GetHRun(state.Position()).str()); + return result; + } + + { + SkipMayBeBadHRun skipHRun(state, + context, + penalty_calcer); + candidates.emplace(skipHRun.State()); + } + + { + CorrectLastHRun hrun_corrector(state, + context, + penalty_calcer); + if (make_only_simple_corrections) { + hrun_corrector.AddOnlySimpleCorrections(candidates); + } else { + hrun_corrector.AddPossibleCorrections(candidates); + } + queue_overflow |= Flush(candidates, corrections, queue_limit, read.size()); + } + } + is_too_many_corrections = queue_overflow; + + return read; + } + + public: + + ReadCorrector(const KMerData& kmer_data, + const PenaltyCalcerFactory& factory) + : data(kmer_data) + , penalty_calcer_factory(factory) {} + + ~ReadCorrector() { + INFO("Skipped reads count: " << skipped_reads); + if (queue_overflow_reads) { + WARN("Too many possible corrections in some reads (" << queue_overflow_reads << "), something may be wrong"); + } + } + + std::string Correct(const io::SingleRead& read, + bool keep_uncorrected_ends = true, + bool debug = false, + uint simple_passes_count = 0, + uint complex_passes_count = 1) const { + + std::string current_read = read.GetSequenceString(); + + PenaltyCalcer penalty_calcer = penalty_calcer_factory(current_read); + + bool overflow = false; + + for (uint pass = 0; pass < 2 * (simple_passes_count + complex_passes_count); ++pass) { + const bool reverse = pass % 2 == 0; // tail has more errors, so let's start with "simple" part + const bool only_simple = pass < 2 * simple_passes_count; + if (reverse) { + current_read = ReverseComplement(current_read); + } + const auto solid_island = penalty_calcer.SolidIsland(current_read); + const size_t solid_length = solid_island.right_ - solid_island.left_; + + if (debug) { +#pragma omp critical + { + std::cerr << "Solid length: " << solid_length << " / " + << current_read.size() << std::endl; + std::cerr << "Position: " << solid_island.left_ << " / " + << solid_island.right_ << std::endl; + } + } + + if (solid_length == 0 || solid_length == current_read.size()) { + if (pass == 0) { + if (solid_length == 0) { +#pragma omp atomic + skipped_reads++; + } + } + + break; + } + + bool pass_overflow = false; + current_read = CorrectRight(penalty_calcer, + current_read, + solid_island.right_, + reverse, + overflow, + only_simple); + + overflow |= pass_overflow; + + if (reverse) { + current_read = ReverseComplement(current_read); + } + } + + if 
(overflow) { + #pragma omp atomic + queue_overflow_reads++; + } + + if (!keep_uncorrected_ends) { + return penalty_calcer.TrimBadQuality(current_read); + } + return current_read; + } +}; + +}; // namespace correction +}; // namespace hammer +#endif // __HAMMER_IT_READ_CORRECTOR_HPP__ diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/read_corrector_structs_new.h spades-3.11.1+dfsg/src/projects/ionhammer/read_corrector_structs_new.h --- spades-3.10.1+dfsg/src/projects/ionhammer/read_corrector_structs_new.h 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/read_corrector_structs_new.h 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,740 @@ +// +// Created by Vasily Ershov on 19.03.16. +// + +#ifndef PROJECT_READ_CORRECTOR_INFO_H +#define PROJECT_READ_CORRECTOR_INFO_H + +#include +#include +#include "hkmer.hpp" + +namespace hammer { +namespace correction { + +namespace numeric = boost::numeric::ublas; +using HRun = HomopolymerRun; + +template +inline Moveable pop_queue(std::priority_queue& queue) { + Moveable result(std::move(const_cast(queue.top()))); + queue.pop(); + return result; +} + +struct IonEvent { + + IonEvent(const char nucl = 0, const char observed_size = 0, + const char fixed_size = 0, const bool to_good_correction = false) + : nucl_(nucl), + overserved_size_(observed_size), + fixed_size_(fixed_size), + is_to_good_correction_(to_good_correction) {} + + IonEvent(const IonEvent& other) = default; + + char nucl_; + char overserved_size_; + char fixed_size_; + bool is_to_good_correction_; + + inline HRun FixedHRun() const { + return HRun((uint8_t)nucl_, (uint8_t)fixed_size_); + } + + inline HRun ObservedHRun() const { + return HRun((uint8_t)nucl_, (uint8_t)overserved_size_); + } +}; + +class CorrectedRead { +private: + std::vector runs_; + std::shared_ptr previous_; + + public: + CorrectedRead() : previous_(nullptr) {} + + CorrectedRead(std::shared_ptr previous) + : previous_(previous) {} + + CorrectedRead(std::vector&& runs, + std::shared_ptr previous) + : runs_(std::move(runs)), previous_(previous) {} + + inline void Add(const HRun hrun) { runs_.push_back(hrun); } + + size_t Size() const { + size_t size = previous_ != nullptr ? 
previous_->Size() : 0; + for (auto hrun : runs_) { + size += hrun.len; + } + return size; + } + + inline void Fill(std::string& result) const { + if (previous_ != nullptr) { + previous_->Fill(result); + } + + for (auto hrun : runs_) { + result += hrun.str(); + } + } + + inline std::string ToString() const { + std::string result; + result.reserve(Size() + 10); + Fill(result); + return result; + } +}; + +template +class CorrectionState { + template + friend class MoveToNextDivergence; + template + friend class StateBuilder; + + private: + PenaltyState penalty_state; + HKMer kmer_; + std::shared_ptr current_read_ = std::shared_ptr(nullptr); + int16_t cursor_ = 0; + int16_t corrections_ = 0; + + public: + const HKMer& GetHKMer() const { return kmer_; } + + inline double Penalty() const { return penalty_state.Penalty(); } + + inline size_t TotalCorrections() const { return (size_t)corrections_; } + + const CorrectedRead* Read() const { return current_read_.get(); } + + unsigned Position() const { return (unsigned)cursor_; } +}; + +class CorrectionContext { + private: + std::vector read_; + std::vector hrun_sizes_; + const KMerData& data_; + bool reversed_; + + inline void FillHRunSizes(const std::vector& read, + std::vector& hrun_sizes) const { + size_t offset = 0; + hrun_sizes.resize(read.size()); + + while (offset < read.size()) { + size_t cursor = offset; + while (cursor < read.size() && read[cursor] == read[offset]) { + ++cursor; + }; + uint8_t sz = (uint8_t)(cursor - offset); + while (sz > 0) { + hrun_sizes[offset++] = sz; + --sz; + } + } + } + + public: + CorrectionContext(const KMerData& data, const std::string& read, + bool reverse) + : data_(data) + , reversed_(reverse) { + read_.resize(read.size()); + for (size_t i = 0; i < read.size(); ++i) { + read_[i] = dignucl(read[i]); + } + + FillHRunSizes(read_, hrun_sizes_); + } + + inline const std::vector& GetRead() const { return read_; } + + inline size_t GetOriginalOffset(const size_t offset) const { + if (reversed_) { + return read_.size() - offset; + } + return offset; + } + + inline bool IsReversed() const { return reversed_; } + + inline HRun GetHRun(size_t offset) const { + return HRun((uint8_t)read_[offset], (uint8_t)hrun_sizes_[offset]); + } + + inline KMerStat const* TryGetKMerStats(const HKMer& kmer) const { + auto idx = data_.checking_seq_idx(kmer); + return idx == -1ULL ? nullptr : &data_[kmer]; + } + + inline bool Skip(const HKMer& kmer) const { + auto stat = TryGetKMerStats(kmer); + return stat != nullptr ? 
stat->skip() : false; + } +}; + +// +template +class StateBuilder { + using State = CorrectionState; + const State& previous_; + const PenaltyCalcer& penalty_calcer_; + const CorrectionContext& context_; + State next_; + + public: + StateBuilder(const State& previous, + const PenaltyCalcer& penalty_calcer, + const CorrectionContext& context) + : previous_(previous), + penalty_calcer_(penalty_calcer), + context_(context), + next_() { + next_.current_read_.reset(new CorrectedRead(previous_.current_read_)); + next_.kmer_ = previous_.kmer_; + next_.penalty_state = previous_.penalty_state; + next_.cursor_ = previous_.cursor_; + next_.corrections_ = previous_.corrections_; + } + + inline void AddEvent(const IonEvent& event) { + if (event.fixed_size_ != 0) { + const HRun run = event.FixedHRun(); + next_.kmer_ <<= run; + next_.current_read_->Add(run); + } + + next_.cursor_ = (int16_t)(next_.cursor_ + event.overserved_size_); + penalty_calcer_.Update(next_.penalty_state, event, + context_.TryGetKMerStats(next_.kmer_)); + + if (event.fixed_size_ != event.overserved_size_) { + next_.corrections_++; + } + } + + inline State Build() { return next_; } + + static State Initial(const CorrectionContext& context, + const PenaltyCalcer& penalty, + unsigned skip) { + State state; + state.penalty_state = PenaltyCalcer::CreateState( + context.IsReversed(), (unsigned)context.GetRead().size()); + state.current_read_.reset(new CorrectedRead()); + size_t offset = 0; + size_t minSkip = 0; + + for (unsigned i = 0; i < hammer::K; ++i) { + minSkip += context.GetHRun(minSkip).len; + if (minSkip >= context.GetRead().size()) { + break; + } + } + + if (minSkip > skip) { + skip = (unsigned)minSkip; + } + state.cursor_ = (int16_t)skip; + + while (offset < skip) { + HRun run = context.GetHRun(offset); + state.kmer_ <<= run; + state.current_read_->Add(context.GetHRun(offset)); + penalty.UpdateInitial(state.penalty_state, + IonEvent(run.nucl, run.len, run.len, true), + context.TryGetKMerStats(state.kmer_)); + offset += run.len; + } + return state; + } +}; + +template +class MoveToNextDivergence { + using State = CorrectionState; + std::vector Proceeded; + State& state_; + const CorrectionContext& context_; + const PenaltyCalcer& calcer_; + unsigned cursor_; + + public: + MoveToNextDivergence(State& state, + const CorrectionContext& context, + const PenaltyCalcer& calcer) + : state_(state), + context_(context), + calcer_(calcer), + cursor_((unsigned)state.cursor_) {} + + inline bool FindNextDivergence() { + const auto& context = context_; + const size_t readSize = context.GetRead().size(); + HKMer currentHKMer = state_.kmer_; + + while (cursor_ < readSize) { + const HRun hrun = context.GetHRun(cursor_); + currentHKMer <<= hrun; + + if (calcer_.Skip(currentHKMer)) { + Proceeded.push_back({hrun.Nucl(), hrun.Len(), hrun.Len(), true}); + cursor_ += hrun.len; + } else { + break; + } + } + return cursor_ != (unsigned)state_.cursor_; + } + + // we'll use it only while we move in branch… + inline void Move() { + for (unsigned i = 0; i < Proceeded.size(); ++i) { + state_.current_read_->Add(Proceeded[i].FixedHRun()); + state_.kmer_ <<= Proceeded[i].FixedHRun(); + calcer_.Update(state_.penalty_state, Proceeded[i], + context_.TryGetKMerStats(state_.kmer_)); + } + state_.cursor_ = (int16_t)cursor_; + } +}; + +template +class SkipMayBeBadHRun { +private: + using TState = CorrectionState; + const TState& previous_; + const CorrectionContext& context_; + const PenaltyCalcer& calcer_; + + public: + SkipMayBeBadHRun(const TState& previous, + 
const CorrectionContext& context, + const PenaltyCalcer& calcer) + : previous_(previous) + , context_(context) + , calcer_(calcer) {} + + inline TState State() { + StateBuilder nextBuilder(previous_, calcer_, context_); + const auto hrun = context_.GetHRun(previous_.Position()); + nextBuilder.AddEvent(IonEvent(hrun.nucl, hrun.len, hrun.len, false)); + return nextBuilder.Build(); + } +}; + +class HRunSizeSearcher { + private: + HKMer hkmer_; + const uint8_t observed_nucl_; + const char observed_size_; + const std::function& is_good_func; + + public: + HRunSizeSearcher(const HKMer& prev, + HRun run, + std::function& good) + : hkmer_(prev), + observed_nucl_(run.nucl), + observed_size_(run.len), + is_good_func(good) { + assert(hkmer_[K - 1].nucl != run.nucl); + hkmer_ <<= run; + } + + inline IonEvent WithoutCorrection() { + hkmer_[K - 1].len = observed_size_ & 0x3F; + return IonEvent(observed_nucl_, observed_size_, observed_size_, is_good_func(hkmer_)); + } + + inline std::vector TryFindInsertions(char max_error_size = 3, + const bool greedy = true) { + std::vector results; + results.reserve(max_error_size); + + const char nucl = hkmer_[K - 1].nucl; + for (char i = 1; i <= max_error_size; ++i) { + hkmer_[K - 1].len = (observed_size_ + i) & 0x3F; + if (is_good_func(hkmer_)) { + results.push_back( + IonEvent(nucl, observed_size_, (uint8_t)(observed_size_ + i), true)); + if (greedy) { + break; + } + } + } + return results; + } + + inline std::vector TryFindAllDeletions(const char max_error_size = 3, + const bool greedy = true) { + std::vector results; + results.reserve(max_error_size); + + const char nucl = hkmer_[K - 1].nucl; + + const char start = (const char)std::max(1, observed_size_ - max_error_size); + + for (char i = (char)(observed_size_ - 1); i >= start; --i) { + hkmer_[K - 1].len = i & 0x3F; + if (is_good_func(hkmer_)) { + results.push_back(IonEvent(nucl, observed_size_, i, true)); + if (greedy) { + break; + } + } + } + return results; + } + + inline IonEvent TryFindInsertion(char max_error_size = 3) { + const char nucl = hkmer_[K - 1].nucl; + bool found = false; + for (char i = 1; i <= max_error_size; ++i) { + hkmer_[K - 1].len = (observed_size_ + i) & 0x3F; + if (is_good_func(hkmer_)) { + found = true; + break; + } + } + return IonEvent(nucl, observed_size_, + (const char)(found ? hkmer_[K - 1].len : observed_size_ + 1), + found); + } + + inline IonEvent TryFindDeletion(const char max_error_size = 3) { + const char nucl = hkmer_[K - 1].nucl; + bool found = false; + + const char start = (const char)std::max(1, observed_size_ - max_error_size); + + for (char i = (char)(observed_size_ - 1); i >= start; --i) { + hkmer_[K - 1].len = i & 0x3F; + if (is_good_func(hkmer_)) { + found = true; + break; + } + } + return IonEvent(nucl, observed_size_, + (const char)(found ? 
hkmer_[K - 1].len : observed_size_ - 1), + found); + } + + inline std::vector Find(const char max_error_size = 3) { + std::vector events; + + IonEvent without = WithoutCorrection(); + if (without.is_to_good_correction_) { + events.push_back(without); + return events; + } + + IonEvent insertion = TryFindInsertion(max_error_size); + if (insertion.is_to_good_correction_) { + events.push_back(insertion); + } + + IonEvent deletion = TryFindDeletion(max_error_size); + if (deletion.is_to_good_correction_) { + events.push_back(deletion); + } + + return events; + } +}; + +template +class CorrectLastHRun { + using TState = CorrectionState; + const TState& previous_; + const CorrectionContext& context_; + const PenaltyCalcer& calcer_; + std::function is_good_function_; + + const unsigned kMaxFulldel = cfg::get().max_full_del; + const unsigned kMaxInDel = cfg::get().max_indel; + const unsigned kMaxFromZeroInsertion = cfg::get().max_from_zero_insertion; + const unsigned kMaxSecondIndel = cfg::get().max_second_indel; + + private: + inline bool AddAnotherNuclInsertions(const HRun run, + const TState& previous, + std::priority_queue& corrections) { + bool found = false; + const auto& kmer = previous.GetHKMer(); + + for (uint8_t c = 0; c < 4; ++c) { + if (c == run.nucl || c == kmer[K - 1].nucl) { + continue; + } + + HKMer another_nucl_insertion = kmer; + another_nucl_insertion <<= HRun(c, 1); + + for (unsigned i = 1; i <= kMaxFromZeroInsertion; ++i) { + another_nucl_insertion[K - 1].len = i & 0x3F; + if (is_good_function_(another_nucl_insertion)) { + HRunSizeSearcher rest_searcher(another_nucl_insertion, run, is_good_function_); + auto events = rest_searcher.Find((const char)kMaxSecondIndel); + for (auto& event : events) { + if (event.is_to_good_correction_) { + StateBuilder builder(previous, calcer_, context_); + builder.AddEvent(IonEvent(c, 0, (const char)i, true)); // new insertion + builder.AddEvent(event); + corrections.emplace(builder.Build()); + found = true; + } + } + break; + } + } + } + return found; + } + + public: + CorrectLastHRun(const TState& previous, + const CorrectionContext& context, + const PenaltyCalcer& calcer) + : previous_(previous), + context_(context), + calcer_(calcer), + is_good_function_(calcer_.Good()) {} + + inline void AddOnlySimpleCorrections(std::priority_queue& corrections, + unsigned indel_size = 1) { + const unsigned cursor = previous_.Position(); + const HRun run = context_.GetHRun(cursor); + + if (!is_good_function_(previous_.GetHKMer())) { + return; + } + + { + HRunSizeSearcher searcher(previous_.GetHKMer(), run, is_good_function_); + auto insertion = searcher.TryFindInsertion((char)indel_size); + if (insertion.is_to_good_correction_) { + StateBuilder builder(previous_, calcer_, context_); + builder.AddEvent(insertion); + corrections.emplace(builder.Build()); + } + + auto deletion = searcher.TryFindDeletion((const char)indel_size); + if (deletion.is_to_good_correction_) { + StateBuilder builder(previous_, calcer_, context_); + builder.AddEvent(deletion); + corrections.emplace(builder.Build()); + } + } + // + if (run.len == 1 && (cursor + 1 < context_.GetRead().size())) { + auto nextRun = context_.GetHRun(cursor + 1); + { + for (char c = 0; c < 4; ++c) { + if (c == run.nucl || c == nextRun.nucl) { + continue; + } + + HKMer kmer = previous_.GetHKMer(); + kmer <<= HRun((uint8_t)c, 1); + + if (is_good_function_(kmer)) { + StateBuilder builder(previous_, calcer_, context_); + builder.AddEvent(IonEvent(run.nucl, run.len, 0, true)); + builder.AddEvent(IonEvent(c, 0, 1, 
true)); + corrections.emplace(builder.Build()); + } + } + } + } else if (run.len > 2) { + for (char c = 0; c < 4; ++c) { + if (c == run.nucl) { + continue; + } + + HKMer kmer = previous_.GetHKMer(); + kmer <<= HRun(run.nucl, (uint8_t)(run.len - 1)); + kmer <<= HRun(c, 1); + kmer <<= HRun(run.nucl, 1); + + const unsigned maxLen = (unsigned)(run.len - 2); + for (unsigned i = 0; i < maxLen; ++i) { + kmer[K - 3].len = (i + 1) & 0x3F; + kmer[K - 1].len = (maxLen - i) & 0x3F; + if (is_good_function_(kmer)) { + StateBuilder builder(previous_, calcer_, context_); + builder.AddEvent(IonEvent(run.nucl, (char)(i + 2), (char)(i + 1), true)); + builder.AddEvent(IonEvent(c, (char)0, (char)1, true)); + builder.AddEvent( + IonEvent(run.nucl, (char)(maxLen - i), (char)(maxLen - i), true)); + corrections.emplace(builder.Build()); + } + } + } + } + } + + inline bool AddPossibleCorrections(std::priority_queue& corrections) { + const unsigned cursor = previous_.Position(); + const HRun run = context_.GetHRun(cursor); + bool found = false; + + if (is_good_function_(previous_.GetHKMer())) { + HRunSizeSearcher searcher(previous_.GetHKMer(), run, is_good_function_); + { + auto insertions = searcher.TryFindInsertions((char)kMaxInDel); + for (const auto& insertion : insertions) { + if (insertion.is_to_good_correction_) { + StateBuilder builder(previous_, calcer_, context_); + builder.AddEvent(insertion); + corrections.emplace(builder.Build()); + found = true; + } + } + } + + { + auto deletions = searcher.TryFindAllDeletions((const char)std::max((int)run.len, 1)); + if (deletions.size()) { + for (const auto& deletion : deletions) { + const uint8_t restSize = + (uint8_t)(deletion.overserved_size_ - deletion.fixed_size_); + if (restSize <= kMaxInDel) { + StateBuilder builder(previous_, calcer_, context_); + builder.AddEvent(deletion); + corrections.emplace(builder.Build()); + } + + // Try insertion after part of hrun. Errors of type aaaaa -> aaa g + // aa + if (restSize > 1) { + StateBuilder indel_builder(previous_, + calcer_, + context_); + const IonEvent partDel = IonEvent( + deletion.nucl_, deletion.fixed_size_, deletion.fixed_size_, true); + indel_builder.AddEvent(partDel); + const TState state = indel_builder.Build(); + found |= AddAnotherNuclInsertions(HRun(deletion.nucl_, restSize), + state, corrections); + } + } + found = true; + } + } + + if (!found) { + found |= AddAnotherNuclInsertions(run, previous_, corrections); + + int read_size = (int)context_.GetRead().size(); + const int next_cursor = cursor + run.len; + + if (next_cursor >= read_size) { + return found; + } + const HRun next_run = context_.GetHRun((size_t)next_cursor); + + // try full deletion of hrun. 
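// A full deletion is encoded below as an IonEvent whose fixed_size_ is 0:
// "this run should not be in the read at all". The standalone sketch that
// follows illustrates that encoding; MiniEvent and kNucls are simplified,
// hypothetical stand-ins for the HRun/IonEvent structs above, not the
// project's actual types.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct MiniEvent {
  uint8_t nucl;           // 0=A, 1=C, 2=G, 3=T
  uint8_t observed_size;  // run length as read from the machine
  uint8_t fixed_size;     // run length after the proposed correction
};

int main() {
  const char kNucls[] = "ACGT";
  const uint8_t nucl = 3, len = 4;  // observed run "TTTT"
  // Candidate corrections analogous to the branches of AddPossibleCorrections:
  std::vector<MiniEvent> candidates = {
      {nucl, len, len},               // keep the run unchanged
      {nucl, len, uint8_t(len - 1)},  // machine overcalled by one base
      {nucl, len, uint8_t(len + 1)},  // machine undercalled by one base
      {nucl, len, 0},                 // full deletion: the run is an artifact
  };
  for (const auto& e : candidates)
    std::cout << std::string(e.observed_size, kNucls[e.nucl]) << " -> '"
              << std::string(e.fixed_size, kNucls[e.nucl]) << "'\n";
}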
+ if (run.len <= kMaxFulldel) { + if (next_run.nucl != previous_.GetHKMer()[K - 1].nucl) { + StateBuilder builder(previous_, calcer_, context_); + builder.AddEvent(IonEvent(run.nucl, run.len, 0, true)); // full deletion + corrections.emplace(builder.Build()); + found = true; + } else { + { + StateBuilder builder(previous_, calcer_, context_); + builder.AddEvent(IonEvent(run.nucl, run.len, 0, true)); // full deletion + builder.AddEvent(IonEvent(next_run.nucl, next_run.len, 0, true)); // full deletion + corrections.emplace(builder.Build()); + } + { + StateBuilder builder(previous_, calcer_, context_); + builder.AddEvent(IonEvent(run.nucl, run.len, 0, true)); // full deletion + auto state = builder.Build(); + found |= AddAnotherNuclInsertions(next_run, state, corrections); + } + } + } + } + } else { + { + HKMer test = previous_.GetHKMer(); + HRun fixed = run; + fixed.len = (fixed.len + 1) & 0x3F; + test <<= fixed; + size_t local_cursor = cursor + run.len; + + for (unsigned i = 0; i < (K - 1); ++i) { + if (local_cursor >= context_.GetRead().size()) { + break; + } + const HRun cursorRun = context_.GetHRun(local_cursor); + test <<= cursorRun; + local_cursor += cursorRun.len; + + if (is_good_function_(test)) { + found = true; + StateBuilder builder(previous_, calcer_, context_); + builder.AddEvent(IonEvent(run.nucl, run.len, (char) (run.len + 1), false)); + corrections.emplace(builder.Build()); + break; + } + } + } + + if (run.len > 1) { + HKMer test = previous_.GetHKMer(); + HRun fixed = run; + fixed.len = (fixed.len - 1) & 0x3F; + test <<= fixed; + + size_t local_cursor = cursor + run.len; + + for (unsigned i = 0; i < (K - 1); ++i) { + if (local_cursor >= context_.GetRead().size()) { + break; + } + const HRun cursorRun = context_.GetHRun(local_cursor); + test <<= cursorRun; + local_cursor += cursorRun.len; + + if (is_good_function_(test)) { + found = true; + StateBuilder builder(previous_, calcer_, context_); + builder.AddEvent(IonEvent(run.nucl, run.len, (uint8_t)(run.len - 1), false)); + corrections.emplace(builder.Build()); + break; + } + } + } + } + return found; + } +}; +} // namespace correction +} // namespace hammer + +namespace std { +using namespace hammer::correction; + +template +struct less > { + bool operator()(const CorrectionState& left, + const CorrectionState& right) const { + return left.Penalty() < right.Penalty() || + (left.Penalty() == right.Penalty() && + left.Position() < right.Position()); + } +}; + +} // namespace std + +#endif // PROJECT_READ_CORRECTOR_INFO_H diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/reference.cpp spades-3.11.1+dfsg/src/projects/ionhammer/reference.cpp --- spades-3.10.1+dfsg/src/projects/ionhammer/reference.cpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/reference.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1 @@ +#include "reference.h" diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/reference.h spades-3.11.1+dfsg/src/projects/ionhammer/reference.h --- spades-3.10.1+dfsg/src/projects/ionhammer/reference.h 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/reference.h 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,59 @@ +// +// Created by Vasiliy Ershov on 10/07/16. 
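// The new read corrector explores candidate corrections best-first: the
// std::less specialization at the end of read_corrector_structs_new.h makes
// std::priority_queue::top() the state with the highest Penalty() (these are
// log-likelihood style scores, so higher appears to mean more plausible),
// breaking ties toward the state that has advanced furthest into the read.
// The sketch below mirrors that ordering together with the move-out-of-top
// trick from pop_queue; the Mini* names are illustrative, not the project's.
#include <iostream>
#include <queue>
#include <string>
#include <utility>
#include <vector>

struct MiniState {
  double penalty;    // higher = more plausible candidate
  unsigned pos;      // bases of the read already explained
  std::string read;  // corrected prefix so far
};

struct ByPenaltyThenPos {
  bool operator()(const MiniState& l, const MiniState& r) const {
    return l.penalty < r.penalty ||
           (l.penalty == r.penalty && l.pos < r.pos);
  }
};

// Move the top element out of a priority_queue instead of copying it,
// the same idea as pop_queue above.
template <class T, class Cmp>
T PopTop(std::priority_queue<T, std::vector<T>, Cmp>& q) {
  T result(std::move(const_cast<T&>(q.top())));
  q.pop();
  return result;
}

int main() {
  std::priority_queue<MiniState, std::vector<MiniState>, ByPenaltyThenPos> q;
  q.push({-1.5, 3, "ACG"});
  q.push({-0.2, 3, "ACC"});
  q.push({-0.2, 5, "ACCTT"});  // same penalty, further along: expanded first
  while (!q.empty()) {
    MiniState s = PopTop(q);
    std::cout << s.read << " (penalty " << s.penalty << ")\n";
  }
}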
+// + +#ifndef PROJECT_REFERENCE_H +#define PROJECT_REFERENCE_H +#include "utils/logger/log_writers.hpp" + +#include "hkmer.hpp" + +#include "io/reads/file_reader.hpp" +#include "io/reads/read_processor.hpp" +#include "kmer_helpers.h" + +#include +#include +#include +#include + +class TGenomReferenceOracle { + private: + const std::string FilePath; + HKMerSet ReferenceKMers; + + public: + TGenomReferenceOracle(const std::string& filePath) : FilePath(filePath) { + FillSet(ReferenceKMers, filePath.data()); + INFO("Reference kmers: " << ReferenceKMers.size()); + } + + bool IsGenomic(const hammer::HKMer& kmer) const { + return ReferenceKMers.count(kmer) > 0; + } + + void KMerSetStats(const HKMerSet& kmers, std::string setName) const { + INFO("Stats for " << setName); + + size_t total_genomic = ReferenceKMers.size(); + size_t total_set = kmers.size(); + + size_t set_genomic = 0; + + for (auto it = ReferenceKMers.cbegin(), et = ReferenceKMers.cend(); + it != et; ++it) { + if (kmers.count(*it) > 0) { + set_genomic += 1; + } + } + + long set_non_genomic = total_set - set_genomic; + + INFO("Set kmers: " << total_set); + INFO("Genomic: " << set_genomic << " (" + << ((double)set_genomic * 100.0 / (double)total_genomic) << "%)"); + INFO("NonGenomic: " << set_non_genomic); + } +}; + +#endif // PROJECT_REFERENCE_H diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/subcluster.cpp spades-3.11.1+dfsg/src/projects/ionhammer/subcluster.cpp --- spades-3.10.1+dfsg/src/projects/ionhammer/subcluster.cpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/subcluster.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -14,25 +14,42 @@ #include -#include #include +#include +#include "quality_metrics.h" +#include + +using namespace hammer; +using namespace hammer_config; +using namespace n_gamma_poisson_model; + +double TGenomicHKMersEstimator::GenerateLikelihood(const HKMer& from, + const HKMer& to) const { + double llGenerate = 0; + for (size_t i = 0; i < hammer::K; ++i) { + llGenerate += cluster_model_.ErrorLogLikelihood(from[i].len, to[i].len); + } + return llGenerate; +} -hammer::HKMer center(const KMerData &data, const std::vector& kmers) { +HKMer TGenomicHKMersEstimator::Center(const KMerData& data, + const std::vector& kmers) { hammer::HKMer res; namespace numeric = boost::numeric::ublas; for (unsigned i = 0; i < hammer::K; ++i) { numeric::matrix scores(4, 64, 0); for (size_t j = 0; j < kmers.size(); ++j) { - const hammer::KMerStat &k = data[kmers[j]]; - // FIXME: switch to MLE when we'll have use per-run quality values + const hammer::KMerStat& k = data[kmers[j]]; +// FIXME: switch to MLE when we'll have use per-run quality values #if 1 - scores(k.kmer[i].nucl, k.kmer[i].len) += double(k.count) * (1 - k.qual); + scores(k.kmer[i].nucl, k.kmer[i].len) += k.count * (1.0 - exp(k.qual)); #else for (unsigned n = 0; n < 4; ++n) for (unsigned l = 1; l < 64; ++l) - scores(n, l) += k.count * (n == k.kmer[i].nucl && l == k.kmer[i].len ? - log(1 - k.qual) : log(k.qual) - log(4*63 - 1)); + scores(n, l) += k.count * (n == k.kmer[i].nucl && l == k.kmer[i].len + ? 
log(1 - k.qual) + : log(k.qual) - log(4 * 63 - 1)); #endif } @@ -42,94 +59,210 @@ return res; } -bool assign(KMerData &kmer_data, const std::vector &cluster) { - hammer::HKMer c = center(kmer_data, cluster); - bool nonread = false; +HKMer TGenomicHKMersEstimator::ByPosteriorQualCenter( + const std::vector& kmers) { + hammer::HKMer res; + namespace numeric = boost::numeric::ublas; - size_t idx = kmer_data.seq_idx(c); - if (kmer_data[idx].kmer != c) { -# pragma omp critical - { - idx = kmer_data.push_back(hammer::KMerStat(0, c, 1.0)); + for (unsigned i = 0; i < hammer::K; ++i) { + numeric::matrix scores(4, 64, 0); + for (size_t j = 0; j < kmers.size(); ++j) { + const hammer::KMerStat& kmerStat = data_[kmers[j]]; + scores(kmerStat.kmer[i].nucl, kmerStat.kmer[i].len) += + kmerStat.count * exp(cluster_model_.GenomicLogLikelihood(kmerStat)); } - nonread = true; - } - for (size_t j = 0; j < cluster.size(); ++j) - kmer_data[cluster[j]].changeto = unsigned(idx); + res[i] = hammer::iontorrent::consensus(scores).first; + } - return nonread; + return res; } -void dump(const KMerData &kmer_data, const std::vector &cluster) { - std::cerr << "{ \n\"kmers\": {"; - for (size_t j = 0; j < cluster.size(); ++j) { - if (j > 0) std::cerr << ", "; - std::cerr << '"' << kmer_data[cluster[j]].kmer << "\": [" - << kmer_data[cluster[j]].count << ", " - << 1 - kmer_data[cluster[j]].qual << "] \n"; - } - std::cerr << "}, \"center\": { \"status\": "; - hammer::HKMer c = center(kmer_data, cluster); - size_t idx = kmer_data.seq_idx(c); - if (kmer_data[idx].kmer == c) { - std::cerr << "\"ok\", \"center\": \"" << c << "\"}\n"; - } else { - std::cerr << "\"not\", \"kmer\": \"" << kmer_data[idx].kmer - << "\", \"center\": \"" << c << "\"}\n"; +void TGenomicHKMersEstimator::ProceedCluster(std::vector& cluster) { + std::sort(cluster.begin(), cluster.end(), CountCmp(data_)); + + std::vector qualities; + std::vector candidates; + + for (size_t i = 0; i < cluster.size(); ++i) { + + const auto idx = cluster[i]; + const auto& stat = data_[idx]; + + if ((uint)stat.count < cfg::get().subcluster_min_count && (i > 0)) { + break; + } + + const double qual = cluster_model_.StatTransform(stat); + const double posterior = cluster_model_.GenomicLogLikelihood(stat); + + if (!std::isfinite(posterior)) { + continue; + } + + if (posterior > cfg::get().subcluster_threshold || i == 0) { + candidates.push_back(idx); + qualities.push_back(qual); + } } - std::cerr << "}" << std::endl; -} -size_t subcluster(KMerData &kmer_data, std::vector &cluster) { - size_t nonread = 0; + std::vector distOneBestQualities(qualities); + std::vector countThreshold(qualities.size()); - // First, sort the kmer indicies wrt count - std::sort(cluster.begin(), cluster.end(), CountCmp(kmer_data)); - // The number of subclusters for now is really dumb: we assume that the quality should be 1. 
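// TGenomicHKMersEstimator::Center (new code above) builds a consensus h-mer
// position by position: each member k-mer votes for its (nucleotide, run
// length) cell with weight count * P(correct), and the argmax cell wins.
// A compact sketch of that per-position vote, using plain arrays instead of
// boost::numeric::ublas; Pos/Consensus are hypothetical names.
#include <array>
#include <iostream>
#include <utility>
#include <vector>

// One flow-space position of an h-mer: (nucleotide, homopolymer length).
struct Pos { int nucl; int len; };

// Weighted vote per (nucl, len) cell; the argmax mirrors the role of
// hammer::iontorrent::consensus over the 4 x 64 score matrix.
Pos Consensus(const std::vector<std::pair<Pos, double>>& votes) {
  std::array<std::array<double, 64>, 4> scores{};  // zero-initialized
  for (const auto& v : votes)
    scores[v.first.nucl][v.first.len] += v.second;
  Pos best{0, 1};
  for (int n = 0; n < 4; ++n)
    for (int l = 1; l < 64; ++l)
      if (scores[n][l] > scores[best.nucl][best.len]) best = {n, l};
  return best;
}

int main() {
  // Three k-mers voting on one position: "TT" twice with high weight, "TTT" once.
  std::vector<std::pair<Pos, double>> votes = {
      {{3, 2}, 9.5}, {{3, 2}, 7.0}, {{3, 3}, 4.2}};
  Pos c = Consensus(votes);
  std::cout << "consensus: nucl=" << c.nucl << " len=" << c.len << "\n";  // T, 2
}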
- size_t k = 0; - for (size_t i = 0; i < cluster.size(); ++i) - k += kmer_data[cluster[i]].qual < cfg::get().center_qual_threshold; - - if (k <= 1) { -#if 0 - dump(kmer_data, cluster); -#endif - return assign(kmer_data, cluster); + std::vector kmerErrorRates; + + { + for (size_t i = 0; i < candidates.size(); ++i) { + const auto& centerCandidate = data_[candidates[i]]; + kmerErrorRates.push_back(exp(GenerateLikelihood(centerCandidate.kmer, centerCandidate.kmer))); + + for (size_t j = 0; j < i; ++j) { + const auto& parent = data_[candidates[j]]; + + if (cfg::get().subcluster_filter_by_count_enabled) { + const double mult = pow(cfg::get().subcluster_count_mult, hammer::hkmerDistance(parent.kmer, centerCandidate.kmer).levenshtein_); + countThreshold[i] += mult * parent.count / kmerErrorRates[j]; + } + + if (hammer::hkmerDistance(parent.kmer, centerCandidate.kmer).levenshtein_ <= 1) { + distOneBestQualities[i] = std::min(distOneBestQualities[i], qualities[j]); + } + } + + auto distOneParents = FindDistOneFullDels(centerCandidate); + for (auto distOneParent : distOneParents) { + const auto& parent = data_[distOneParent]; + distOneBestQualities[i] = std::min(distOneBestQualities[i], cluster_model_.StatTransform(parent)); + + if (cfg::get().subcluster_filter_by_count_enabled) { + countThreshold[i] += cfg::get().subcluster_count_mult * parent.count / 10 / exp(GenerateLikelihood(parent.kmer, parent.kmer)); + } + } + } } - // Find the closest center - std::vector > idx(k, std::vector()); - for (size_t i = 0; i < k; ++i) - idx[i].push_back(cluster[i]); - for (size_t i = k; i < cluster.size(); ++i) { - unsigned dist = std::numeric_limits::max(); - size_t cidx = k; - hammer::HKMer kmerx = kmer_data[cluster[i]].kmer; - for (size_t j = 0; j < k; ++j) { - hammer::HKMer kmery = kmer_data[cluster[j]].kmer; - unsigned cdist = hammer::distanceHKMer(kmerx.begin(), kmerx.end(), - kmery.begin(), kmery.end()); - if (cdist < dist) { - cidx = j; - dist = cdist; - } - } - VERIFY(cidx < k); - idx[cidx].push_back(cluster[i]); - } - - for (auto it = idx.begin(), et = idx.end(); it != et; ++it) { - const std::vector &subcluster = *it; - - if (assign(kmer_data, subcluster)) { - nonread += 1; -#if 0 - dump(kmer_data, subcluster); -#endif + std::vector centerCandidates; + + const double qualMult = cfg::get().subcluster_qual_mult; + // const double alpha = cfg::get().dist_one_subcluster_alpha; + + for (size_t i = 0; i < candidates.size(); ++i) { + const auto& candidate = data_[candidates[i]]; + + //don't subcluster low coverage hkmers with long runs, we can't distinguish dist-one error from noise. 
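// The count filter below keeps a candidate center only if its count clears
// roughly the 99th percentile of what correction noise from nearby dominant
// k-mers could produce (countThreshold). The sketch derives such a cutoff from
// the standard Poisson tail identity P(N <= k) = gamma_q(k + 1, lambda); it
// illustrates the idea only and is not the exact gamma_q_inva computation
// used above. UpperNoiseCount is a hypothetical name.
#include <boost/math/special_functions/gamma.hpp>
#include <iostream>

// Smallest k such that a Poisson noise source with mean `lambda` stays at or
// below k with probability >= `confidence`.
int UpperNoiseCount(double lambda, double confidence) {
  int k = 0;
  while (boost::math::gamma_q(double(k + 1), lambda) < confidence) ++k;
  return k;
}

int main() {
  const double expected_noise = 3.0;  // noise count expected for a candidate
  const int cutoff = UpperNoiseCount(expected_noise, 0.99);
  std::cout << "subcluster only if count > " << cutoff << "\n";  // prints 8
}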
+ if (cfg::get().subcluster_filter_by_count_enabled) { + double upperCountThreshold = boost::math::gamma_q_inva(countThreshold[i] + 1, 0.99) - 1; + if (candidate.count <= upperCountThreshold) { + continue; + } } + + if (i != 0 && distOneBestQualities[i] * qualMult < qualities[i]) { + if (!cluster_model_.IsHighQuality(candidate)) { + continue; + } + } + centerCandidates.push_back(candidates[i]); } - return nonread; + if (!centerCandidates.size()) { + return; + } + + // First consensus (it's also filtering step) + if (consensus_type_ != CenterType::COUNT_ARGMAX) { + std::set centerCandidatesSet; + const size_t k = centerCandidates.size(); + // Find the closest center + std::vector centralKmers; + std::vector > subclusters(k, std::vector()); + + for (size_t i = 0; i < k; ++i) { + auto centerId = centerCandidates[i]; + centralKmers.push_back(data_[centerId].kmer); + } + + for (size_t i = 0; i < cluster.size(); ++i) { + double dist = std::numeric_limits::infinity(); + size_t cidx = k; + size_t count = 0; + + size_t kmerIdx = cluster[i]; + hammer::HKMer kmerx = data_[kmerIdx].kmer; + + for (size_t j = 0; j < k; ++j) { + hammer::HKMer kmery = centralKmers[j]; + double cdist = hammer::hkmerDistance(kmerx, kmery).levenshtein_; + if (cdist < dist || (cdist == dist && count < (size_t)data_[kmery].count)) { + cidx = j; + dist = cdist; + count = data_[kmery].count; + } + } + VERIFY(cidx < k); + subclusters[cidx].push_back(cluster[i]); + } + + for (size_t i = 0; i < k; ++i) { + const auto& subcluster = subclusters[i]; + + HKMer center; + if (consensus_type_ == CenterType::CONSENSUS) { + center = Center(data_, subcluster); + } else if (consensus_type_ == CenterType::BY_POSTERIOR_QUALITY) { + center = ByPosteriorQualCenter(subcluster); + } else { + INFO("Unsupported center type: will use mean instead"); + center = Center(data_, subcluster); + } + + auto centerIdx = data_.checking_seq_idx(center); + + if ((k == 1 && centerIdx != -1ULL) || (centerIdx == centerCandidates[i])) { + centerCandidatesSet.insert(centerIdx); + } + } + + centerCandidates = std::vector(centerCandidatesSet.begin(), + centerCandidatesSet.end()); + } + + std::vector posteriorQualities; + // Now let's "estimate" quality + std::vector distOneGoodCenters(centerCandidates.size()); + + for (uint k = 0; k < centerCandidates.size(); ++k) { + const auto idx = centerCandidates[k]; + const KMerStat& centerCandidate = data_[idx]; + for (uint j = 0; j < centerCandidates.size(); ++j) { + if (hammer::hkmerDistance(centerCandidate.kmer, data_[centerCandidates[j]].kmer).hamming_ == 1) { + distOneGoodCenters[k] = 1; + } + } + double quality = cluster_model_.GenomicLogLikelihood(centerCandidate); + quality = std::isfinite(quality) ? 
quality : -1000; + posteriorQualities.push_back(max(quality, -1000.0)); + } + + for (size_t i = 0; i < posteriorQualities.size(); ++i) { + const auto idx = centerCandidates[i]; + data_[idx].lock(); + const bool wasGood = data_[idx].good(); + data_[idx].posterior_genomic_ll = (float)max(posteriorQualities[i], (double)data_[idx].posterior_genomic_ll); + data_[idx].dist_one_subcluster |= distOneGoodCenters[i]; + data_[idx].unlock(); + if (!wasGood && data_[idx].good()) { +#pragma omp atomic + GoodKmers++; + } + if (!wasGood && data_[idx].skip()) { +#pragma omp atomic + SkipKmers++; + } + if (wasGood) { +#pragma omp atomic + ReasignedByConsenus++; + } + } } diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/subcluster.hpp spades-3.11.1+dfsg/src/projects/ionhammer/subcluster.hpp --- spades-3.10.1+dfsg/src/projects/ionhammer/subcluster.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/subcluster.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -8,16 +8,102 @@ #ifndef __SUBCLUSTER_HPP__ #define __SUBCLUSTER_HPP__ -#include -#include - #include "hkmer.hpp" +#include "kmer_data.hpp" +#include "quality_thresholds_estimator.h" +#include "reference.h" + +#include +#include +#include "gamma_poisson_model.hpp" +#include "normal_quality_model.hpp" +#include "utils/logger/logger.hpp" + +namespace hammer { + +class ClusteringQuality; + + +class TGenomicHKMersEstimator { + private: + KMerData& data_; + const n_normal_model::NormalClusterModel& cluster_model_; + hammer_config::CenterType consensus_type_; + size_t GoodKmers = 0; + size_t SkipKmers = 0; + size_t ReasignedByConsenus = 0; + + public: + TGenomicHKMersEstimator(KMerData& data, const n_normal_model::NormalClusterModel& clusterModel, + hammer_config::CenterType consensusType = hammer_config::CenterType::CONSENSUS) + : data_(data), cluster_model_(clusterModel), consensus_type_(consensusType) {} + + ~TGenomicHKMersEstimator() { + INFO("Good kmers: " << GoodKmers); + INFO("Perfect kmers: " << SkipKmers); + INFO("Reasigned by consensus: " << ReasignedByConsenus); + } + + // we trying to find center candidate, not error candidates. 
+ // so we try insert in every "center" po + std::vector FindDistOneFullDels(const KMerStat& kmerStat) const { + std::vector indices; + const auto& source = kmerStat.kmer; + for (uint k = 1; k < K; ++k) { + auto fixed = source; + for (uint j = k + 1; j < K; ++j) { + fixed[j] = fixed[j - 1]; + } + + auto prev = source[k - 1]; + auto next = source[k]; + + for (int i = 0; i < 4; ++i) { + if (i == prev.nucl || i == next.nucl) { + continue; + } + fixed[k] = HomopolymerRun((uint8_t)i, 1); + auto idx = data_.checking_seq_idx(fixed); + if (idx != -1ULL) { + indices.push_back(idx); + } + } + } + return indices; + } + + void ProceedCluster(std::vector& cluster); + + static size_t GetCenterIdx(const KMerData& kmerData, + const std::vector& cluster) { + if (cluster.size() == 1) { + return cluster[0]; + } + + hammer::HKMer center = Center(kmerData, cluster); + size_t idx = kmerData.checking_seq_idx(center); + + if (idx == -1ULL) { + double bestQual = kmerData[cluster[0]].qual; + idx = cluster[0]; + + for (auto i : cluster) { + if (kmerData[i].qual < bestQual) { + bestQual = kmerData[i].qual; + idx = i; + } + } + } + return idx; + } + + double GenerateLikelihood(const HKMer& from, const HKMer& to) const; -class KMerData; + static HKMer Center(const KMerData& data, const std::vector& kmers); -size_t subcluster(KMerData &kmer_data, std::vector &cluster); + HKMer ByPosteriorQualCenter(const std::vector& kmers); +}; -// for debug purposes -hammer::HKMer center(const KMerData &data, const std::vector& kmers); +} // namespace hammer -#endif // __SUBCLUSTER_HPP__ +#endif // __SUBCLUSTER_HPP__ diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/thread_utils.h spades-3.11.1+dfsg/src/projects/ionhammer/thread_utils.h --- spades-3.10.1+dfsg/src/projects/ionhammer/thread_utils.h 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/thread_utils.h 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,57 @@ +// +// Created by Vasiliy Ershov on 02/12/2016. 
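// FindDistOneFullDels above enumerates potential parent h-mers of a cluster
// center: it inserts a new single-base run at every interior position k
// (with a nucleotide distinct from both neighbours, so runs stay maximal),
// shifting the tail and dropping the last run to keep the h-mer K runs long.
// A sketch over a plain run vector; Run and DistOneFullDels are illustrative
// names, not the project's types.
#include <iostream>
#include <string>
#include <vector>

struct Run { char nucl; int len; };

std::vector<std::vector<Run>> DistOneFullDels(const std::vector<Run>& src) {
  std::vector<std::vector<Run>> out;
  const std::string nucls = "ACGT";
  for (size_t k = 1; k < src.size(); ++k) {
    for (char c : nucls) {
      // A run adjacent to an identical nucleotide would merge, so skip it.
      if (c == src[k - 1].nucl || c == src[k].nucl) continue;
      std::vector<Run> fixed(src.begin(), src.begin() + k);
      fixed.push_back({c, 1});                               // inserted run
      fixed.insert(fixed.end(), src.begin() + k, src.end() - 1);  // drop last
      out.push_back(fixed);
    }
  }
  return out;
}

int main() {
  std::vector<Run> hkmer = {{'A', 2}, {'C', 1}, {'T', 3}};
  for (const auto& v : DistOneFullDels(hkmer)) {
    for (const auto& r : v) std::cout << std::string(r.len, r.nucl);
    std::cout << "\n";  // AAGC, AATC, AACA, AACG
  }
}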
+// + +#ifndef PROJECT_THREAD_UTILS_H +#define PROJECT_THREAD_UTILS_H + +namespace n_computation_utils { + +template +class ParallelStatisticsCalcer { + private: + size_t num_threads_; + + public: + ParallelStatisticsCalcer(size_t num_threads) : num_threads_(num_threads) {} + + template + AdditiveStat Calculate(size_t n, std::function&& factory, + TFunction&& func) const { + std::vector aggregated_stats; + for (uint i = 0; i < num_threads_; ++i) { + aggregated_stats.push_back(factory()); + } + +#pragma omp parallel for num_threads(num_threads_) + for (size_t i = 0; i < n; ++i) { + const auto tid = omp_get_thread_num(); + func(aggregated_stats[tid], i); + } + + for (size_t i = 1; i < aggregated_stats.size(); ++i) { + aggregated_stats[0] += aggregated_stats[i]; + } + return aggregated_stats[0]; + } +}; + +template +class TAdditiveStatisticsCalcer { + private: + const std::vector& stats_; + size_t num_threads_; + + public: + TAdditiveStatisticsCalcer(const std::vector& stats, size_t num_threads) + : stats_(stats), num_threads_(num_threads) {} + + TAdditiveStat Calculate(std::function&& factory) const { + ParallelStatisticsCalcer parallel_calcer(num_threads_); + return parallel_calcer.Calculate( + stats_.size(), std::move(factory), + [&](TAdditiveStat& stat, size_t i) { stat.Add(stats_[i]); }); + } +}; +} // namespace n_computation_utils +#endif // PROJECT_THREAD_UTILS_H diff -Nru spades-3.10.1+dfsg/src/projects/ionhammer/valid_hkmer_generator.hpp spades-3.11.1+dfsg/src/projects/ionhammer/valid_hkmer_generator.hpp --- spades-3.10.1+dfsg/src/projects/ionhammer/valid_hkmer_generator.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/ionhammer/valid_hkmer_generator.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -12,13 +12,13 @@ #include #include -#include "io/reads/single_read.hpp" #include "HSeq.hpp" +#include "io/reads/single_read.hpp" -#include #include +#include -template +template class ValidHKMerGenerator { public: /** @@ -29,11 +29,9 @@ */ // FIXME: Switch to delegating ctor. explicit ValidHKMerGenerator(const io::SingleRead &read, - unsigned bad_quality_threshold = 2) { - Reset(read.GetSequenceString().data(), - read.GetQualityString().data(), - read.GetSequenceString().size(), - bad_quality_threshold); + unsigned bad_quality_threshold = 5) { + Reset(read.GetSequenceString().data(), read.GetQualityString().data(), + read.GetSequenceString().size(), bad_quality_threshold); } /** @@ -43,21 +41,26 @@ * nucleotides with quality lower the threshold from the ends of the * read. 
*/ - explicit ValidHKMerGenerator(const char *seq, const char *qual, - size_t len, - unsigned bad_quality_threshold = 2) { + explicit ValidHKMerGenerator(const char *seq, const char *qual, size_t len, + unsigned bad_quality_threshold = 5) { Reset(seq, qual, len, bad_quality_threshold); } ValidHKMerGenerator() - : kmer_(), seq_(0), qual_(0), - pos_(-1), nlen_(-1), end_(-1), len_(0), - correct_probability_(1), bad_quality_threshold_(2), - has_more_(false), first_(true) {} - - void Reset(const char *seq, const char *qual, - size_t len, - unsigned bad_quality_threshold = 2) { + : kmer_(), + seq_(0), + qual_(0), + pos_(-1), + nlen_(-1), + end_(-1), + len_(0), + correct_probability_(0), + bad_quality_threshold_(5), + has_more_(false), + first_(true) {} + + void Reset(const char *seq, const char *qual, size_t len, + unsigned bad_quality_threshold = 5) { kmer_ = hammer::HSeq(); seq_ = seq; qual_ = qual; @@ -65,12 +68,14 @@ nlen_ = -1; end_ = -1; len_ = len; - correct_probability_ = 1.0; + correct_probability_ = 0.0; bad_quality_threshold_ = bad_quality_threshold; has_more_ = true; first_ = true; last_ = false; probs_.resize(0); + runlens_.resize(0); + length = 0; TrimBadQuality(); Next(); @@ -80,47 +85,35 @@ * @result true if Next() succeed while generating new k-mer, false * otherwise. */ - bool HasMore() const { - return has_more_; - } + bool HasMore() const { return has_more_; } /** * @result last k-mer generated by Next(). */ - const hammer::HSeq& kmer() const { - return kmer_; - } + const hammer::HSeq &kmer() const { return kmer_; } /** * @result last k-mer position in initial read. */ - size_t pos() const { - return pos_; - } + size_t pos() const { return pos_; } - size_t nlen() const { - return nlen_; - } + size_t nlen() const { return nlen_; } /** * @result number of nucleotides trimmed from left end */ - size_t trimmed_left() const { - return beg_; - } + size_t trimmed_left() const { return beg_; } /** * @result number of nucleotides trimmed from right end */ - size_t trimmed_right() const { - return len_ - end_; - } + size_t trimmed_right() const { return len_ - end_; } /** * @result probability that last generated k-mer is correct. */ double correct_probability() const { - return correct_probability_; + return exp(correct_probability_ / (double)length); } /** @@ -128,27 +121,30 @@ * if succeeded. You can access k-mer read with kmer(). */ void Next(); + private: void TrimBadQuality(); double Prob(unsigned qual) { - return (qual < 3 ? 0.25 : 1 - pow(10.0, -(qual / 10.0))); - // return Globals::quality_probs[qual]; + return max(1 - pow(10.0, -(qual / 10.0)), + 1e-40); //(qual < 3 ? 
0.25 : 1 - pow(10.0, -(qual / 10.0))); + // return Globals::quality_probs[qual]; } unsigned GetQual(size_t pos) { if (pos >= len_) { - return 2; + return 1; } else { return qual_[pos]; } } hammer::HSeq kmer_; - const char* seq_; - const char* qual_; + const char *seq_; + const char *qual_; size_t pos_; size_t nlen_; + size_t length = 0; size_t beg_; size_t end_; size_t len_; @@ -158,30 +154,29 @@ bool first_; bool last_; std::deque probs_; + std::deque runlens_; // Disallow copy and assign - ValidHKMerGenerator(const ValidHKMerGenerator&) = delete; - void operator=(const ValidHKMerGenerator&) = delete; + ValidHKMerGenerator(const ValidHKMerGenerator &) = delete; + void operator=(const ValidHKMerGenerator &) = delete; }; -template +template void ValidHKMerGenerator::TrimBadQuality() { pos_ = 0; if (qual_) for (; pos_ < len_; ++pos_) { - if (GetQual(pos_) >= bad_quality_threshold_) - break; + if (GetQual(pos_) >= bad_quality_threshold_) break; } beg_ = pos_; end_ = len_; if (qual_) for (; end_ > pos_; --end_) { - if (GetQual(end_ - 1) >= bad_quality_threshold_) - break; - } + if (GetQual(end_ - 1) >= bad_quality_threshold_) break; + } } -template +template void ValidHKMerGenerator::Next() { if (last_) { has_more_ = false; @@ -190,7 +185,8 @@ size_t toadd = (first_ ? kK : 1); char pnucl = -1; - double cprob = 1.0; + double cprob = 0.0; + double len = 0.0; nlen_ = 0; // Build the flow-space kmer looking over homopolymer streches. while (toadd) { @@ -200,7 +196,7 @@ if (pos_ >= end_) { last_ = true; if (toadd > 0) { - has_more_ = false; + has_more_ = false; } return; } @@ -212,12 +208,17 @@ pnucl = -1; pos_ += nlen_ + 1; nlen_ = 0; - correct_probability_ = 1.0; + len = 0; + length = 0; + correct_probability_ = 0.0; probs_.resize(0); + runlens_.resize(0); continue; } - if (qual_) - cprob *= Prob(GetQual(pos_ + nlen_)); + if (qual_) { + cprob += log(Prob(GetQual(pos_ + nlen_))); + ++len; + } // If current nucl differs from previous nucl then either we're starting the // k-mer or just finished the homopolymer run. 
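// The surrounding hunks replace the running product of per-base correctness
// probabilities (correct_probability_ *= cprob, divided by probs_[0] on
// eviction) with log-space sums plus a parallel deque of per-run lengths, so
// correct_probability() can return exp(sum / length) -- a per-base geometric
// mean that cannot underflow on long reads. A self-contained sketch of that
// bookkeeping; WindowProb is a hypothetical name.
#include <cmath>
#include <deque>
#include <iostream>

// Sliding window over per-run log-probabilities: push a run, evict the oldest
// once `capacity` runs are held, expose the per-base geometric mean.
class WindowProb {
  std::deque<double> logs_;  // log-prob contributed by each run in the window
  std::deque<int> lens_;     // bases contributed by each run
  double sum_ = 0.0;
  int length_ = 0;
  size_t capacity_;

 public:
  explicit WindowProb(size_t capacity) : capacity_(capacity) {}

  void PushRun(double log_prob, int run_len) {
    if (logs_.size() == capacity_) {  // evict, as probs_/runlens_ do above
      sum_ -= logs_.front(); length_ -= lens_.front();
      logs_.pop_front();     lens_.pop_front();
    }
    logs_.push_back(log_prob); lens_.push_back(run_len);
    sum_ += log_prob; length_ += run_len;
  }

  double CorrectProbability() const {  // exp(sum / length), per-base mean
    return length_ ? std::exp(sum_ / length_) : 0.0;
  }
};

int main() {
  WindowProb w(3);                    // window of 3 homopolymer runs
  w.PushRun(std::log(0.99) * 2, 2);   // "AA" at ~Q20
  w.PushRun(std::log(0.90), 1);       // one shaky base
  w.PushRun(std::log(0.999) * 3, 3);  // "TTT" at high quality
  std::cout << "per-base correctness: " << w.CorrectProbability() << "\n";
}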
@@ -225,14 +226,21 @@ // If previous nucl was valid then finish the current homopolymer run if (pnucl != -1) { toadd -= 1; - correct_probability_ *= cprob; + + correct_probability_ += cprob; + length += (size_t)len; + if (probs_.size() == kK) { - correct_probability_ /= probs_[0]; + correct_probability_ -= probs_[0]; + length -= (size_t)runlens_[0]; probs_.pop_front(); + runlens_.pop_front(); } probs_.push_back(cprob); - cprob = 1.0; + runlens_.push_back(len); + cprob = 0.0; + len = 0; } pnucl = cnucl; } diff -Nru spades-3.10.1+dfsg/src/projects/mph_test/main.cpp spades-3.11.1+dfsg/src/projects/mph_test/main.cpp --- spades-3.10.1+dfsg/src/projects/mph_test/main.cpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mph_test/main.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -7,8 +7,8 @@ #include "utils/logger/log_writers.hpp" #include "utils/segfault_handler.hpp" -#include "utils/indices/perfect_hash_map.hpp" -#include "utils/mph_index/kmer_index_builder.hpp" +#include "utils/ph_map/perfect_hash_map.hpp" +#include "utils/kmer_mph/kmer_index_builder.hpp" #include "io/reads/read_processor.hpp" #include "io/reads/io_helper.hpp" @@ -30,18 +30,17 @@ attach_logger(lg); } -class SimplePerfectHashMap : public debruijn_graph::KeyIteratingMap { - using base = debruijn_graph::KeyIteratingMap; +class SimplePerfectHashMap : public utils::KeyIteratingMap { + using base = utils::KeyIteratingMap; public: SimplePerfectHashMap(size_t k, const std::string &workdir) : base(k, workdir) {} }; -class ParallelSortingSplitter : public KMerSortingSplitter { +class ParallelSortingSplitter : public utils::KMerSortingSplitter { using Seq = RtSeq; std::vector files_; - unsigned nthreads_; size_t read_buffer_size_; class BufferFiller { @@ -78,17 +77,15 @@ public: - ParallelSortingSplitter(const std::string &workdir, unsigned K, unsigned nthreads, size_t read_buffer_size = 0) - : KMerSortingSplitter(workdir, K), nthreads_(nthreads), read_buffer_size_(read_buffer_size) {} + ParallelSortingSplitter(const std::string &workdir, unsigned K, size_t read_buffer_size = 0) + : KMerSortingSplitter(workdir, K), read_buffer_size_(read_buffer_size) {} void push_back(const std::string &filename) { files_.push_back(filename); } - path::files_t Split(size_t num_files) override { - INFO("Splitting kmer instances into " << num_files << " buckets. 
This might take a while."); - - path::files_t out = PrepareBuffers(num_files, nthreads_, read_buffer_size_); + fs::files_t Split(size_t num_files, unsigned nthreads) override { + fs::files_t out = PrepareBuffers(num_files, nthreads, read_buffer_size_); size_t n = 10; BufferFiller filler(*this, K()); @@ -96,7 +93,7 @@ INFO("Processing " << file); auto irs = io::EasyStream(file, true, true); while (!irs->eof()) { - hammer::ReadProcessor rp(nthreads_); + hammer::ReadProcessor rp(nthreads); rp.Run(*irs, filler); DumpBuffers(out); VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced"); @@ -116,7 +113,7 @@ }; int main(int argc, char* argv[]) { - perf_counter pc; + utils::perf_counter pc; srand(42); srandom(42); @@ -160,7 +157,7 @@ INFO("# of threads to use: " << nthreads); SimplePerfectHashMap index(K, workdir); - ParallelSortingSplitter splitter(workdir, K, nthreads, read_buffer_size); + ParallelSortingSplitter splitter(workdir, K, read_buffer_size); if (options.count("dataset")) { io::DataSet<> idataset; idataset.load(dataset); @@ -170,7 +167,7 @@ for (const auto& s : input) splitter.push_back(s); } - KMerDiskCounter counter(workdir, splitter); + utils::KMerDiskCounter counter(workdir, splitter); counter.CountAll(16, nthreads); INFO("K-mer counting done, kmers saved to " << counter.GetFinalKMersFname()); } catch (std::string const &s) { diff -Nru spades-3.10.1+dfsg/src/projects/mts/annotation.hpp spades-3.11.1+dfsg/src/projects/mts/annotation.hpp --- spades-3.10.1+dfsg/src/projects/mts/annotation.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/annotation.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -20,11 +20,8 @@ ContigAnnotation Parse(const std::string& s) const { ContigAnnotation annotation; - stringstream ss(s); + std::istringstream ss(s); ss >> annotation.first; - string delim; - ss >> delim; - VERIFY(delim == ":"); while (true) { bin_id bin; ss >> bin; @@ -67,7 +64,7 @@ AnnotationOutStream& operator <<(const ContigAnnotation& annotation) { inner_stream_ << annotation.first; - string delim = " : "; + string delim = "\t"; for (bin_id bin : annotation.second) { inner_stream_ << delim << bin; delim = " "; @@ -121,14 +118,14 @@ if (!edge_annotation_.count(e)) { return {}; } - const auto& annotation = get(edge_annotation_, e); + const auto& annotation = utils::get(edge_annotation_, e); return vector(annotation.begin(), annotation.end()); } set RelevantBins(const vector& path) const { set answer; for (EdgeId e : path) { - insert_all(answer, Annotation(e)); + utils::insert_all(answer, Annotation(e)); } return answer; } @@ -171,7 +168,7 @@ for (const bin_id& bin : bins) { if (interesting_bins_.count(bin)) { answer.push_back(bin); - } + } } return answer; } @@ -186,7 +183,7 @@ splits_annotation_stream >> contig_annotation; auto bins = FilterInteresting(contig_annotation.second); if (!bins.empty()) { - insert_all(annotation_map[contig_annotation.first], bins); + utils::insert_all(annotation_map[contig_annotation.first], bins); } ++cnt; } @@ -226,11 +223,15 @@ return answer; }; + static bool IsSpurious(size_t colored_len, size_t full_len) { + return math::ls(double(colored_len) / double(full_len), 0.3); + } + void FilterSpuriousInfo(map>& coloring) const { for (auto& edge_info : coloring) { size_t edge_len = gp_.g.length(edge_info.first); for (auto color_it = edge_info.second.begin(); color_it != edge_info.second.end(); ) { - if (math::ls(double(color_it->second) / double(edge_len), 0.3)) { + if (IsSpurious(color_it->second, edge_len)) { 
edge_info.second.erase(color_it++); } else { ++color_it; @@ -265,8 +266,15 @@ } } set answer; + using ColorInfo = std::pair; + auto it = std::max_element(path_colors.begin(), path_colors.end(), + [](const ColorInfo& p1, const ColorInfo& p2) { + return p1.second < p2.second; + }); + if (it != path_colors.end()) + answer.insert(it->first); for (auto color_info : path_colors) { - if (math::gr(double(color_info.second) / double(total_len), 0.3)) { + if (!IsSpurious(color_info.second, total_len)) { answer.insert(color_info.first); } } diff -Nru spades-3.10.1+dfsg/src/projects/mts/CMakeLists.txt spades-3.11.1+dfsg/src/projects/mts/CMakeLists.txt --- spades-3.10.1+dfsg/src/projects/mts/CMakeLists.txt 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/CMakeLists.txt 2017-09-28 09:05:13.000000000 +0000 @@ -29,7 +29,8 @@ add_executable(prop_binning propagate.cpp read_binning.cpp - prop_binning.cpp) + prop_binning.cpp + gzstream/gzstream.C) target_link_libraries(prop_binning common_modules nlopt BamTools ssw getopt_pp ${COMMON_LIBRARIES}) diff -Nru spades-3.10.1+dfsg/src/projects/mts/Common.snake spades-3.11.1+dfsg/src/projects/mts/Common.snake --- spades-3.10.1+dfsg/src/projects/mts/Common.snake 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/Common.snake 2017-09-28 09:05:13.000000000 +0000 @@ -4,25 +4,36 @@ from functools import partial import os.path -from scripts.common import detect_reads +from scripts.common import detect_reads, fill_default_values #Config parameters -IN = config["IN"] -LOCAL_DIR = config["LOCAL_DIR"] -SPADES = config.get("SPADES", LOCAL_DIR) -SPADES_REASSEMBLY = config.get("SPADES_REASSEMBLY", LOCAL_DIR) -BIN = config.get("BIN", os.path.join(LOCAL_DIR, "build/release/bin")) -SCRIPTS = config.get("SCRIPTS", os.path.join(LOCAL_DIR, "src/projects/mts/scripts")) -SOFT = config["SOFT"] -K = int(config.get("K", 55)) -SMALL_K = int(config.get("SMALL_K", 21)) -MIN_CONTIG_LENGTH = int(config.get("MIN_CONTIG_LENGTH", 2000)) -THREADS = config.get("THREADS", 16) -BINNER = config.get("BINNER", "canopy") +fill_default_values(config) + +#TODO: check if modern Snakemake allows dictionaries in string splicing +IN = config["data"] +ASSEMBLER = config["assembly"]["assembler"] +ASSEMBLER_DIR = config["assembly"]["dir"] +REASSEMBLER_DIR = config["reassembly"]["dir"] +BIN = config["bin"] +SCRIPTS = config["scripts"] +SOFT = config["soft"] +ASSEMBLY_K = config["assembly"]["k"] +PROFILE_K = config["profile"]["k"] +PROFILER = config["profile"]["profiler"] +SPLIT_LENGTH = config["profile"]["split"] +MIN_CONTIG_LENGTH = config["binning"]["min_length"] +MIN_NONZEROES = config["binning"]["min_nonzeroes"] +THREADS = config["threads"] +BINNER = config["binning"]["binner"] + +IS_COASSEMBLY = config["assembly"].get("groups") == ["*"] +if not IS_COASSEMBLY and BINNER == "metabat": + raise WorkflowError("MetaBAT is supported only in the full coassembly mode") #Autodetect samples and their reads +#Check that sample names are consecutive and all are presented SAMPLE_DIRS = set(glob_wildcards(os.path.join(IN, "{sample,sample\d+}"))[0]) -SAMPLE_COUNT = len(SAMPLE_DIRS) +SAMPLE_COUNT = config.get("count", len(SAMPLE_DIRS)) SAMPLES = list() for i in range(1, SAMPLE_COUNT + 1): sample_name = "sample" + str(i) @@ -32,10 +43,16 @@ SAMPLE_READS = dict(map(lambda sample: (sample, detect_reads(os.path.join(IN, sample))), SAMPLES)) +NAME_TEMPLATE = "(\w+\.?)?\d+" + #Group samples -GROUP_SAMPLES = config.get("GROUPS", []) +GROUP_SAMPLES = config["assembly"]["groups"] 
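#(Illustrative aside on the "/N" shorthand expanded just below: with 10
# samples and groups: "/3", group_size = 10 // 3 = 3, and
# range(1, SAMPLE_COUNT, group_size) starts groups at samples 1, 4 and 7,
# yielding [sample1..sample3], [sample4..sample6] and [sample7..sample9];
# the leftover sample10 is later appended as a single-sample group by the
# ungrouped-samples loop further down.)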
+#Form /N groups +if type(GROUP_SAMPLES) == str and GROUP_SAMPLES[0] == "/": + group_size = SAMPLE_COUNT // int(GROUP_SAMPLES[1:]) + GROUP_SAMPLES = [["sample"+str(j) for j in range(i, min(i + group_size, SAMPLE_COUNT + 1))] + for i in range(1, SAMPLE_COUNT, group_size)] USED_SAMPLES = set(chain(*GROUP_SAMPLES)) -#TODO: double-check #Replace the wildcard group with unused samples if GROUP_SAMPLES and GROUP_SAMPLES[-1] == "*": GROUP_SAMPLES[-1] = [sample for sample in SAMPLES if sample not in USED_SAMPLES] @@ -45,25 +62,56 @@ if sample not in USED_SAMPLES: GROUP_SAMPLES.append([sample]) +#Dictionary: {group name: [samples of the group]} +#Can be iterated to retrieve all group names GROUPS = dict() group_id = 1 for group in GROUP_SAMPLES: if len(group) == 1: - key = group[0] + key = group[0] #Groups of a single sample are simply called sampleXX else: key = "group" + str(group_id) - #SAMPLE_READS[key] = ["reads/{}/{}.fastq".format(key, dir) for dir in ["left", "right"]] SAMPLE_READS[key] = ([SAMPLE_READS[s][0] for s in group], [SAMPLE_READS[s][1] for s in group]) group_id += 1 GROUPS[key] = group #Helpers for locating input files + +#Returns all filepaths with left/right reads for a sample/group/bin/etc, used as Snakemake input +def reads_input(dict): + return (partial(dict, 0), partial(dict, 1)) + def sample_reads(dir, wildcards): - res = SAMPLE_READS[wildcards.sample][dir] - if res is str: - return [res] - else: - return res + return SAMPLE_READS[wildcards["sample"]][dir] + +left_sample_reads, right_sample_reads = reads_input(sample_reads) + +def group_reads(dir, wildcards): + return SAMPLE_READS[wildcards["group"]][dir] + +left_reads, right_reads = reads_input(group_reads) -left_reads = partial(sample_reads, 0) -right_reads = partial(sample_reads, 1) +def is_fastq(wildcards): + name = getattr(wildcards, "sample", None) + if not name: + name = GROUPS[wildcards.group][0] + for ext in {".fastq", ".fq", ".fastq.gz", "fq.gz"}: + if SAMPLE_READS[name][0].endswith(ext): + return True + return False + +rule combine_contigs: + input: expand("assembly/{{frags}}/{group}.fasta", assembler=ASSEMBLER, group=GROUPS) + output: "assembly/{frags}/all.fasta" + message: "Combine all {wildcards.frags} contigs" + shell: "{SCRIPTS}/combine_contigs.py {input} > {output}" + +rule combine_annotation: + input: expand("{{stage}}/annotation/{group}.ann", group=GROUPS) + output: "{stage}/annotation/all.ann" + message: "Combine all annotation on {wildcards.stage}" + run: + shell("rm -f {output}") + for sample_ann in input: + sample, _ = os.path.splitext(os.path.basename(sample_ann)) + shell("sed -e 's/^/{sample}-/' {sample_ann} >> {output}") diff -Nru spades-3.10.1+dfsg/src/projects/mts/config.yaml spades-3.11.1+dfsg/src/projects/mts/config.yaml --- spades-3.10.1+dfsg/src/projects/mts/config.yaml 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/config.yaml 2017-09-28 09:05:13.000000000 +0000 @@ -1,10 +1,24 @@ -IN: "/Sid/snurk/mts/sim/data" -SPADES: "~/Projects/mts/assembler/" -QUAST: "python2 ~/opt/quast-3.2/metaquast.py" -BIN: "~/Projects/mts/assembler/build/release/bin" -SCRIPTS: "~/Projects/mts/assembler/src/projects/mts/scripts" -SOFT: "/home/snurk/soft/" -REF: "/Sid/snurk/mts/nielsen/ref.fasta" -K: 55 -small_k: 21 -MIN_CONTIG_LENGTH: 2000 +data: "path/to/samples/directory" +soft: "path/to/canopy/etc" +#bin: "/path/to/mts/binaries" #Derived from the current dir by default +#scripts: "/path/to/scripts/directory" #Derived from the current dir by default +#assembly: + #assembler: spades + 
#dir: "/path/to/assembler/directory" #Derived from the current dir by default + #k: 55 + #groups: [] +#profile: + #k: 21 + #split: 10000 +#binning: + #binner: canopy + #min_nonzeroes: 3 + #min_length: 2000 +#propagation: + #enabled: true +#reassembly: + #enabled: true +#stats: + #stage: ["gf", "summary", "pca", "tsne", "checkm"] #stage can be binning, propagation, reassembly, or all + #quast: "python2 /path/to/QUAST" #Default is global + #refs: "path/to/refs/directory/or/list/of/them" diff -Nru spades-3.10.1+dfsg/src/projects/mts/contig_abundance_counter.cpp spades-3.11.1+dfsg/src/projects/mts/contig_abundance_counter.cpp --- spades-3.10.1+dfsg/src/projects/mts/contig_abundance_counter.cpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/contig_abundance_counter.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -15,39 +15,28 @@ class Runner { public: static void Run(ContigAbundanceCounter& abundance_counter, size_t min_length_bound, - io::FileReadStream& contigs_stream, io::osequencestream& splits_os, - std::ofstream& id_out, std::ofstream& mpl_out) { - static const size_t split_length = 10000; - io::SingleRead full_contig; + io::FileReadStream& contigs_stream, std::ofstream& out) { + io::SingleRead contig; while (!contigs_stream.eof()) { - contigs_stream >> full_contig; - DEBUG("Analyzing contig " << GetId(full_contig)); + contigs_stream >> contig; + contig_id id = GetId(contig); + DEBUG("Analyzing contig " << id); + + if (contig.size() < min_length_bound) { + DEBUG("Fragment is shorter than min_length_bound " << min_length_bound); + break; + } + + auto abundance_vec = abundance_counter(contig.GetSequenceString(), contig.name()); - for (size_t i = 0; i < full_contig.size(); i += split_length) { - if (full_contig.size() - i < min_length_bound) { - DEBUG("Fragment shorter than min_length_bound " << min_length_bound); - break; - } - - io::SingleRead contig = full_contig.Substr(i, std::min(i + split_length, full_contig.size())); - splits_os << contig; - - contig_id id = GetId(contig); - DEBUG("Processing fragment # " << (i / split_length) << " with id " << id); - - auto abundance_vec = abundance_counter(contig.GetSequenceString(), contig.name()); - - if (abundance_vec) { - stringstream ss; - copy(abundance_vec->begin(), abundance_vec->end(), - ostream_iterator(ss, " ")); - DEBUG("Successfully estimated abundance of " << id << " : " << ss.str()); - - id_out << id << std::endl; - mpl_out << ss.str() << std::endl; - } else { - DEBUG("Failed to estimate abundance of " << id); - } + if (abundance_vec) { + DEBUG("Successfully estimated abundance of " << id); + out << id; + for (auto mpl : *abundance_vec) + out << "\t" << mpl; + out << std::endl; + } else { + DEBUG("Failed to estimate abundance of " << id); } } } @@ -60,7 +49,7 @@ unsigned k; size_t sample_cnt, min_length_bound; - std::string work_dir, contigs_path, splits_path; + std::string work_dir, contigs_path; std::string kmer_mult_fn, contigs_abundance_fn; try { @@ -69,14 +58,13 @@ ops >> Option('k', k) >> Option('w', work_dir) >> Option('c', contigs_path) - >> Option('f', splits_path) >> Option('n', sample_cnt) >> Option('m', kmer_mult_fn) >> Option('o', contigs_abundance_fn) >> Option('l', min_length_bound, size_t(0)); } catch(GetOptEx &ex) { std::cout << "Usage: contig_abundance_counter -k -w -c " - "-n -m -f " + "-n -m " "-o [-l (default: 0)]" << std::endl; exit(1); } @@ -85,17 +73,14 @@ create_console_logger(); SetSampleCount(sample_cnt); - ContigAbundanceCounter abundance_counter(k, SingleClusterAnalyzer(), 
work_dir); + ContigAbundanceCounter abundance_counter(k, make_shared(), work_dir); abundance_counter.Init(kmer_mult_fn); io::FileReadStream contigs_stream(contigs_path); - io::osequencestream splits_os(splits_path); - std::ofstream id_out(contigs_abundance_fn + ".id"); - std::ofstream mpl_out(contigs_abundance_fn + ".mpl"); + std::ofstream out(contigs_abundance_fn); Runner::Run(abundance_counter, min_length_bound, - contigs_stream, splits_os, - id_out, mpl_out); + contigs_stream, out); return 0; } diff -Nru spades-3.10.1+dfsg/src/projects/mts/contig_abundance.cpp spades-3.11.1+dfsg/src/projects/mts/contig_abundance.cpp --- spades-3.10.1+dfsg/src/projects/mts/contig_abundance.cpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/contig_abundance.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -1,5 +1,5 @@ #include "contig_abundance.hpp" -#include "utils/indices/kmer_splitters.hpp" +#include "utils/kmer_mph/kmer_splitters.hpp" namespace debruijn_graph { @@ -13,7 +13,7 @@ return sample_cnt_; } -MplVector SingleClusterAnalyzer::SampleMpls(const KmerProfiles& kmer_mpls, size_t sample) const { +MplVector SampleMpls(const KmerProfiles& kmer_mpls, size_t sample) { MplVector answer; answer.reserve(kmer_mpls.size()); for (const auto& kmer_mpl : kmer_mpls) { @@ -22,14 +22,14 @@ return answer; } -Mpl SingleClusterAnalyzer::SampleMedian(const KmerProfiles& kmer_mpls, size_t sample) const { +Mpl SampleMedian(const KmerProfiles& kmer_mpls, size_t sample) { std::vector sample_mpls = SampleMpls(kmer_mpls, sample); std::nth_element(sample_mpls.begin(), sample_mpls.begin() + sample_mpls.size()/2, sample_mpls.end()); return sample_mpls[sample_mpls.size()/2]; } -MplVector SingleClusterAnalyzer::MedianVector(const KmerProfiles& kmer_mpls) const { +MplVector MedianVector(const KmerProfiles& kmer_mpls) { VERIFY(kmer_mpls.size() != 0); MplVector answer(SampleCount(), 0); for (size_t i = 0; i < SampleCount(); ++i) { @@ -66,42 +66,44 @@ return answer; } -boost::optional SingleClusterAnalyzer::operator()(const KmerProfiles& kmer_mpls) const { +boost::optional TrivialClusterAnalyzer::operator()(const KmerProfiles& kmer_mpls) const { auto med = MedianVector(kmer_mpls); return AbundanceVector(med.begin(), med.end()); - //return boost::optional(answer); - //MplVector center = MedianVector(kmer_mpls); - //auto locality = CloseKmerMpls(kmer_mpls, KmerProfile(center)); - - //for (size_t it_cnt = 0; it_cnt < MAX_IT; ++it_cnt) { - // DEBUG("Iteration " << it_cnt); - // DEBUG("Center is " << PrintVector(center)); - - // DEBUG("Locality size is " << locality.size() - // << " making " << (double(locality.size()) / double(kmer_mpls.size())) - // << " of total # points"); - - // double center_share = double(locality.size()) / double(kmer_mpls.size()); - // if (math::ls(center_share, central_clust_share_)) { - // DEBUG("Detected central area contains too few k-mers: share " << center_share - // << " ; center size " << locality.size() - // << " ; total size " << kmer_mpls.size()); - // return boost::none; - // } - - // MplVector update = MedianVector(locality); - // DEBUG("Center update is " << PrintVector(update)); - - // if (center == update) { - // DEBUG("Old and new centers matched on iteration " << it_cnt); - // break; - // } - - // center = update; - // locality = CloseKmerMpls(kmer_mpls, center); - //} +} + +boost::optional SingleClusterAnalyzer::operator()(const KmerProfiles& kmer_mpls) const { + MplVector center = MedianVector(kmer_mpls); + auto locality = CloseKmerMpls(kmer_mpls, KmerProfile(center)); 
+ + for (size_t it_cnt = 0; it_cnt < MAX_IT; ++it_cnt) { + DEBUG("Iteration " << it_cnt); + DEBUG("Center is " << PrintVector(center)); + + DEBUG("Locality size is " << locality.size() + << " making " << (double(locality.size()) / double(kmer_mpls.size())) + << " of total # points"); + + double center_share = double(locality.size()) / double(kmer_mpls.size()); + if (math::ls(center_share, central_clust_share_)) { + DEBUG("Detected central area contains too few k-mers: share " << center_share + << " ; center size " << locality.size() + << " ; total size " << kmer_mpls.size()); + return boost::none; + } + + MplVector update = MedianVector(locality); + DEBUG("Center update is " << PrintVector(update)); + + if (center == update) { + DEBUG("Old and new centers matched on iteration " << it_cnt); + break; + } + + center = update; + locality = CloseKmerMpls(kmer_mpls, center); + } - //return boost::optional(MeanVector(locality, sample_cnt_)); + return boost::optional(MeanVector(locality)); } vector ContigAbundanceCounter::SplitOnNs(const std::string& seq) const { @@ -170,7 +172,7 @@ return boost::none; } - return cluster_analyzer_(kmer_mpls); + return (*cluster_analyzer_)(kmer_mpls); } } diff -Nru spades-3.10.1+dfsg/src/projects/mts/contig_abundance.hpp spades-3.11.1+dfsg/src/projects/mts/contig_abundance.hpp --- spades-3.10.1+dfsg/src/projects/mts/contig_abundance.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/contig_abundance.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -1,7 +1,7 @@ #pragma once #include "pipeline/graph_pack.hpp" -#include "utils/indices/perfect_hash_map_builder.hpp" +#include "utils/ph_map/perfect_hash_map_builder.hpp" namespace debruijn_graph { @@ -77,15 +77,32 @@ return ss.str(); } -class SingleClusterAnalyzer { +MplVector SampleMpls(const KmerProfiles& kmer_mpls, size_t sample); +Mpl SampleMedian(const KmerProfiles& kmer_mpls, size_t sample); +MplVector MedianVector(const KmerProfiles& kmer_mpls); + +class ClusterAnalyzer { +public: + virtual boost::optional operator()(const KmerProfiles& kmer_mpls) const = 0; + virtual ~ClusterAnalyzer() {}; +}; + +class TrivialClusterAnalyzer : public ClusterAnalyzer { +public: + TrivialClusterAnalyzer() {} + + boost::optional operator()(const KmerProfiles& kmer_mpls) const override; + +private: + DECL_LOGGER("TrivialClusterAnalyzer"); +}; + +class SingleClusterAnalyzer : public ClusterAnalyzer { static const uint MAX_IT = 10; double coord_vise_proximity_; double central_clust_share_; - MplVector SampleMpls(const KmerProfiles& kmer_mpls, size_t sample) const; - Mpl SampleMedian(const KmerProfiles& kmer_mpls, size_t sample) const; - MplVector MedianVector(const KmerProfiles& kmer_mpls) const; bool AreClose(const KmerProfile& c, const KmerProfile& v) const; KmerProfiles CloseKmerMpls(const KmerProfiles& kmer_mpls, const KmerProfile& center) const; @@ -96,34 +113,32 @@ central_clust_share_(central_clust_share) { } - boost::optional operator()(const KmerProfiles& kmer_mpls) const; + boost::optional operator()(const KmerProfiles& kmer_mpls) const override; private: DECL_LOGGER("SingleClusterAnalyzer"); }; class ContigAbundanceCounter { - typedef typename InvertableStoring::trivial_inverter InverterT; + typedef typename utils::InvertableStoring::trivial_inverter InverterT; - typedef KeyStoringMap, - InvertableStoring> IndexT; + utils::kmer_index_traits, + utils::InvertableStoring> IndexT; unsigned k_; - SingleClusterAnalyzer cluster_analyzer_; + shared_ptr cluster_analyzer_; double min_earmark_share_; IndexT 
kmer_mpl_; InverterT inverter_; std::vector mpl_data_; - void FillMplMap(const std::string& kmers_mpl_file); - vector SplitOnNs(const std::string& seq) const; public: ContigAbundanceCounter(unsigned k, - const SingleClusterAnalyzer& cluster_analyzer, + shared_ptr cluster_analyzer, const std::string& work_dir, double min_earmark_share = 0.7) : k_(k), diff -Nru spades-3.10.1+dfsg/src/projects/mts/gzstream/gzstream.C spades-3.11.1+dfsg/src/projects/mts/gzstream/gzstream.C --- spades-3.10.1+dfsg/src/projects/mts/gzstream/gzstream.C 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/gzstream/gzstream.C 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,165 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.C +// Revision : $Revision: 1.7 $ +// Revision_date : $Date: 2003/01/08 14:41:27 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#include "gzstream.h" +#include +#include // for memcpy + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See header file for user classes. +// ---------------------------------------------------------------------------- + +// -------------------------------------- +// class gzstreambuf: +// -------------------------------------- + +gzstreambuf* gzstreambuf::open( const char* name, int open_mode) { + if ( is_open()) + return (gzstreambuf*)0; + mode = open_mode; + // no append nor read/write mode + if ((mode & std::ios::ate) || (mode & std::ios::app) + || ((mode & std::ios::in) && (mode & std::ios::out))) + return (gzstreambuf*)0; + char fmode[10]; + char* fmodeptr = fmode; + if ( mode & std::ios::in) + *fmodeptr++ = 'r'; + else if ( mode & std::ios::out) + *fmodeptr++ = 'w'; + *fmodeptr++ = 'b'; + *fmodeptr = '\0'; + file = gzopen( name, fmode); + if (file == 0) + return (gzstreambuf*)0; + opened = 1; + return this; +} + +gzstreambuf * gzstreambuf::close() { + if ( is_open()) { + sync(); + opened = 0; + if ( gzclose( file) == Z_OK) + return this; + } + return (gzstreambuf*)0; +} + +int gzstreambuf::underflow() { // used for input buffer only + if ( gptr() && ( gptr() < egptr())) + return * reinterpret_cast( gptr()); + + if ( ! (mode & std::ios::in) || ! 
opened) + return EOF; + // Josuttis' implementation of inbuf + int n_putback = gptr() - eback(); + if ( n_putback > 4) + n_putback = 4; + memcpy( buffer + (4 - n_putback), gptr() - n_putback, n_putback); + + int num = gzread( file, buffer+4, bufferSize-4); + if (num <= 0) // ERROR or EOF + return EOF; + + // reset buffer pointers + setg( buffer + (4 - n_putback), // beginning of putback area + buffer + 4, // read position + buffer + 4 + num); // end of buffer + + // return next character + return * reinterpret_cast( gptr()); +} + +int gzstreambuf::flush_buffer() { + // Separate the writing of the buffer from overflow() and + // sync() operation. + int w = pptr() - pbase(); + if ( gzwrite( file, pbase(), w) != w) + return EOF; + pbump( -w); + return w; +} + +int gzstreambuf::overflow( int c) { // used for output buffer only + if ( ! ( mode & std::ios::out) || ! opened) + return EOF; + if (c != EOF) { + *pptr() = c; + pbump(1); + } + if ( flush_buffer() == EOF) + return EOF; + return c; +} + +int gzstreambuf::sync() { + // Changed to use flush_buffer() instead of overflow( EOF) + // which caused improper behavior with std::endl and flush(), + // bug reported by Vincent Ricard. + if ( pptr() && pptr() > pbase()) { + if ( flush_buffer() == EOF) + return -1; + } + return 0; +} + +// -------------------------------------- +// class gzstreambase: +// -------------------------------------- + +gzstreambase::gzstreambase( const char* name, int mode) { + init( &buf); + open( name, mode); +} + +gzstreambase::~gzstreambase() { + buf.close(); +} + +void gzstreambase::open( const char* name, int open_mode) { + if ( ! buf.open( name, open_mode)) + clear( rdstate() | std::ios::badbit); +} + +void gzstreambase::close() { + if ( buf.is_open()) + if ( ! buf.close()) + clear( rdstate() | std::ios::badbit); +} + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +// ============================================================================ +// EOF // diff -Nru spades-3.10.1+dfsg/src/projects/mts/gzstream/gzstream.h spades-3.11.1+dfsg/src/projects/mts/gzstream/gzstream.h --- spades-3.10.1+dfsg/src/projects/mts/gzstream/gzstream.h 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/gzstream/gzstream.h 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,121 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.h +// Revision : $Revision: 1.5 $ +// Revision_date : $Date: 2002/04/26 23:30:15 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#ifndef GZSTREAM_H +#define GZSTREAM_H 1 + +// standard C++ with new header file names and std:: namespace +#include +#include +#include + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See below for user classes. +// ---------------------------------------------------------------------------- + +class gzstreambuf : public std::streambuf { +private: + static const int bufferSize = 47+256; // size of data buff + // totals 512 bytes under g++ for igzstream at the end. + + gzFile file; // file handle for compressed file + char buffer[bufferSize]; // data buffer + char opened; // open/close state of stream + int mode; // I/O mode + + int flush_buffer(); +public: + gzstreambuf() : opened(0) { + setp( buffer, buffer + (bufferSize-1)); + setg( buffer + 4, // beginning of putback area + buffer + 4, // read position + buffer + 4); // end position + // ASSERT: both input & output capabilities will not be used together + } + int is_open() { return opened; } + gzstreambuf* open( const char* name, int open_mode); + gzstreambuf* close(); + ~gzstreambuf() { close(); } + + virtual int overflow( int c = EOF); + virtual int underflow(); + virtual int sync(); +}; + +class gzstreambase : virtual public std::ios { +protected: + gzstreambuf buf; +public: + gzstreambase() { init(&buf); } + gzstreambase( const char* name, int open_mode); + ~gzstreambase(); + void open( const char* name, int open_mode); + void close(); + gzstreambuf* rdbuf() { return &buf; } +}; + +// ---------------------------------------------------------------------------- +// User classes. Use igzstream and ogzstream analogously to ifstream and +// ofstream respectively. They read and write files based on the gz* +// function interface of the zlib. Files are compatible with gzip compression. 
+// ---------------------------------------------------------------------------- + +class igzstream : public gzstreambase, public std::istream { +public: + igzstream() : std::istream( &buf) {} + igzstream( const char* name, int open_mode = std::ios::in) + : gzstreambase( name, open_mode), std::istream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::in) { + gzstreambase::open( name, open_mode); + } +}; + +class ogzstream : public gzstreambase, public std::ostream { +public: + ogzstream() : std::ostream( &buf) {} + ogzstream( const char* name, int mode = std::ios::out) + : gzstreambase( name, mode), std::ostream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::out) { + gzstreambase::open( name, open_mode); + } +}; + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +#endif // GZSTREAM_H +// ============================================================================ +// EOF // + diff -Nru spades-3.10.1+dfsg/src/projects/mts/kmer_multiplicity_counter.cpp spades-3.11.1+dfsg/src/projects/mts/kmer_multiplicity_counter.cpp --- spades-3.10.1+dfsg/src/projects/mts/kmer_multiplicity_counter.cpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/kmer_multiplicity_counter.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -12,10 +12,10 @@ #include "kmc_api/kmc_file.h" //#include "omp.h" #include "io/kmers/mmapped_reader.hpp" -#include "utils/path_helper.hpp" -#include "utils/simple_tools.hpp" -#include "utils/indices/perfect_hash_map_builder.hpp" -#include "utils/indices/kmer_splitters.hpp" +#include "utils/filesystem/path_helper.hpp" +#include "utils/stl_utils.hpp" +#include "utils/ph_map/perfect_hash_map_builder.hpp" +#include "utils/kmer_mph/kmer_splitters.hpp" #include "logger.hpp" using std::string; @@ -49,7 +49,7 @@ string SortKmersCountFile(const string& filename) { MMappedRecordArrayReader ins(filename, RtSeq::GetDataSize(k_) + 1, false); - libcxx::sort(ins.begin(), ins.end(), array_less()); + libcxx::sort(ins.begin(), ins.end(), adt::array_less()); std::string sorted_filename = filename + KMER_SORTED_EXTENSION; std::ofstream out(sorted_filename); out.write((char*) ins.data(), ins.data_size()); @@ -137,7 +137,7 @@ //TODO: extract into a common header typedef size_t Offset; typedef uint16_t Mpl; - using namespace debruijn_graph; + using namespace utils; KeyStoringMap, InvertableStoring> kmer_mpl(k_, workdir); @@ -247,7 +247,7 @@ std::vector input_files; for (size_t i = 1; i <= sample_cnt; ++i) { - input_files.push_back(work_dir + "/sample" + ToString(i)); + input_files.push_back(work_dir + "/sample" + std::to_string(i)); } KmerMultiplicityCounter kmcounter(k, output); diff -Nru spades-3.10.1+dfsg/src/projects/mts/logger.hpp spades-3.11.1+dfsg/src/projects/mts/logger.hpp --- spades-3.10.1+dfsg/src/projects/mts/logger.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/logger.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -5,7 +5,7 @@ string log_props_file = "log.properties"; - logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : ""); + logger *lg = create_logger(fs::FileExists(log_props_file) ? 
log_props_file : ""); lg->add_writer(std::make_shared()); attach_logger(lg); } diff -Nru spades-3.10.1+dfsg/src/projects/mts/mts.py spades-3.11.1+dfsg/src/projects/mts/mts.py --- spades-3.10.1+dfsg/src/projects/mts/mts.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/mts.py 2017-09-28 09:05:13.000000000 +0000 @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python from __future__ import (print_function) import argparse @@ -7,6 +7,9 @@ import os import os.path import shutil +import yaml + +from scripts.common import fill_default_values #copied from http://stackoverflow.com/questions/431684/how-do-i-cd-in-python/13197763#13197763 class cd: @@ -26,9 +29,12 @@ parser.add_argument("--threads", "-t", type=int, default=8, help="Number of threads") parser.add_argument("dir", type=str, help="Output directory") parser.add_argument("--config", "-c", type=str, default="", help="config.yaml to be copied to the directory (unnecessary if config.yaml is already there)") -parser.add_argument("--stats", "-s", action="store_true", help="Calculate stats (when the REFS parameter in config.yaml is provided)") -parser.add_argument("--reuse-assemblies", action="store_true", help="Use existing assemblies (put them in the corresponding folders)") +parser.add_argument("--reuse-assemblies", type=str, help="Directory with existing assemblies to reuse") +parser.add_argument("--reuse-profiles", type=str, help="Directory with existing profiles to reuse") +parser.add_argument("--reuse-from", type=str, help="Directory with another assembly to reuse everything that is possible (overrides other --reuses)") +parser.add_argument("--no-stats", "-S", action="store_true", help="Skip the stats section (overrides the config value)") parser.add_argument("--verbose", "-v", action="store_true", help="Increase verbosity level") +parser.add_argument("--alt", action="store_true", help=argparse.SUPPRESS) args = parser.parse_args() @@ -40,34 +46,55 @@ if args.verbose: base_params.extend(["-p", "--verbose"]) -if args.config: - if os.path.exists(os.path.join(args.dir, "config.yaml")): - print("Config path specified, but config.yaml already exists in output folder " + args.dir) - sys.exit(239) - if not os.path.exists(args.dir): os.makedirs(args.dir) -print("Output folder set to " + args.dir) +print("Output folder set to", args.dir) +config_path = os.path.join(args.dir, "config.yaml") if args.config: - print("Copying config from " + args.config) - shutil.copy(args.config, args.dir) + if os.path.exists(config_path): + if subprocess.call(["diff", config_path, args.config]): + print("\033[31mConfig path specified, but different config.yaml already exists in output folder", args.dir, "\033[0m") + sys.exit(239) + else: + print("Copying config from", args.config) + shutil.copy(args.config, config_path) with cd(exec_dir): def call_snake(extra_params=[]): subprocess.check_call(base_params + extra_params, stdout=sys.stdout, stderr=sys.stderr) - - print("Step #1 - Assembly") - if args.reuse_assemblies: - call_snake(["assemble_all", "--touch"]) + def reuse_dir(dir_from, dir_name): + if not dir_from: + return + local_dir = os.path.join(args.dir, dir_name) + if not os.path.isdir(dir_from): + print("\033[33mWarning: {} source directory doesn't exist\033[0m".format(dir_from)) + return + if os.path.exists(local_dir): + print("\033[33mWarning: {} destination directory already exists\033[0m".format(dir_name)) + return + os.symlink(dir_from, local_dir) + + with open(config_path) as config_in: + config = yaml.load(config_in) + 
fill_default_values(config) + + if args.reuse_from: + args.reuse_assemblies = os.path.join(args.reuse_from, "assembly") + args.reuse_profiles = os.path.join(args.reuse_from, "profile") + + reuse_dir(args.reuse_assemblies, "assembly") + reuse_dir(args.reuse_profiles, "profile") + + print("Step #1 - Assembly") call_snake() - - if args.stats: - print("Step #2a - Assembly statistics") - call_snake(["--snakefile", "Stats.snake", "stats_all"]) - - print("Step #2b - Reassembly statistics") - call_snake(["--snakefile", "Stats.snake", "stats_reassembly"]) + if config.get("reassembly", dict()).get("enabled", True): + print("Step #1b - Reassembly") + call_snake(["--snakefile", "Reassembly.snake"]) + + if not args.no_stats and len(config.get("stats", dict())) > 0: + print("Step #2 - Stats") + call_snake(["--snakefile", "Stats.snake"]) diff -Nru spades-3.10.1+dfsg/src/projects/mts/multirun.py spades-3.11.1+dfsg/src/projects/mts/multirun.py --- spades-3.10.1+dfsg/src/projects/mts/multirun.py 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/multirun.py 2017-09-28 09:05:13.000000000 +0000 @@ -0,0 +1,84 @@ +#!/usr/bin/env python +from __future__ import (print_function) + +import argparse +import os +import os.path +import subprocess +import sys +import yaml + +parser = argparse.ArgumentParser(description="MTS Multi Runner") + +all_assemblers = ["main", "spades", "megahit"] +all_binners = ["canopy", "concoct", "metabat"] +unsupported = set(["main_metabat", "spades_canopy", "megahit_canopy"]) + +parser.add_argument("--threads", "-t", type=int, default=8, help="Number of threads for each run") +parser.add_argument("dir", type=str, help="Output directory") +parser.add_argument("--config", "-c", type=str, help="Base config") +parser.add_argument("--pipelines", "-p", type=str, nargs="+", default=[], help="Pipeline configurations to run") +parser.add_argument("--assemblers", "-a", type=str, nargs="+", default=all_assemblers, help="Assemblers to use") +parser.add_argument("--binners", "-b", type=str, nargs="+", default=all_binners, help="Binners to use") +parser.add_argument("--exclude", "-e", type=str, nargs="+", default=[], help="Excluded (skipped) configurations") +parser.add_argument("--no-stats", "-S", action="store_true", help="Skip the stats section (overrides the config value)") +parser.add_argument("--verbose", "-v", action="store_true", help="Increase verbosity level") +parser.add_argument("--ignore-errors", action="store_true") + +args = parser.parse_args() + +with open(args.config) as config_in: + config_template = yaml.load(config_in) + +def pipelines(): + for assembler in args.assemblers: + for binner in args.binners: + yield assembler + "_" + binner + for pipeline in args.pipelines: + yield pipeline + +prev_runs = dict() + +excluded = unsupported.union(args.exclude) +for pipeline in pipelines(): + if pipeline in excluded: + if pipeline in unsupported: + print("\033[33mWarning:", pipeline, "is not currently supported; skipping\033[0m\n") + continue + print("Running", pipeline) + cur_dir = os.path.join(args.dir, pipeline) + if not os.path.exists(cur_dir): + os.makedirs(cur_dir) + call_params = ["./mts.py", "-t", str(args.threads), cur_dir] + if args.no_stats: + call_params.extend(["--no-stats"]) + config = config_template.copy() + params = pipeline.split("_") + assembly_name = params[0] + if assembly_name == "main": + config["profile"] = {"profiler": "mts"} + else: + config["assembly"] = {"assembler": params[0], "groups": ["*"]} + config["profile"] = {"profiler": "jgi"} + 
config["propagation"] = {"enabled": False} + config["reassembly"] = {"enabled": False} + + config["binning"] = {"binner": params[1]} + with open(os.path.join(cur_dir, "config.yaml"), "w") as config_out: + yaml.dump(config, config_out) + # Try to reuse assemblies from previous runs with the same assembler + prev_run = prev_runs.get(assembly_name) + if prev_run: + print("Reusing same data from", prev_run) + call_params.extend(["--reuse-from", prev_run]) + #TODO: rewrite using Snakemake API + errcode = subprocess.call(call_params) + if errcode: + print(" ".join(call_params), "returned with error:", errcode) + if not args.ignore_errors: + sys.exit(errcode) + elif not prev_run: #Reuse only successful run + prev_runs[assembly_name] = cur_dir + print() + +#TODO: compare stats diff -Nru spades-3.10.1+dfsg/src/projects/mts/propagate.cpp spades-3.11.1+dfsg/src/projects/mts/propagate.cpp --- spades-3.10.1+dfsg/src/projects/mts/propagate.cpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/propagate.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -5,7 +5,7 @@ //* See file LICENSE for details. //*************************************************************************** -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" //#include "pipeline/graphio.hpp" #include "pipeline/graph_pack.hpp" @@ -15,15 +15,20 @@ #include "visualization.hpp" namespace debruijn_graph { -static const size_t EDGE_LENGTH_THRESHOLD = 2000; +static const size_t EDGE_LENGTH_THRESHOLD = 2000, + EDGE_UPPER_THRESHOLD = 3000; //FIXME 2kb edge length threshold might affect tip propagator in undesired way class EdgeAnnotationPropagator { const conj_graph_pack& gp_; const string name_; - size_t edge_length_threshold_; + size_t edge_length_threshold_, edge_upper_threshold_; protected: + size_t edge_length_threshold() const { + return edge_length_threshold_; + } + const conj_graph_pack& gp() const { return gp_; } @@ -37,10 +42,12 @@ public: EdgeAnnotationPropagator(const conj_graph_pack& gp, const string& name, - size_t edge_length_threshold = EDGE_LENGTH_THRESHOLD) : + size_t edge_length_threshold = EDGE_LENGTH_THRESHOLD, + size_t edge_upper_threshold = EDGE_UPPER_THRESHOLD) : gp_(gp), name_(name), - edge_length_threshold_(edge_length_threshold) {} + edge_length_threshold_(edge_length_threshold), + edge_upper_threshold_(edge_upper_threshold) {} const std::string& name() const { return name_; @@ -51,9 +58,20 @@ DEBUG("Propagating with propagator: " << name_); for (bin_id bin : edge_annotation.interesting_bins()) { DEBUG("Processing bin " << bin << " with propagator: " << name_); - auto init_edges = edge_annotation.EdgesOfBin(bin, edge_length_threshold_); - DEBUG("Initial edge cnt " << init_edges.size() << " (edge length threshold " << edge_length_threshold_ << ")"); + auto init_edges = edge_annotation.EdgesOfBin(bin, edge_length_threshold()); + DEBUG("Initial edge cnt " << init_edges.size() << " (edge length threshold " << edge_length_threshold() << ")"); auto raw_propagated = PropagateEdges(init_edges); + auto old_size = raw_propagated.size(); + //Filter + size_t n = 0; + for (auto i = raw_propagated.begin(); i != raw_propagated.end(); ++n) { + DEBUG("Edge cnt: " << raw_propagated.size() << "; iter " << n); + if (gp_.g.length(*i) > edge_upper_threshold_) + raw_propagated.erase(i++); + else + ++i; + } + DEBUG("Excluded " << (old_size - raw_propagated.size()) << " >" << edge_upper_threshold_ << "bp edges"); set propagated; std::set_difference(raw_propagated.begin(), raw_propagated.end(), 
init_edges.begin(), init_edges.end(), @@ -72,40 +90,6 @@ class ConnectingPathPropagator : public EdgeAnnotationPropagator { size_t path_length_threshold_; size_t path_edge_cnt_; - const EdgeAnnotation& debug_annotation_; - - bin_id DetermineBin(const set& edges) const { - map cnt_map; - for (EdgeId e : edges) { - for (auto b : debug_annotation_.Annotation(e)) { - cnt_map[b]++; - } - } - bin_id candidate = ""; - for (auto cnt_el : cnt_map) { - if (cnt_el.second > edges.size() / 2) { - if (candidate.empty()) - candidate = cnt_el.first; - else - return ""; - } - } - return candidate; - } - - bool BadPath(const vector& path, bin_id base_bin) const { - size_t cnt = 0; - for (EdgeId e : path) { - if (g().length(e) < 2000) - continue; - auto ann = debug_annotation_.Annotation(e); - if (!ann.empty() && - std::find(ann.begin(), ann.end(), base_bin) == ann.end()) { - cnt++; - } - } - return cnt > 0; - } set CollectEdgeStarts(const set& edges) const { set answer; @@ -116,29 +100,16 @@ } set PropagateEdges(const set& edges) const override { - //static size_t pic_cnt = 0; - bin_id bin = DetermineBin(edges); - if (!bin.empty()) { - DEBUG("Bin determined as " << bin); - } else { - DEBUG("Failed to determine bin"); - } + DEBUG(__FUNCTION__); set answer; set starts = CollectEdgeStarts(edges); for (EdgeId e : edges) { PathProcessor path_searcher(g(), g().EdgeEnd(e), path_length_threshold_); for (VertexId v : starts) { auto callback = AdapterCallback([&](const vector& path) { - //if (pic_cnt < 10) { - //if (BadPath(path, bin)) { - // auto to_draw = path; - // to_draw.insert(to_draw.begin(), e); - // PrintAnnotatedAlongPath(gp(), to_draw, debug_annotation_, "/home/snurk/tmp/pics/pic_" + ToString(++pic_cnt) + "_"); - //} - //} - insert_all(answer, path); + utils::insert_all(answer, path); }, true); - TRACE("Launching path search between edge " << g().str(e) << " and vertex " + DEBUG("Launching path search between edge " << g().str(e) << " and vertex " << g().str(v) << " with length bound " << path_length_threshold_); path_searcher.Process(v, 0, path_length_threshold_, callback, path_edge_cnt_); } @@ -148,13 +119,12 @@ public: ConnectingPathPropagator(const conj_graph_pack& gp, - size_t path_length_threshold, - size_t path_edge_cnt, - const EdgeAnnotation& ann) : - EdgeAnnotationPropagator(gp, "ConnectingPath"), + size_t length_threshold, + size_t path_length_threshold, + size_t path_edge_cnt) : + EdgeAnnotationPropagator(gp, "ConnectingPath", length_threshold), path_length_threshold_(path_length_threshold), - path_edge_cnt_(path_edge_cnt), - debug_annotation_(ann) {} + path_edge_cnt_(path_edge_cnt) {} private: DECL_LOGGER("ConnectingPathPropagator"); @@ -173,13 +143,15 @@ if (math::ge(point.weight, weight_threshold_)) { DEBUG("Adding (" << g().str(e1) << "," << g().str(i.first) << "); " << point); answer.insert(i.first); - } + } } return answer; } public: - PairedInfoPropagator(const conj_graph_pack& gp, omnigraph::de::DEWeight threshold): - EdgeAnnotationPropagator(gp, "PairedInfo"), weight_threshold_(threshold) {} + PairedInfoPropagator(const conj_graph_pack& gp, + size_t length_threshold, + omnigraph::de::DEWeight threshold): + EdgeAnnotationPropagator(gp, "PairedInfo", length_threshold), weight_threshold_(threshold) {} private: DECL_LOGGER("PairedInfoPropagator"); }; @@ -202,9 +174,9 @@ auto edges_of_contig = mapper_->MapRead(contig).simple_path(); for (EdgeId e : edges_of_contig) { if (edges.count(e)) { - DEBUG("Edge " << gp().g.str(e) << " belongs to the contig #" << + DEBUG("Edge " << gp().g.str(e) << " 
belongs to the contig #" << contig.name() << " of " << edges_of_contig.size() << " edges"); - insert_all(answer, edges_of_contig); + utils::insert_all(answer, edges_of_contig); break; } } @@ -222,8 +194,8 @@ class TipPropagator : public EdgeAnnotationPropagator { public: - TipPropagator(const conj_graph_pack& gp) : - EdgeAnnotationPropagator(gp, "TipPropagator"), tipper_(gp.g) {} + TipPropagator(const conj_graph_pack& gp, size_t length_threshold) : + EdgeAnnotationPropagator(gp, "TipPropagator", length_threshold), tipper_(gp.g) {} protected: set PropagateEdges(const set& edges) const override { @@ -290,13 +262,13 @@ DECL_LOGGER("AnnotationChecker"); }; -void AnnotationPropagator::Run(io::SingleStream& /*contigs*/, +void AnnotationPropagator::Run(io::SingleStream& /*contigs*/, EdgeAnnotation& edge_annotation /*const string& annotation_out_fn*/) { std::vector> propagator_pipeline { - std::make_shared(gp_, 8000, 10, edge_annotation), - std::make_shared(gp_), - std::make_shared(gp_, 10.)};//, + make_propagator(8000, 10), + make_propagator(), + make_propagator(10.)};//, // std::make_shared(gp_, contigs)};//, // std::make_shared(gp_, 8000, 10, edge_annotation), // std::make_shared(gp_, contigs), diff -Nru spades-3.10.1+dfsg/src/projects/mts/propagate.hpp spades-3.11.1+dfsg/src/projects/mts/propagate.hpp --- spades-3.10.1+dfsg/src/projects/mts/propagate.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/propagate.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -14,10 +14,16 @@ class AnnotationPropagator { const conj_graph_pack& gp_; + size_t length_threshold_; public: - AnnotationPropagator(const conj_graph_pack& gp) : - gp_(gp) { + AnnotationPropagator(const conj_graph_pack& gp, size_t length_threshold) : + gp_(gp), length_threshold_(length_threshold) { + } + + template + std::shared_ptr make_propagator(Args... 
args) { + return std::make_shared(gp_, length_threshold_, args...); } void Run(io::SingleStream& contigs, EdgeAnnotation& edge_annotation); diff -Nru spades-3.10.1+dfsg/src/projects/mts/prop_binning.cpp spades-3.11.1+dfsg/src/projects/mts/prop_binning.cpp --- spades-3.10.1+dfsg/src/projects/mts/prop_binning.cpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/prop_binning.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -17,25 +17,28 @@ using namespace debruijn_graph; std::string add_suffix(const std::string& path, const std::string& suffix) { - auto ext = path::extension(path); + auto ext = fs::extension(path); return path.substr(0, path.length() - ext.length()) + suffix + ext; } -void DumpEdgesAndAnnotation(const Graph& g, - const EdgeAnnotation& edge_annotation, - const string& out_edges, - const string& out_annotation) { - INFO("Dumping edges to " << out_edges << "; their annotation to " << out_annotation); - io::osequencestream oss(out_edges); +//TODO: refactor to process the graph only once +void DumpEdges(const Graph& g, const string& out_edges) { + INFO("Dumping edges to " << out_edges); + io::OutputSequenceStream oss(out_edges); + for (auto it = g.ConstEdgeBegin(true); !it.IsEnd(); ++it) { + EdgeId e = *it; + oss << io::SingleRead("NODE_" + std::to_string(g.int_id(e)), g.EdgeNucls(e).str()); + } +} + +void DumpAnnotation(const Graph& g, const EdgeAnnotation& edge_annotation, const string& out_annotation) { + INFO("Dumping annotation to " << out_annotation); AnnotationOutStream annotation_out(out_annotation); for (auto it = g.ConstEdgeBegin(true); !it.IsEnd(); ++it) { EdgeId e = *it; - io::SingleRead edge_read("NODE_" + ToString(g.int_id(e)), - g.EdgeNucls(e).str()); - oss << edge_read; auto relevant_bins = edge_annotation.Annotation(e); if (!relevant_bins.empty()) { - annotation_out << ContigAnnotation(GetId(edge_read), + annotation_out << ContigAnnotation("NODE_" + std::to_string(g.int_id(e)), vector(relevant_bins.begin(), relevant_bins.end())); } } @@ -48,10 +51,10 @@ create_console_logger(); size_t k; - string saves_path, contigs_path, splits_path, annotation_path; + string saves_path, contigs_path, splits_path, annotation_path, bins_file; vector sample_names, left_reads, right_reads; - string out_root, propagation_dump; - vector bins_of_interest; + string out_root, edges_dump, propagation_dump; + size_t length_threshold; bool no_binning; try { GetOpt_pp ops(argc, argv); @@ -65,19 +68,29 @@ >> Option('l', left_reads) >> Option('r', right_reads) >> Option('o', out_root) - >> Option('d', propagation_dump, "") - >> Option('b', bins_of_interest, {}) - >> OptionPresent('p', no_binning); + >> Option('p', propagation_dump, "") + >> Option('e', edges_dump, "") + >> Option('b', bins_file) + >> Option('t', length_threshold, (size_t)2000) + >> OptionPresent('D', "no-binning", no_binning) + ; } catch(GetOptEx &ex) { cout << "Usage: prop_binning -k -s -c -f " - "-a -n -l -r -o " - "[-d ] [-p to disable binning] [-b *]" << endl; + "-a -n -l -r " + "-o -b [-D to disable binning] " + "[-p ] [-e ]" << endl; exit(1); } - for (const auto& bin_id : bins_of_interest) { - VERIFY_MSG(bin_id.find_last_of(',') == std::string::npos, "Specify bins of interest via space, not comma"); + vector bins_of_interest; + ifstream bins_stream(bins_file); + bin_id bin; + while (!bins_stream.eof()) { + bins_stream >> bin; + bins_of_interest.push_back(bin); + bins_stream.ignore(numeric_limits::max(), '\n'); //Skip the rest of bin info } + INFO("Loaded " << bins_of_interest.size() << " interesting 
bins"); conj_graph_pack gp(k, "tmp", 1); gp.kmer_mapper.Attach(); @@ -96,33 +109,28 @@ EdgeAnnotation edge_annotation = filler(contigs_stream, split_stream, annotation_in); INFO("Propagation launched"); - AnnotationPropagator propagator(gp); + AnnotationPropagator propagator(gp, length_threshold); propagator.Run(contigs_stream, edge_annotation); INFO("Propagation finished"); + if (!edges_dump.empty()) { + INFO("Dumping propagated edges to " << edges_dump); + DumpEdges(gp.g, edges_dump); + } + if (!propagation_dump.empty()) { - INFO("Dumping propagation info to " << propagation_dump); - DumpEdgesAndAnnotation(gp.g, edge_annotation, - propagation_dump + ".fasta", - propagation_dump + ".ann"); + INFO("Dumping propagated annotation to " << propagation_dump); + DumpAnnotation(gp.g, edge_annotation, propagation_dump); } + //Binning stage if (no_binning) { INFO("Binning was disabled with -p flag"); return 0; } - //Binning stage -// contigs_stream.reset(); -// INFO("Using propagated annotation from " << propagated_path); -// AnnotationStream binning_stream(propagated_path); - for (size_t i = 0; i < sample_names.size(); ++i) { - ContigBinner binner(gp, edge_annotation, out_root, sample_names[i]); - INFO("Initializing binner for " << sample_names[i]); - auto paired_stream = io::PairedEasyStream(left_reads[i], right_reads[i], false, 0); - INFO("Running binner on " << left_reads[i] << " and " << right_reads[i]); - binner.Run(*paired_stream); - binner.close(); - } + + for (size_t i = 0; i < sample_names.size(); ++i) + BinReads(gp, out_root, sample_names[i], left_reads[i], right_reads[i], edge_annotation, bins_of_interest); return 0; } diff -Nru spades-3.10.1+dfsg/src/projects/mts/read_binning.cpp spades-3.11.1+dfsg/src/projects/mts/read_binning.cpp --- spades-3.10.1+dfsg/src/projects/mts/read_binning.cpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/read_binning.cpp 2017-09-28 09:05:13.000000000 +0000 @@ -5,7 +5,7 @@ //* See file LICENSE for details. 
diff -Nru spades-3.10.1+dfsg/src/projects/mts/read_binning.cpp spades-3.11.1+dfsg/src/projects/mts/read_binning.cpp
--- spades-3.10.1+dfsg/src/projects/mts/read_binning.cpp	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/mts/read_binning.cpp	2017-09-28 09:05:13.000000000 +0000
@@ -5,7 +5,7 @@
 //* See file LICENSE for details.
 //***************************************************************************
 
-#include "utils/simple_tools.hpp"
+#include "utils/stl_utils.hpp"
 #include "utils/logger/log_writers.hpp"
 #include "pipeline/graphio.hpp"
@@ -19,10 +19,12 @@
 }
 
 void ContigBinner::Init(bin_id bin) {
-    string out_dir = out_root_ + "/" + ToString(bin) + "/";
-    path::make_dirs(out_dir);
-    out_streams_.insert(make_pair(bin, make_shared<Stream>(out_dir + sample_name_ + "_1.fastq",
-                                                           out_dir + sample_name_ + "_2.fastq")));
+    string out_dir = out_root_ + "/" + bin + "/";
+    fs::make_dirs(out_dir);
+    out_streams_.insert(make_pair(bin, make_shared<Stream>(
+        out_dir + sample_name_ + "_1.fastq.gz",
+        out_dir + sample_name_ + "_2.fastq.gz")
+    ));
 }
 
 void ContigBinner::Run(io::PairedStream& paired_reads) {
@@ -30,9 +32,13 @@
     while (!paired_reads.eof()) {
         paired_reads >> paired_read;
         set<bin_id> bins;
-        insert_all(bins, RelevantBins(paired_read.first()));
-        insert_all(bins, RelevantBins(paired_read.second()));
-        for (auto bin : bins) {
+        utils::insert_all(bins, RelevantBins(paired_read.first()));
+        utils::insert_all(bins, RelevantBins(paired_read.second()));
+        for (const auto& bin : bins) {
+            if (bins_of_interest_.size() && !bins_of_interest_.count(bin)) {
+                INFO(bin << " was excluded from read binning");
+                continue;
+            }
             if (out_streams_.find(bin) == out_streams_.end()) {
                 Init(bin);
             }
@@ -41,50 +47,18 @@
     }
 }
 
-};
-
-//todo make it take dataset info
-/*
-int main(int argc, char** argv) {
-    using namespace debruijn_graph;
-
-    if (argc < 9) {
-        cout << "Usage: read_binning <K> <saves path> <contigs path> <contigs binning path> "
-                "<left reads> <right reads> <output root> <sample name> (<bins of interest>)*" << endl;
-        exit(1);
-    }
-
-    //TmpFolderFixture fixture("tmp");
-    create_console_logger();
-    size_t k = lexical_cast<size_t>(argv[1]);
-    string saves_path = argv[2];
-    string contigs_path = argv[3];
-    string contigs_binning_path = argv[4];
-    string left_reads = argv[5];
-    string right_reads = argv[6];
-    string out_root = argv[7];
-    string sample_name = argv[8];
-
-    std::vector<bin_id> bins_of_interest;
-    for (int i = 9; i < argc; ++i) {
-        bins_of_interest.push_back(argv[i]);
-    }
-
-    conj_graph_pack gp(k, "tmp", 0);
-    gp.kmer_mapper.Attach();
-    INFO("Load graph from " << saves_path);
-    graphio::ScanGraphPack(saves_path, gp);
-
-    ContigBinner binner(gp, bins_of_interest);
-
-    auto contigs_stream_ptr = make_shared(contigs_path);
-    AnnotationStream binning_stream(contigs_binning_path);
-
-    binner.Init(out_root, sample_name, *contigs_stream_ptr, binning_stream);
-
+int BinReads(const conj_graph_pack& gp, const std::string& out_root,
+             const std::string& sample,
+             const std::string& left_reads, const std::string& right_reads,
+             const EdgeAnnotation& edge_annotation,
+             const vector<bin_id>& bins_of_interest) {
+    ContigBinner binner(gp, edge_annotation, out_root, sample, bins_of_interest);
+    INFO("Initializing binner for " << sample);
     auto paired_stream = io::PairedEasyStream(left_reads, right_reads, false, 0);
+    INFO("Running binner on " << left_reads << " and " << right_reads);
     binner.Run(*paired_stream);
     binner.close();
     return 0;
 }
-*/
+
+};
diff -Nru spades-3.10.1+dfsg/src/projects/mts/read_binning.hpp spades-3.11.1+dfsg/src/projects/mts/read_binning.hpp
--- spades-3.10.1+dfsg/src/projects/mts/read_binning.hpp	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/mts/read_binning.hpp	2017-09-28 09:05:13.000000000 +0000
@@ -8,15 +8,17 @@
 
 #include "annotation.hpp"
 #include "io/reads/io_helper.hpp"
+#include "gzstream/gzstream.h"
 
 namespace io {
 
+template<class Stream>
 class OSingleReadStream {
-    std::ofstream os_;
+    Stream os_;
 
 public:
-    OSingleReadStream(const std::string& fn) :
-        os_(fn) {
+    OSingleReadStream(const std::string& fn) {
+        os_.open(fn.c_str());
     }
 
     OSingleReadStream& operator<<(const SingleRead& read) {
@@ -32,9 +34,10 @@
     }
 };
 
+template<class Stream>
 class OPairedReadStream {
-    OSingleReadStream l_os_;
-    OSingleReadStream r_os_;
+    OSingleReadStream<Stream> l_os_;
+    OSingleReadStream<Stream> r_os_;
 
 public:
     OPairedReadStream(const std::string& l_fn, const std::string& r_fn) :
@@ -63,23 +66,27 @@
     std::string out_root_;
     std::string sample_name_;
     shared_ptr<SequenceMapper<Graph>> mapper_;
+    std::set<bin_id> bins_of_interest_;
 
-    map<bin_id, shared_ptr<io::OPairedReadStream>> out_streams_;
+    typedef io::OPairedReadStream<ogzstream> Stream;
+    map<bin_id, shared_ptr<Stream>> out_streams_;
 
     set<bin_id> RelevantBins(const io::SingleRead& r) const;
     void Init(bin_id bin);
 
 public:
-    ContigBinner(const conj_graph_pack& gp,
+    ContigBinner(const conj_graph_pack& gp,
                  const EdgeAnnotation& edge_annotation,
                  const std::string& out_root,
-                 const std::string& sample_name) :
+                 const std::string& sample_name,
+                 const std::vector<bin_id>& bins_of_interest = {}) :
                      gp_(gp), edge_annotation_(edge_annotation),
                      out_root_(out_root), sample_name_(sample_name),
-                     mapper_(MapperInstance(gp)) {
+                     mapper_(MapperInstance(gp)),
+                     bins_of_interest_(bins_of_interest.begin(), bins_of_interest.end()) {
     }
 
     void Run(io::PairedStream& paired_reads);
@@ -89,4 +96,10 @@
 }
 
 };
 
+int BinReads(const conj_graph_pack& gp, const std::string& out_root,
+             const std::string& sample,
+             const std::string& left_reads, const std::string& right_reads,
+             const EdgeAnnotation& edge_annotation,
+             const vector<bin_id>& bins_of_interest);
+
 }
diff -Nru spades-3.10.1+dfsg/src/projects/mts/README spades-3.11.1+dfsg/src/projects/mts/README
--- spades-3.10.1+dfsg/src/projects/mts/README	2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/mts/README	2017-09-28 09:05:13.000000000 +0000
@@ -8,14 +8,15 @@
 > conda install -c bioconda snakemake
 
 2. Running MTS
-Make a directory for output, place config.yaml there, and configure it. Then run
-> snakemake --directory <output directory> --cores XX
+To assemble a dataset, prepare a config.yaml; the provided template is mostly self-explanatory. Then run
+> ./mts.py -c <config.yaml> -t XX --directory <output directory>
 
 3. Gathering stats
-To render some interesting info, you need to specify some references in config:
-REFS: path
+To enable stats for some stage(s), add the following section to the config:
+stats:
+    stage: [stat1, stat2]
+The "gf" stats require references provided with the "refs" parameter:
+    refs: path
 or
-REFS: [path1, path2, ...]
+    refs: [path1, path2, ...]
 where path can be either a single reference or a folder with references.
-Then run the stats target manually:
-> snakemake --directory <output directory> stats_all
diff -Nru spades-3.10.1+dfsg/src/projects/mts/Reassembly.snake spades-3.11.1+dfsg/src/projects/mts/Reassembly.snake
--- spades-3.10.1+dfsg/src/projects/mts/Reassembly.snake	1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/mts/Reassembly.snake	2017-09-28 09:05:13.000000000 +0000
@@ -0,0 +1,74 @@
+include: "Common.snake"
+
+import yaml
+
+READ_LENGTH = config.get("read_length", 100) #TODO: auto-detect default value
+
+BINS, = glob_wildcards("binning/{{bin,{}}}.info".format(NAME_TEMPLATE))
+if not BINS:
+    raise WorkflowError("No bins detected!")
+
+rule reassemble_all:
+    input: expand("reassembly/bins/{bin}.fasta", bin=BINS)
+    message: "Reassembly finished."
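Reassembly.snake (continued below) and choose_samples.py (further down in this diff) communicate through per-bin binning/{bin}.info files: the first line records the bin's total abundance, each following line holds a sample name and its mean abundance, and a "+" prefix marks the samples whose reads were selected for reassembly. A small Python reader for that format, inferred from the writing and reading code in this diff; the helper name is illustrative:

    # Illustrative reader for a binning/{bin}.info file; mirrors the check in
    # Reassembly.snake, where a leading '+' marks a sample chosen for reassembly.
    def chosen_samples(info_path):
        samples = []
        with open(info_path) as samples_info:
            for line in samples_info:
                fields = line.split()
                # e.g. "+sample3 37.5" is selected, "sample4 2.1" is not;
                # the leading "total <abundance>" line is skipped as well.
                if fields and fields[0].startswith("+"):
                    samples.append(fields[0][1:])
        return samples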
+ +rule reassembly_config: + input: "binning/{bin}.info" + output: "reassembly/{bin}.yaml" + message: "Generating config file for reassembly of {wildcards.bin}" + run: + with open(output[0], "w") as outfile: + conf = {"k": PROFILE_K, "sample_cnt": SAMPLE_COUNT, + "kmer_mult": "profile/mts/kmers", + "bin": wildcards.bin, "bin_prof": "binning/{}/bins.prof".format(BINNER), + "edges_sqn": "reassembly/{}_edges.fasta".format(wildcards.bin), + "edges_mpl": "reassembly/{}_edges.mpl".format(wildcards.bin), + "edge_fragments_mpl": "reassembly/{}_edges_frag.mpl".format(wildcards.bin), + "frag_size": SPLIT_LENGTH, "min_len": 100} + yaml.dump(conf, outfile) + +#For bin reassembly, we choose only matching reads from samples which were marked with + +#in the corresponding {bin}.info file +BIN_READS = dict() +for bin in BINS: + with open("binning/{}.info".format(bin)) as samples_info: + samples = [] + for line in samples_info: + sample_data = line.split() + if sample_data[0][0] == "+": + sample = sample_data[0][1:] + if not os.path.exists("binning/{}/{}_1.fastq.gz".format(bin, sample)): + print("\033[33mWarning: {} contains no reads for {}\033[0m".format(sample, bin)) + continue + samples.append(sample) + BIN_READS[bin] = (["binning/{}/{}_1.fastq.gz".format(bin, sample) for sample in samples], + ["binning/{}/{}_2.fastq.gz".format(bin, sample) for sample in samples]) + +def bin_reads(dir, wildcards): + return BIN_READS[wildcards["bin"]][dir] + +#Returns the filepath with left/right reads for a sample or list of them for a group, used as Snakemake input +left_bin_reads, right_bin_reads = reads_input(bin_reads) + +rule reassemble: + input: info="binning/{bin}.info", config="reassembly/{bin}.yaml" + output: "reassembly/bins/{bin}.fasta" + params: out="reassembly/{bin}", + left=lambda w: " ".join(expand("--pe1-1 {r}", r=left_bin_reads(w))), + right=lambda w: " ".join(expand("--pe1-2 {r}", r=right_bin_reads(w))), + log: "reassembly/{bin}.log" + threads: THREADS + message: "Reassembling reads for {wildcards.bin}" + run: + COV_THRESHOLD = 0.3 + subtotal = 0 + with open(input.info) as samples_info: + for line in samples_info: + sample_data = line.split() + if sample_data[0][0] == "+": + subtotal += float(sample_data[1]) + bin_cov = COV_THRESHOLD * subtotal * READ_LENGTH / (READ_LENGTH - PROFILE_K) + shell("{REASSEMBLER_DIR}/spades.py --only-assembler -t {threads}" + " {params.left} {params.right} --pe1-ff -o {params.out}" + " --hidden-cov-cutoff {bin_cov} --series-analysis {input.config}" + " >{log} 2>&1 && cp {params.out}/scaffolds.fasta {output}") diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/bhtsne.py spades-3.11.1+dfsg/src/projects/mts/scripts/bhtsne.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/bhtsne.py 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/bhtsne.py 2017-09-28 09:05:14.000000000 +0000 @@ -0,0 +1,242 @@ +#!/usr/bin/env python + +''' +A simple Python wrapper for the bh_tsne binary that makes it easier to use it +for TSV files in a pipeline without any shell script trickery. + +Note: The script does some minimal sanity checking of the input, but don't + expect it to cover all cases. After all, it is a just a wrapper. 
+ +Example: + + > echo -e '1.0\t0.0\n0.0\t1.0' | ./bhtsne.py -d 2 -p 0.1 + -2458.83181442 -6525.87718385 + 2458.83181442 6525.87718385 + +The output will not be normalised, maybe the below one-liner is of interest?: + + python -c 'import numpy; from sys import stdin, stdout; + d = numpy.loadtxt(stdin); d -= d.min(axis=0); d /= d.max(axis=0); + numpy.savetxt(stdout, d, fmt="%.8f", delimiter="\t")' + +Authors: Pontus Stenetorp + Philippe Remy +Version: 2016-03-08 +''' + +# Copyright (c) 2013, Pontus Stenetorp +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +from argparse import ArgumentParser, FileType +from os.path import abspath, dirname, isfile, join as path_join +from shutil import rmtree +from struct import calcsize, pack, unpack +from subprocess import Popen +from sys import stderr, stdin, stdout +from tempfile import mkdtemp +from platform import system +from os import devnull +import numpy as np +import os, sys +import io + +### Constants +IS_WINDOWS = True if system() == 'Windows' else False +BH_TSNE_BIN_PATH = '/home/ygorshkov/opt/bh_tsne' #path_join(dirname(__file__), 'windows', 'bh_tsne.exe') if IS_WINDOWS else path_join(dirname(__file__), 'bh_tsne') +assert isfile(BH_TSNE_BIN_PATH), ('Unable to find the bh_tsne binary in the ' + 'same directory as this script, have you forgotten to compile it?: {}' + ).format(BH_TSNE_BIN_PATH) +# Default hyper-parameter values from van der Maaten (2014) +# https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf (Experimental Setup, page 13) +DEFAULT_NO_DIMS = 2 +INITIAL_DIMENSIONS = 50 +DEFAULT_PERPLEXITY = 50 +DEFAULT_THETA = 0.5 +EMPTY_SEED = -1 +DEFAULT_USE_PCA = True +DEFAULT_MAX_ITERATIONS = 1000 + +### + +def _argparse(): + argparse = ArgumentParser('bh_tsne Python wrapper') + argparse.add_argument('-d', '--no_dims', type=int, + default=DEFAULT_NO_DIMS) + argparse.add_argument('-p', '--perplexity', type=float, + default=DEFAULT_PERPLEXITY) + # 0.0 for theta is equivalent to vanilla t-SNE + argparse.add_argument('-t', '--theta', type=float, default=DEFAULT_THETA) + argparse.add_argument('-r', '--randseed', type=int, default=EMPTY_SEED) + argparse.add_argument('-n', '--initial_dims', type=int, default=INITIAL_DIMENSIONS) + argparse.add_argument('-v', '--verbose', action='store_true') + argparse.add_argument('-i', '--input', type=FileType('r'), default=stdin) + argparse.add_argument('-o', '--output', type=FileType('w'), + default=stdout) + argparse.add_argument('--use_pca', action='store_true') + argparse.add_argument('--no_pca', dest='use_pca', action='store_false') + argparse.set_defaults(use_pca=DEFAULT_USE_PCA) + argparse.add_argument('-m', '--max_iter', type=int, default=DEFAULT_MAX_ITERATIONS) + return argparse + + +def _read_unpack(fmt, fh): + return unpack(fmt, fh.read(calcsize(fmt))) + + +def _is_filelike_object(f): + try: + return isinstance(f, (file, 
io.IOBase)) + except NameError: + # 'file' is not a class in python3 + return isinstance(f, io.IOBase) + + +def init_bh_tsne(samples, workdir, no_dims=DEFAULT_NO_DIMS, initial_dims=INITIAL_DIMENSIONS, perplexity=DEFAULT_PERPLEXITY, + theta=DEFAULT_THETA, randseed=EMPTY_SEED, verbose=False, use_pca=DEFAULT_USE_PCA, max_iter=DEFAULT_MAX_ITERATIONS): + + if use_pca: + samples = samples - np.mean(samples, axis=0) + cov_x = np.dot(np.transpose(samples), samples) + [eig_val, eig_vec] = np.linalg.eig(cov_x) + + # sorting the eigen-values in the descending order + eig_vec = eig_vec[:, eig_val.argsort()[::-1]] + + if initial_dims > len(eig_vec): + initial_dims = len(eig_vec) + + # truncating the eigen-vectors matrix to keep the most important vectors + eig_vec = eig_vec[:, :initial_dims] + samples = np.dot(samples, eig_vec) + + # Assume that the dimensionality of the first sample is representative for + # the whole batch + sample_dim = len(samples[0]) + sample_count = len(samples) + + # Note: The binary format used by bh_tsne is roughly the same as for + # vanilla tsne + with open(path_join(workdir, 'data.dat'), 'wb') as data_file: + # Write the bh_tsne header + data_file.write(pack('iiddii', sample_count, sample_dim, theta, perplexity, no_dims, max_iter)) + # Then write the data + for sample in samples: + data_file.write(pack('{}d'.format(len(sample)), *sample)) + # Write random seed if specified + if randseed != EMPTY_SEED: + data_file.write(pack('i', randseed)) + +def load_data(input_file): + # Read the data, using numpy's good judgement + return np.loadtxt(input_file) + +def bh_tsne(workdir, verbose=False): + + # Call bh_tsne and let it do its thing + with open(devnull, 'w') as dev_null: + bh_tsne_p = Popen((abspath(BH_TSNE_BIN_PATH), ), cwd=workdir, + # bh_tsne is very noisy on stdout, tell it to use stderr + # if it is to print any output + stdout=stderr if verbose else dev_null) + bh_tsne_p.wait() + assert not bh_tsne_p.returncode, ('ERROR: Call to bh_tsne exited ' + 'with a non-zero return code exit status, please ' + + ('enable verbose mode and ' if not verbose else '') + + 'refer to the bh_tsne output for further details') + + # Read and pass on the results + with open(path_join(workdir, 'result.dat'), 'rb') as output_file: + # The first two integers are just the number of samples and the + # dimensionality + result_samples, result_dims = _read_unpack('ii', output_file) + # Collect the results, but they may be out of order + results = [_read_unpack('{}d'.format(result_dims), output_file) + for _ in range(result_samples)] + # Now collect the landmark data so that we can return the data in + # the order it arrived + results = [(_read_unpack('i', output_file), e) for e in results] + # Put the results in order and yield it + results.sort() + for _, result in results: + yield result + # The last piece of data is the cost for each sample, we ignore it + #read_unpack('{}d'.format(sample_count), output_file) + +def run_bh_tsne(data, no_dims=2, perplexity=50, theta=0.5, randseed=-1, verbose=False, initial_dims=50, use_pca=True, max_iter=1000): + ''' + Run TSNE based on the Barnes-HT algorithm + + Parameters: + ---------- + data: file or numpy.array + The data used to run TSNE, one sample per row + no_dims: int + perplexity: int + randseed: int + theta: float + initial_dims: int + verbose: boolean + use_pca: boolean + max_iter: int + ''' + + # bh_tsne works with fixed input and output paths, give it a temporary + # directory to work in so we don't clutter the filesystem + tmp_dir_path = mkdtemp() + 
+ # Load data in forked process to free memory for actual bh_tsne calculation + child_pid = os.fork() + if child_pid == 0: + if _is_filelike_object(data): + data = load_data(data) + + init_bh_tsne(data, tmp_dir_path, no_dims=no_dims, perplexity=perplexity, theta=theta, randseed=randseed,verbose=verbose, initial_dims=initial_dims, use_pca=use_pca, max_iter=max_iter) + sys.exit(0) + else: + try: + os.waitpid(child_pid, 0) + except KeyboardInterrupt: + print("Please run this program directly from python and not from ipython or jupyter.") + print("This is an issue due to asynchronous error handling.") + + res = [] + for result in bh_tsne(tmp_dir_path, verbose): + sample_res = [] + for r in result: + sample_res.append(r) + res.append(sample_res) + rmtree(tmp_dir_path) + return np.asarray(res, dtype='float64') + + +def main(args): + parser = _argparse() + + if len(args) <= 1: + print(parser.print_help()) + return + + argp = parser.parse_args(args[1:]) + + for result in run_bh_tsne(argp.input, no_dims=argp.no_dims, perplexity=argp.perplexity, theta=argp.theta, randseed=argp.randseed, + verbose=argp.verbose, initial_dims=argp.initial_dims, use_pca=argp.use_pca, max_iter=argp.max_iter): + fmt = '' + for i in range(1, len(result)): + fmt = fmt + '{}\t' + fmt = fmt + '{}\n' + argp.output.write(fmt.format(*result)) + +if __name__ == '__main__': + from sys import argv + exit(main(argv)) diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/bin_profiles.py spades-3.11.1+dfsg/src/projects/mts/scripts/bin_profiles.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/bin_profiles.py 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/bin_profiles.py 2017-09-28 09:05:14.000000000 +0000 @@ -0,0 +1,12 @@ +#!/usr/bin/env python +from __future__ import (print_function) + +import pandas +from pandas import DataFrame +import sys + +profiles_in = pandas.read_table(sys.argv[1], index_col=0, header=None) +binning_out = pandas.read_table(sys.argv[2], index_col=0, names=["bin"], dtype=str) +table = profiles_in.join(binning_out) +profiles = table.groupby("bin").median() +profiles.to_csv(sys.stdout, sep="\t", header=False) diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/calc_kmers_mpl.py spades-3.11.1+dfsg/src/projects/mts/scripts/calc_kmers_mpl.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/calc_kmers_mpl.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/calc_kmers_mpl.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 - -import os -import argparse - -def parse_args(): - parser = argparse.ArgumentParser(description="Kmers mpl filter") - parser.add_argument("-om", "--one-min", default=3, type=int, help="min kmer mpl in one sample") - parser.add_argument("-am", "--all-min", default=3, type=int, help="min kmer mpl in all samples") - parser.add_argument("-kl", "--kmer-len", default=31, type=int, help="kmer length") - parser.add_argument("samples_dir", help="directory with samples") - parser.add_argument("output", help="output files prefix") - args = parser.parse_args() - return args - -def calc_mpl(args): - if not os.path.exists(args.samples_dir): - os.makedirs(args.samples_dir) - - files = [f for f in os.listdir(args.samples_dir) if os.path.isfile(os.path.join(args.samples_dir, f))] - - cmd = "/home/toxa31/work/algorithmic-biology/assembler/src/kmer_count_filter/kmer_count_filter -kl {} -one-min {} -all-min {}".format( - args.kmer_len, args.one_min, args.all_min) - - for f in files: - cmd = cmd + " " + 
args.samples_dir + "/" + f - - cmd = cmd + " " + args.output - - print(cmd) - - os.system(cmd) - -def main(): - args = parse_args() - calc_mpl(args) - -main() \ No newline at end of file diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/canopy_launch.sh spades-3.11.1+dfsg/src/projects/mts/scripts/canopy_launch.sh --- spades-3.10.1+dfsg/src/projects/mts/scripts/canopy_launch.sh 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/canopy_launch.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,17 +0,0 @@ -#!/bin/bash - -if [ "$#" -lt 3 ]; then - echo "Usage: script.sh [thread_cnt = 4]" - exit -fi - -thread_cnt=4 -if [ "$#" -ge 4 ]; then - thread_cnt=$4 -fi - -/home/snurk/soft/mgs-canopy-algorithm/src/cc.bin -n $thread_cnt -i $1 -o $2 -c $3 #--max_canopy_dist 0.1 --max_close_dist 0.4 --max_merge_dist 0.05 --min_step_dist 0.01 --max_num_canopy_walks 3 --stop_fraction 1 --canopy_size_stats_file stat --filter_min_obs 1 --filter_max_dominant_obs 1.0 - -#/home/snurk/soft/canopy/cc.bin -n 32 -i $1 -o bin_canopy -c prof_canopy --max_canopy_dist 0.1 --max_close_dist 0.4 --max_merge_dist 0.05 --min_step_dist 0.01 --max_num_canopy_walks 3 --stop_fraction 1 --canopy_size_stats_file stat --filter_min_obs 1 --filter_max_dominant_obs 1.0 - -#/home/ygorshkov/Projects/canopy/cc.bin -n 32 -i canopy_mod.in -o bin_canopy -c prof_canopy --max_canopy_dist 0.1 --max_close_dist 0.4 --max_merge_dist 0.1 --min_step_dist 0.005 --max_num_canopy_walks 5 --stop_fraction 1 --canopy_size_stats_file stat diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/choose_bins.py spades-3.11.1+dfsg/src/projects/mts/scripts/choose_bins.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/choose_bins.py 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/choose_bins.py 2017-09-28 09:05:14.000000000 +0000 @@ -0,0 +1,20 @@ +#!/usr/bin/env python +from __future__ import (print_function) + +import re +import sys + +from common import contig_length +import numpy +import pandas +from pandas import DataFrame + +in_fn = sys.argv[1] +d = pandas.read_table(sys.argv[1], names=["name", "bin"], dtype=str) +d["sample"] = d.apply(lambda row: re.findall("\\w+\\d+", row["name"])[0], axis=1) +d["length"] = d.apply(lambda row: contig_length(row["name"]), axis=1) +del d["name"] +info = d.groupby(["bin", "sample"], as_index=False).sum() +info = info.groupby("bin", as_index=False)["length"].max() +info = info[info["length"] > 500000] +info.to_csv(sys.stdout, sep="\t", header=False, index=False) diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/choose_samples.py spades-3.11.1+dfsg/src/projects/mts/scripts/choose_samples.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/choose_samples.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/choose_samples.py 2017-09-28 09:05:14.000000000 +0000 @@ -1,61 +1,90 @@ -#!/usr/bin/python +#!/usr/bin/env python from __future__ import (print_function) -import glob from operator import itemgetter -from os import path -import subprocess +#import subprocess +import os.path import sys +import yaml if len(sys.argv) < 3: - print("Usage: choose_samples.py [CAGS+]") + print("Usage: choose_samples.py ") exit(1) PROF = sys.argv[1] -DIR = sys.argv[2] -CAGS = None -if len(sys.argv) == 4: - CAGS = set(sys.argv[3:]) -DESIRED_ABUNDANCE = 50 +FILTERED_BINS = sys.argv[2] +PROF_OUT = sys.argv[3] +DIR = sys.argv[4] +BINS = set() +with open(FILTERED_BINS) as input: + for line in input: + bin = line.split()[0] + BINS.add(bin) + 
+DESIRED_ABUNDANCE = 999999 #sys.maxsize MIN_ABUNDANCE = 4 -MIN_TOTAL_ABUNDANCE = 20 +MIN_TOTAL_ABUNDANCE = 15 + +prof_dict = dict() + +make_excluded = True +excluded_dir = os.path.join(DIR, "excluded") #Assuming that samples are enumerated consecutively from 1 to N +#(it is forced by the pipeline) with open(PROF) as input: for line in input: + exclude = False + samples = [] params = line.split() - CAG = params[0] - if CAGS and CAG not in CAGS: - continue - profile = map(float, params[1:]) - - print("Profile of", CAG, ":", profile) - - weighted_profile = list((i, ab) - for i, ab in enumerate(profile) if ab >= MIN_ABUNDANCE and path.exists("{}/{}/sample{}_1.fastq".format(DIR, CAG, i + 1))) - weighted_profile.sort(key = itemgetter(1)) + print(params) + bin = params[0] + profile = list(map(float, params[1:])) + if bin not in BINS: + print(bin, "was excluded from reassembly") + exclude = True + else: + print("Profile of", bin, ":", profile) + + #Sort samples by their abundancies + weighted_profile = list((i, ab) + for i, ab in enumerate(profile) if ab >= MIN_ABUNDANCE) #and path.exists("{}/{}/sample{}_1.fastq".format(DIR, CAG, i + 1))) + weighted_profile.sort(key = itemgetter(1)) + + total = 0 + #If we have overabundant samples, use the least. + try: + i = next(x for x, _ in weighted_profile if profile[x] >= DESIRED_ABUNDANCE) + total = profile[i] + samples = [i + 1] + except StopIteration: + #If there isn't any, collect from samples, starting from the largest + for i, _ in reversed(weighted_profile): + total += profile[i] + samples.append(i + 1) + if total >= DESIRED_ABUNDANCE: + break + + print("Chosen samples are", samples, "with total mean abundance", total) + prof_dict[bin] = total + + if total < MIN_TOTAL_ABUNDANCE: + print(bin, "is too scarce; skipping") + exclude = True + + config_dir = DIR + if exclude: + if make_excluded and not os.path.isdir(excluded_dir): + os.mkdir(excluded_dir) + make_excluded = False + config_dir = excluded_dir + config_path = os.path.join(config_dir, bin + ".info") + with open(config_path, "w") as out: + print("total", sum(profile), file=out) + for i, ab in enumerate(profile, start=1): + if i in samples: + print("+", file=out, end="") + print("sample" + str(i), ab, file=out) - sum = 0 - samples = [] - #If we have overabundant samples, use the least. 
- try: - i = next(x for x, _ in weighted_profile if profile[x] >= DESIRED_ABUNDANCE) - sum = profile[i] - samples = [i + 1] - except StopIteration: - #If there isn't any, collect from samples, starting from the largest - for i, _ in reversed(weighted_profile): - sum += profile[i] - samples.append(i + 1) - if sum >= DESIRED_ABUNDANCE: - break - - print("Chosen samples are", samples, "with total mean abundance", sum) - if sum < MIN_TOTAL_ABUNDANCE: - print(CAG, "is too scarce; skipping") - continue - - for suf, name in [("1", "left"), ("2", "right")]: - reads = ["{}/{}/sample{}_{}.fastq".format(DIR, CAG, sample, suf) for sample in samples] - with open("{}/{}/{}.fastq".format(DIR, CAG, name), "w") as output: - subprocess.check_call(["cat"] + reads, stdout=output) +with open(PROF_OUT, "w") as prof_out: + yaml.dump(prof_dict, prof_out) diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/clusters2csv.py spades-3.11.1+dfsg/src/projects/mts/scripts/clusters2csv.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/clusters2csv.py 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/clusters2csv.py 2017-09-28 09:05:14.000000000 +0000 @@ -0,0 +1,22 @@ +#!/usr/bin/env python +from __future__ import print_function +import sys + +from Bio import SeqIO + +from os import listdir +from os.path import isfile, join + +if len(sys.argv) < 3: + print("Usage: %s " % sys.argv[0]) + sys.exit(1) + +path = sys.argv[1] + +with open(sys.argv[2], "w") as output: + for f in listdir(path): + if isfile(join(path, f)) and f.endswith("fna"): + cluster = f.split(".")[0].split("_")[-1] + record_dict = SeqIO.to_dict(SeqIO.parse(join(path, f), "fasta")) + for k in record_dict.keys(): + print(str(k) + "," + str(cluster), file=output) diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/combine_contigs.py spades-3.11.1+dfsg/src/projects/mts/scripts/combine_contigs.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/combine_contigs.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/combine_contigs.py 2017-09-28 09:05:14.000000000 +0000 @@ -2,8 +2,9 @@ from __future__ import print_function import sys import os -import re +import os.path from Bio import SeqIO +from common import sample_name replace = False @@ -13,16 +14,13 @@ else: files = sys.argv[1:] -sample_re = re.compile("sample\d+") - output = sys.stdout for file in files: - sample = sample_re.search(file).group(0) for seq in SeqIO.parse(file, "fasta"): seq_id = seq.id if replace: seq_id = seq_id.replace(",", "~") - seq.id = sample + "-" + seq_id + seq.id = sample_name(file) + "-" + seq_id seq.description = "" SeqIO.write(seq, output, "fasta") diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/common.py spades-3.11.1+dfsg/src/projects/mts/scripts/common.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/common.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/common.py 2017-09-28 09:05:14.000000000 +0000 @@ -6,24 +6,40 @@ import os import os.path -try: - import yaml - def load_dict(input): - return yaml.load(input) - def dump_dict(dict, output): - yaml.dump(dict, output) -except: - def load_dict(input): - def load_pairs(): - for line in input: - params = line.split(":", 2) - yield (params[0].strip(), params[1].strip()) - return dict(load_pairs()) - def dump_dict(dict, output): - for k, v in dict.items(): - print(k, ": ", v, sep="", file=output) +import re -FASTA_EXTS = {".fasta", ".fa", ".fna", ".fsa", ".fastq", ".fastq.gz", ".fq", ".fq.gz", ".fna.gz"} +default_values = { 
+ "threads": 16, + "assembly": {"assembler": "spades", "k": 55, "groups": []}, + "profile": {"profiler": "mts", "k": 21, "split": 10000}, + "binning": {"binner": "canopy", "min_length": 2000, "min_nonzeroes": 3}, + "propagation": {"enabled": True}, + "reassembly": {"enabled": True} +} + +# Taken from http://stackoverflow.com/questions/36831998/how-to-fill-default-parameters-in-yaml-file-using-python +def setdefault_recursively(tgt, default = default_values): + for k in default: + if isinstance(default[k], dict): # if the current item is a dict, + # expand it recursively + setdefault_recursively(tgt.setdefault(k, {}), default[k]) + else: + # ... otherwise simply set a default value if it's not set before + tgt.setdefault(k, default[k]) + +def fill_default_values(config): + local_dir = config.get("LOCAL_DIR") + if local_dir: + default_values["bin"] = os.path.join(local_dir, "build/release/bin") + default_values["scripts"] = os.path.join(local_dir, "src/projects/mts/scripts") + default_values["assembly"]["dir"] = os.path.join(local_dir, "bin") + setdefault_recursively(config) + config["reassembly"].setdefault("dir", config["assembly"].get("dir")) + +def sample_name(fullname): + return os.path.splitext(os.path.basename(fullname))[0] + +FASTA_EXTS = {".fasta", ".fasta.gz", ".fa", ".fna", ".fsa", ".fastq", ".fastq.gz", ".fq", ".fq.gz", ".fna.gz"} def gather_paths(path, basename=False): for filename in os.listdir(path): name = os.path.basename(filename) @@ -54,7 +70,7 @@ for ref in gather_paths(data, True): yield ref else: - yield (os.path.splitext(os.path.basename(data))[0], data) + yield (sample_name(data), data) def get_id(internal_id, sample): res = internal_id.split("_", 2)[1] @@ -65,7 +81,7 @@ sample, _ = os.path.splitext(os.path.basename(file)) with open(file) as input: for line in input: - info = line.split(" : ") + info = line.split("\t") id = get_id(info[0], sample) if normalize else info[0] bins = info[1].split() if id in res: @@ -74,6 +90,13 @@ res[id] = set(bins) return res +def contig_length(name): + if "(" in name: + start, end = re.search("\((\d+)_(\d+)\)", name).groups() + return int(end) - int(start) + else: + return int(name.split("_")[3]) + class Row: def __init__(self, data, colnames): self.data = data diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/contig_name_filter.py spades-3.11.1+dfsg/src/projects/mts/scripts/contig_name_filter.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/contig_name_filter.py 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/contig_name_filter.py 2017-09-28 09:05:14.000000000 +0000 @@ -0,0 +1,30 @@ +#!/usr/bin/env python +from __future__ import print_function + +import sys +from Bio import SeqIO + +if len(sys.argv) < 4: + print("Usage:", sys.argv[0], " []") + print("Operation mode is \"retain\" (default) or \"remove\"") + sys.exit(1) + +f_n = sys.argv[1] +names_f = open(sys.argv[2], "r") +names = set(l.strip() for l in names_f.readlines()) +input_seq_iterator = SeqIO.parse(open(f_n, "r"), "fasta") + +filtered_iterator = (record for record in input_seq_iterator \ + if record.name in names) + +if (len(sys.argv) == 5): + if sys.argv[4] == "remove": + filtered_iterator = (record for record in input_seq_iterator \ + if record.name not in names) + else: + if sys.argv[4] != "retain": + print("Wrong operation mode") + +output_handle = open(sys.argv[3], "w") +SeqIO.write(filtered_iterator, output_handle, "fasta") +output_handle.close() diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/convert_output.py 
spades-3.11.1+dfsg/src/projects/mts/scripts/convert_output.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/convert_output.py 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/convert_output.py 2017-09-28 09:05:14.000000000 +0000 @@ -0,0 +1,53 @@ +#!/usr/bin/env python +from __future__ import print_function + +import argparse +import os.path +import re + +argparser = argparse.ArgumentParser(description="Binner output formatter") +argparser.add_argument("--type", "-t", choices=["canopy", "concoct", "maxbin", "gattaca", "binsanity"], help="Binner type", default="canopy") +argparser.add_argument("--output", "-o", type=str, help="Output directory with unified binning results") +argparser.add_argument("input", type=str, help="File with binning info") + +extract_num = re.compile("\d+") + +class Parser: + def __init__(self): + self.bins = [] + + def add(self, line): + sample_contig, bin_id = self.parse(line) + bin_num = extract_num.findall(bin_id)[0] + self.bins.append(sample_contig + "\t" + "BIN" + bin_num) + + def parse_file(self, file): + with open(file, "r") as input_file: + for line in input_file: + self.add(line) + +class CanopyParser(Parser): + def parse(self, line): + annotation_str = line.split() + bin_id = annotation_str[0].strip() + sample_contig = annotation_str[1].strip() + return (sample_contig, bin_id) + +class ConcoctParser(Parser): + def parse(self, line): + annotation_str = line.split(",", 1) + bin_id = annotation_str[1].strip() + sample_contig = annotation_str[0].replace("~", ",") + return (sample_contig, bin_id) + +parsers = {"canopy": CanopyParser(), "concoct": ConcoctParser(), "maxbin": ConcoctParser(), "gattaca": ConcoctParser(), "binsanity": ConcoctParser()} + +if __name__ == "__main__": + args = argparser.parse_args() + parser = parsers[args.type] + + parser.parse_file(args.input) + + with open(args.output, "w") as sample_out: + for sample in parser.bins: + print(sample, file=sample_out) diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/cut_fasta.py spades-3.11.1+dfsg/src/projects/mts/scripts/cut_fasta.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/cut_fasta.py 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/cut_fasta.py 2017-09-28 09:05:14.000000000 +0000 @@ -0,0 +1,45 @@ +#!/usr/bin/env python2 +"""Cut up fasta file in non-overlapping or overlapping parts of equal length. +""" +import argparse +from Bio import SeqIO + + +def cut_up_fasta(fastfiles, chunk_size, overlap, merge_last): + for ff in fastfiles: + for record in SeqIO.parse(ff, "fasta"): + if (not merge_last and len(record.seq) > chunk_size) or (merge_last and len(record.seq) >= 2 * chunk_size): + i = 0 + for split_seq in chunks(record.seq, chunk_size, overlap, merge_last): + start = i*chunk_size + end = start + len(split_seq) + print ">%s_(%i_%i)\n%s" % (record.id, start, end, split_seq) + i = i + 1 + else: + print ">%s\n%s" % (record.id, record.seq) + + +def chunks(l, n, o, merge_last): + """ Yield successive n-sized chunks from l with given overlap o between the + chunks. 
+ """ + assert n > o + + if not merge_last: + for i in xrange(0, len(l), n - o): + yield l[i:i + n] + else: + for i in xrange(0, len(l) - n + 1, n - o): + yield l[i:i + n] if i + n + n - o <= len(l) else l[i:] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument( + "contigs", nargs="+", help="Fasta files with contigs\n") + parser.add_argument("-c", "--chunk_size", default=1999, type=int, help="Chunk size\n") + parser.add_argument("-o", "--overlap_size", default=1900, type=int, help="Overlap size\n") + parser.add_argument("-m", "--merge_last", default=False, action="store_true", help="Concatenate final part to last contig\n") + args = parser.parse_args() + cut_up_fasta(args.contigs, args.chunk_size, args.overlap_size, args.merge_last) diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/Dataset_analysis.ipynb spades-3.11.1+dfsg/src/projects/mts/scripts/Dataset_analysis.ipynb --- spades-3.10.1+dfsg/src/projects/mts/scripts/Dataset_analysis.ipynb 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/Dataset_analysis.ipynb 2017-09-28 09:05:14.000000000 +0000 @@ -0,0 +1,2246 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# Load contig profiles and binning results.\n", + "# TODO: Make profiles and binning results the same for all algos.\n", + "profile = pd.read_csv(\"/Users/tanunia/PycharmProjects/biolab_scripts/canopy_profiles.in\", sep=\" \", header=None)\n", + "clusters = pd.read_csv(\"/Users/tanunia/PycharmProjects/biolab_scripts/canopy_binning.tsv\", sep=\"\\t\", header=None)\n", + "\n", + "# Add binning column to profile\n", + "clusters = clusters.rename(columns={1:'contig', 0:'color'})\n", + "cols = clusters.columns\n", + "clusters = clusters[cols[::-1]]\n", + "clusters[\"color\"] = clusters[\"color\"].apply(lambda x: int(x[3:]))\n", + "profile = profile.rename(columns={0:'contig'})\n", + "profile = pd.merge(clusters, profile, on='contig')\n", + "#profile" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "

      " + ], + "text/plain": [ + " color 1 2 3 \\\n", + "count 874229.000000 874229.000000 874229.000000 874229.000000 \n", + "mean 114.345023 7.021815 9.682457 16.023759 \n", + "std 175.732041 22.349057 37.198960 35.921960 \n", + "min 1.000000 0.000000 0.000000 0.000000 \n", + "25% 20.000000 0.000000 0.000000 2.000000 \n", + "50% 61.000000 3.000000 2.000000 7.000000 \n", + "75% 127.000000 7.000000 8.000000 21.000000 \n", + "max 1899.000000 2962.000000 2890.000000 4656.000000 \n", + "\n", + " 4 5 6 7 \\\n", + "count 874229.000000 874229.000000 874229.000000 874229.000000 \n", + "mean 12.826631 11.029218 9.773265 15.424325 \n", + "std 36.780672 36.521695 28.569736 45.451414 \n", + "min 0.000000 0.000000 0.000000 0.000000 \n", + "25% 0.000000 0.000000 0.000000 2.000000 \n", + "50% 5.000000 4.000000 4.000000 5.000000 \n", + "75% 12.000000 10.000000 9.000000 15.000000 \n", + "max 4791.000000 5243.000000 3941.000000 6316.000000 \n", + "\n", + " 8 9 10 11 \\\n", + "count 874229.000000 874229.000000 874229.000000 874229.000000 \n", + "mean 12.309418 15.037634 11.235127 17.829746 \n", + "std 36.727628 38.228719 26.672553 32.773979 \n", + "min 0.000000 0.000000 0.000000 0.000000 \n", + "25% 0.000000 2.000000 2.000000 4.000000 \n", + "50% 4.000000 5.000000 5.000000 9.000000 \n", + "75% 10.000000 14.000000 12.000000 24.000000 \n", + "max 3737.000000 4412.000000 3137.000000 3693.000000 \n", + "\n", + " 12 13 14 15 \\\n", + "count 874229.000000 874229.000000 874229.000000 874229.000000 \n", + "mean 19.183485 13.395388 19.277283 14.636003 \n", + "std 50.958231 34.555440 51.378334 27.745981 \n", + "min 0.000000 0.000000 0.000000 0.000000 \n", + "25% 3.000000 2.000000 3.000000 4.000000 \n", + "50% 9.000000 5.000000 7.000000 8.000000 \n", + "75% 20.000000 14.000000 18.000000 20.000000 \n", + "max 6464.000000 3996.000000 5213.000000 3440.000000 \n", + "\n", + " 16 \n", + "count 874229.000000 \n", + "mean 11.643889 \n", + "std 36.868664 \n", + "min 0.000000 \n", + "25% 0.000000 \n", + "50% 4.000000 \n", + "75% 11.000000 \n", + "max 4765.000000 " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Information about profile\n", + "profile.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
      " + ], + "text/plain": [ + " color 1 2 3 \\\n", + "count 350711.000000 350711.000000 350711.000000 350711.000000 \n", + "mean 21.828813 6.711198 12.784426 22.158333 \n", + "std 21.429488 17.495651 47.772374 29.768226 \n", + "min 1.000000 0.000000 0.000000 0.000000 \n", + "25% 6.000000 0.000000 0.000000 4.000000 \n", + "50% 15.000000 3.000000 4.000000 12.000000 \n", + "75% 30.000000 7.000000 12.000000 30.000000 \n", + "max 122.000000 922.000000 2890.000000 965.000000 \n", + "\n", + " 4 5 6 7 \\\n", + "count 350711.000000 350711.000000 350711.000000 350711.000000 \n", + "mean 16.605630 12.973206 11.563843 18.747259 \n", + "std 30.134884 21.407159 19.749674 28.541668 \n", + "min 0.000000 0.000000 0.000000 0.000000 \n", + "25% 2.000000 2.000000 2.000000 3.000000 \n", + "50% 7.000000 6.000000 6.000000 8.000000 \n", + "75% 15.000000 12.000000 10.000000 23.000000 \n", + "max 1499.000000 1290.000000 1281.000000 1273.000000 \n", + "\n", + " 8 9 10 11 \\\n", + "count 350711.000000 350711.000000 350711.000000 350711.000000 \n", + "mean 14.536362 19.205451 13.463379 21.720180 \n", + "std 32.811062 27.869045 18.852696 21.706461 \n", + "min 0.000000 0.000000 0.000000 0.000000 \n", + "25% 3.000000 4.000000 4.000000 7.000000 \n", + "50% 6.000000 10.000000 9.000000 17.000000 \n", + "75% 13.000000 23.000000 18.000000 31.000000 \n", + "max 1922.000000 1199.000000 1492.000000 925.000000 \n", + "\n", + " 12 13 14 15 \\\n", + "count 350711.000000 350711.000000 350711.000000 350711.000000 \n", + "mean 23.329459 19.492075 28.524683 19.643273 \n", + "std 36.840619 32.843480 54.120753 19.491147 \n", + "min 0.000000 0.000000 0.000000 0.000000 \n", + "25% 6.000000 5.000000 6.000000 7.000000 \n", + "50% 13.000000 10.000000 14.000000 13.000000 \n", + "75% 29.000000 19.000000 28.000000 28.000000 \n", + "max 2252.000000 1641.000000 3422.000000 897.000000 \n", + "\n", + " 16 \n", + "count 350711.000000 \n", + "mean 17.859403 \n", + "std 33.627913 \n", + "min 0.000000 \n", + "25% 3.000000 \n", + "50% 7.000000 \n", + "75% 21.000000 \n", + "max 2124.000000 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Leave only clusters with significant contig length in profile\n", + "\n", + "#CANOPY: bin_info$third_largest > 3 000 000\n", + "cag_str = \"CAG0001 CAG0002 CAG0004 CAG0003 CAG0005 CAG0008 CAG0007 CAG0006 CAG0010 CAG0015 CAG0014 CAG0009 CAG0012 CAG0074 CAG0018 CAG0040 CAG0016 CAG0029 CAG0013 CAG0017 CAG0021 CAG0020 CAG0085 CAG0019 CAG0028 CAG0047 CAG0057 CAG0032 CAG0039 CAG0027 CAG0024 CAG0122 CAG0062 CAG0048 CAG0030 CAG0022 CAG0025 CAG0056 CAG0071 CAG0077 CAG0049 CAG0034 CAG0023 CAG0051 CAG0036 CAG0059\"\n", + "filter1 = [int(x[3:]) for x in cag_str.split(\" \")]\n", + "#CONCOCT: bin_info$third_largest > 20 000 000 \n", + "#filter1 = [89, 243, 312, 278, 109, 250, 60, 59, 195, 277, 190, 394, 311, 301, 333, 51, 143, 327, 338, 147, 256, 163, 18, 141, 134, 317, 81, 371, 288, 216, 388, 135, 71, 341, 367, 92, 232, 119, 252, 293, 361, 350, 168]\n", + "profile = profile[profile[\"color\"].isin(filter1)]\n", + "\n", + "# New profile info\n", + "profile.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Get fraction of profile - profile_small. 
Normalize profile_small data (like in CONCOCT) and convert it to numpy array\n",
+    "\n",
+    "import numpy as np\n",
+    "\n",
+    "profile_small = profile.sample(frac=0.1)\n",
+    "data = profile_small.as_matrix(columns = profile.columns[2:])\n",
+    "v = (1.0/2000)\n",
+    "data = data + v\n",
+    "along_Y = np.apply_along_axis(sum, 0, data)\n",
+    "data = data/along_Y[None, :]\n",
+    "along_X = np.apply_along_axis(sum, 1, data)\n",
+    "data = data/along_X[:, None]\n",
+    "data = np.log(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Run bht-sne\n",
+    "path_bhtsne = '/Users/tanunia/PycharmProjects/biolab_t-sne/'\n",
+    "\n",
+    "# Save profile_small to tsv file\n",
+    "np.savetxt(\"data.in\", data, delimiter=\"\\t\")\n",
+    "\n",
+    "import sys, os\n",
+    "os.system(path_bhtsne + 'bhtsne.py -p 50 -m 1000 -i data.in -o data.out')\n",
+    "\n",
+    "# Load coordinates from data.out\n",
+    "ar = np.loadtxt(\"data.out\", delimiter=\"\\t\")\n",
+    "len(ar[:, 0])\n",
+    "\n",
+    "# Save bhtsne result to profile_small\n",
+    "profile_small[\"x\"] = ar[:, 0]\n",
+    "profile_small[\"y\"] = ar[:, 1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "image/png": "[base64-encoded PNG output (t-SNE scatter plot) elided]"
5ibS0FZ0azw8/HBOw\n3O12c73pLuLdyfRnELlkcSnXEUEUblxsZj1rWIlEoqAg8KQSFugYzlUIFARgIpowUqinABNxqFiJ\nJopjmMQ/S2byaNyHfreNTRgpRY/ZLBg/PjhTpMMquHFY71aP//79BnbvqqJg+BRc5b8QVbsDnfSY\nV4yYSKUfefpacFeClChKBP2c0Si4UGSTOWTY3ufYETEKXVVzwTdi8kq8GycOyijGSh0xJKMY+1GW\nfAkoCku2LuO5oK6odWoL69j0budSSqh2ydNJb3FX1bWYIjvm7RRniuP2w27vVL9/Bf6cccYaXSIz\nM/j0AgOwch1FPJiygC/fyURRlnH00WtYsqSQW27ZQUTEchYtyqdvXwspKZ1zVRw50n8x2O12M14/\niQR3Lwx4FjoncBJRxHgXI40czliO53R06DibixjG2QzmYsZyLwKFGnJQcSFR0XtNJlEMYiz3MIUr\niCSamwb8H/+uvwxzjAEVKENhKWF8QwQPPjgUq7Vp56pT2pnePPCAf4BWA98u+oH/m/pPqiKGIqSL\nkrgJ2I0JuEVTMjYL4fSTg6kb8F8QepLUCPQB/ICEdBJdtcav3I6NPPaxh0xy2Us9tUgk5aKC7OQz\nwRtlu99yfNsXEgQ/PhzspKEVJLw4aB51JfXt19UIGi1bpkaXKPFahVUpeerK35FE0dIAMmPGRt55\nJ5eSktAsBqqqymn6OzAgUHHT8DXWt/g6GzAynCPYw0560w89RnZTzSaexUElAk/SsQGcSy153uv5\nDR0GLPyN4VSTVjWEuyMf567NN5A0PJ7WSDcOQ+fU0Z84KjmbKs5E0tzb5ZJL/G9cdruDUyOm4HJ5\nhNtg9AicVIxsGXwPyWUr6FP0BYpq93yqigGLo4jKoQtIzHkbUZ/t16aiWtHjItCqi4qKaPG8IqQd\nc+kSHDEngWIGXfD7CrTGH69v63Ib9cU2nk5+i5syLyM2vU2P7gOO3eEgbvFv1Pusw8wd0ZtHxx5a\ngVvaDF+jCwicXnFXgI20bnP95puyoLxtWnLzzf6uurNiXiKHddiwsoFfeZvnqCHwBiYSOBPPLDwO\nO9t4AxulqDhxY8eNjUwWofos0BbyMzv4kkzyEQjC3ZG8OOJdSneV+7W/N2svg8VIdE7PIqmBChKY\nRxo3Inwy37tcgwNuKDItbUaj2ANE1WxHkU6QKqrOTFHCySiq51O2G+LJGHI/lXEnopp6U5pwGm7h\n/9QkwM8bp+lg4MVcxV0FDU8TatdvzO42FvE7hIT5k7/oUhOG+QbEfMHQT4Z2ug3zR83FHuCxbfsZ\n8PFv/CcjB1dXfY+7CU3wNfwwm4Nd2JKkeIWyGD1VNGy3FxpGjjTy3HOHNStzOlxU10Tiop797GMj\nv2GlnmIKAi7QCgxkkchGYthCLMO5kghauuM1pRqIJo5BjEAli++YzxK+5Rs+RQWenrCUkpzaZmee\nNmiqt5+mfDYCMFDKVX/7L1IOQcoh6HSCB//5IEkihUSRQh+RxtrVv1FRXElv+pHOUCKJRsHNyD3P\nYHBVo7itKG4rEs/MPDv5dNw6E3gXaMujx1BnjMXtFfGmrJjWxjQKLVECrGFL9LgjjiLJVg9SElH7\nIw7HwRNtW76r9c1s2uKIz49AzBe4vJ5Iu+p3IeYLcvNzO9TOs1tbr59d7+C+TTlEf7iGwvqD5zNr\nDU3we4CioiKE2NX4io7eRX5+LSbTUoRYitm8lN9+Cxxo0x3cfXd6ELU8ynGGd2b9PoFTDHSW778f\ny5Ytp/qVF+woAQTReMbY4Fa4mR2oLW42bmA3EdgweBMFCyLow0iuxdLCBz+OBP7GTC7iak5mCmdz\nERcwmxp2IujHD3yBo9zKDSM/Y/e6JldPRQ2cpVIgWPHFSsDjNposUnnxsZe9JigVGzamHn8W4zme\nwYwgjXT9j7tCAAAgAElEQVTGcDRHcgxn2Q5n1rYPOLbwF9Jt+dixksVOaiNGNIq94igmeseFFNjX\nUCvLmwl8S9NWw3giiCaBXkhhbLw1SmFA6iIwxs7gvJwthJV+gSn7X6Sa+vDHmk2t/4LaIaxXz0bq\nVjuq2VwTOJdR2oqOBXi+uruw3Tr1bpVzV3XdjHWg0QS/m/nppyJ69fI1P6hUV9fQu/cyHA4JSOx2\nydFHryI5+VNqazsfjNJZ7ruvvUdfSTQuLqOYaNwUoPeadkIzu5dyCiefHHiBM2mwxxsokv7NyuuB\nnRip8S5j2lHIxYItoPjpSWVS43sTJiYznb4MQIceA0aMGIkljlOYgguF3WzDgcBW6+LlOb+2fw0+\nTxt3X3kfCqKxzPfY5yxofDrQoSeCaCSQm3YGa0dMwlC0CImLNAZicjZ9b8JzHkBxlpJABJFEer2Q\nPP/0GLAQDng8c8xYiNUPpvKIJ9lx4kdUH/E8rqijcZkHYos7G5l4Bddn76GPrYpr8isYwGB60Zup\nE85q9zpb4/IfA6eb6AzR/SPbreN0O7n+l+sR8wViviB6cXS75wTLgDCTZwe1lki35+Xl99Ja/zoH\nGZrgdzMnnNDS1qwCJYCBpjyuHuEsLla59NLfu3V8DbjdZ7Z67K67BlDqPIspj41HObEvewcGt2l3\nMKSlBQ4wasBsMRFJKfEkNYsSrSMfK0YyiSSDWLYRTTkm1ABRtgo6IujNMZzIVdzKldxKJNF+M3Ud\nevrQn3CM6DGQQzUSJ9kZFfz67W8M1R3epq//0aeNB+D9d99vNYirlmq/fDflJiM/9E8nqngNEXU5\nhBGOASO9i5d5Fm/ddRjqN6GgEks8SoA/4zjvE1caA+mtDCH7qOepih0Jig5X3BiqRz1D1ejnMZV/\nTkzh2zjcJQgECSRxJtP5GzOZydXcfdI8v7aDIWFwHBPvHdupc1ty5ZoL2jw+d/1cjB8aeTXr1ZD0\n15IhMWFAJsiWJhsnyK8PSJ8HCk3wexwnYKW12PsvvijAZutc5GFXUBQFKadgtZ7O8cdHkZZm4uGH\nB1JTczqPPTYCvV7HzLsO49GVZzDthsGEana/alVgn3tfbvnmJAaikEAyOq/oO6ikhI24afqjLGdr\nwPMlbuIxM4qjMWNpnB0HrivphwkXTlwkYGcfJe7fmX36lUhVNrPdN9RvEPDCPR5TgImOmTf2xaWg\nSgf99yxu1nZszTbS9i9E564D6QkOC/S9EQgMXg8hB3bKE47ErTODovPML6qBah1Y46ka8RknhP2z\n0Z1Vh44wwrAQRjiRiJUV/Pplx4KnGjj5oWM576OubWN5yhPHEpkc3urxTWWbeHzH4x1ut+FJoL6+\naXH79p9ubywftaQptChM2Q/qbSB/9Yi+tIMsBPVukB801jsyruveTQcaTfB7HDttiaWqQn199wt+\nA2aznh9/nMi+fadw991DiYjwN5Fcd12/LvcjBNTVnUb//u1HV44/YywOKpnIaZi8gg2CSlYRQxlG\n3ICkmNWY2UsKNcTgoEEcVdz0Rx
fQ1t2SGiopJpdk+qDDhAuFEpoCyFreKITPv9y9+/n1hzVE0bZL\nYcs29Kobg6MandvWrLyKCqzlbzJwx40YhBkXTgJ9dyQSh9cps4QCrJZEVJ3Jc/nV0PTQI1D1cXyY\nPpEag6XZ+A0YvDcA+ODvnd+I5rALh3Bn1bVE9GldtFsSNzyGI2YP4x9ZszjujrafEo779rhOjw0g\n/BPPuMR8wTM5zzSWZ9gzEPM9n+0TG64FWQXyXlDPAnUGqBcAv4PPk9vnk4Z3aSzdgeaH3+O0HYwU\nHq4jNtbQZp2exmzWcdNNfXnxxY55PzQgBLhcZwbcXq81XJGQVJNKOOGEE8nZzPTZCKQaFzCCc1EQ\nGHDhxkUvFNZTgY2NmDietuY7DTP1lXxDfwbRyxtMXsEfHbq22adeTQxx+G4y7ks4kd4c9U3CfVR/\nO1/rm9/4JCqlFKKiUiqbtu+rpJwYYpuZtySSajybl1upR63bgOKejCotfvl8ANxC8Ftcb04rap5h\nU3jXHQylbecUag9zlJHbcq+kaEspu7/Mpiq3hqzvcinfXdXsIxl/8xFMfq5jKR3q3HVdGhuAbn5g\nV1UA8foQMOz2ua9avS/wmGFPAmDJCUNJCT8wWUZDiSb43UxhYbTPoq0EzHh+DYFduhYuHH/QbB/X\nFs89dxjLlpWwZ4+t/cpewsIEl1/em5deOrzD13jjx7OYf/qXTOUilvERGfzOkXhy2ngMEwKdT7ZJ\nHWDAQTg7yGcPghP82mwQXtX770s+pJh8TmYqmaTixk5NWDZKR4I/Jbz49TNcOnkWlZQ1s9cLBCMY\ng0RtTJQ25YVJTLxpPJ+P+y+lieOIL1mHTnXiwBHQ6FdGESpury1fhwM7JZRS1ucsjGW/IFw1KGUr\nwH41KH0I9CfvVnSUG/2frBrGmjS8/UXTTy9ezo6FWY3vDZF6Rl09jPG3HoGt0o4xzEDSyHiSDwut\nN5fi3cylK7R6vgSMgfYgAKQANZaT+tzAw0cewYTELu70001om5j3EDrdLlRVIlC9udpzoDFQRwAq\nJ58cy/ffn9Rzg+wgUkruv38XDz20p816DzzQj3/9a2SX+5ulewmLqhCDi/3ks5u9xGMmggSG0Teg\nXb6OWgrIYxDD/I65cJHFbkooYBdbceLgDKYhGEwxFtZc8QGK2EX027Gt2vwDkWFdx3/6vsTi0nmU\nUwx49pU9ibPoQ3+Et28Ao96I2WQmZUwSj9clELXtdcz2clw42ceugAvE0jsXB5CKGWvvmVjT5mAo\nXEV01j2gStz6CGoH3onTfCotzUAGt4uz8ndxVPl+b3sSFZU64UKRKrfnXEls3yZBs1fb+fHedez+\nIpvq3Fo6ssvLYbOHcNa7oftO377udp7Z+Uz7FTuDhABr/k3HSjcRbYyg8uZg3JgPLMFugKIJfg+z\n4ttqLjovl5JanTfaswYo5pNPxnHuud2Xr/vTPdXUOFQuG95kb1ZVlWKbSoJZQd8BcwtAXp6VJUsK\nMRgEp56awJAh4SF/Uln32VYePXctbgQqCg32AR0qh1MdUJLdXvu+LsBMV0XFhpVyygBJPImsJ4Pe\nfY4g+Zwonr7vZvS5evqNH9Qhwd9U/Tsvn/Iexb+X48BGNVXEk4gefaMpRkVlF1uoohITZsKJoDeD\n2D3mWHbtXU5s1VoKyMJGfbMZqcCALeZkdPZspC4Sa8KFOKOOByGIK17BEQWF5LGQCu92FDX9/oMj\n6jhQvOYv1UG008FtO9di8LoYqqjkmExkR8Tzj2sSmfpok528aFMp745dinR3XjdOef5Yxv/j8E6f\n74vNbSPyw8jGG2ZIkYArsCkONQrKNniq3TGwsXhVYSVRBj1j4rt3AVcT/EMIKSXFxW4sFkFUVOv2\nxAPBLT+W8PyWmmZlAjgyQc/6Ulfj+/MGhLHojKSg7ewTv1rAz7W5jZPJqBw47Y5EHOiQ1BFHHkYc\nxI2O4PE//t3p8RdklvL0lKUUZzmoddmow4CBCI6gBn0L23iDXV547emihQ3fgYs9xBCGm0oK+ZlX\nGHHECFZtWkHsjWdT9c+vkFEqSdf3IuqDuKBEX0EhjQEk4nFdjSCKcCKwENZoI2+YtTtxkME6aqhC\nItGjZxwTMWGmaEgtzl3lbOF37xaGOlRUBhnOpCTuUvpUFKNIlfzoBHYnp+HU6UkvzmFkYQ42ssnn\nCwrJQaLDlnA+tvjzQDERUf0zNxVFEeHVSxUVm6Ln1UHjGBG/lVW/XNrsel7q8z61+7uW0EwYBXfZ\n53SpDV9cqotTvz2VVaWBF5e/P/F7Tu5zMuYFZuyy7X0dmiGB6hlgWQS+G/apFqi/AazXARJ5x0D+\nk5HDfZtymp0eqxcsOXE4J6fGcaDRBF+jXbKr7AyYH/xGzqf0NvHdOa2n920gbvFLVEjvYprPBEmp\ng+lX2kjH7PUikRgxY8PKw9V3Eh4Zuo1SXpj9BiXv1TSKskRix852NjGE4YQT1Zg2GcCNJJdwytDj\npJJMFlPFHgoc+zEYDMTfeC4V//wSGeWZBcfflUzsa/GNi6Ctif+ZnEc4ntleCUVUUspAhmHA2CK3\nvfSOw812NlGCx6UziVRGMhoXTgyY0KOjlmrqqSOWBEqJxIFCPTps6HELgdVgYtWQI0HAydvXYXE5\ncFBAoe5nCtUMXNLz2esjJQOPDWfc6pkk1fUFoDhKsMboJv3YKOZ/chmKT/74umIrLyTP69DTTWvc\n4bganSH0k5s6Vx1O1UmM0d8zSkqJsiD4J9UrUq5gRdYUsh3/BdMaEG6P7d56JdTfQsOi/+rZ8Uz8\nX+Co3gYSjArPjBvIZQNbz5jaFTTB12iXhLeyKLN37PcvgLpr+mMxtP6HIxZ5/KIjil1EFjip6mOg\nPkEPUjJ7RhEpNA/UUlGp1VXzsuuRDl+DL263m/tOf5iMH7Y0lsUQjxEj9dRSQzXgEdcqKhjCUSSR\niBNJEW6KsGHjFyII81wntZxxyWnc+OZ1XHjfPSyfshriKqFvDhhciDqBcbOZ6MeHceOI81n67mfU\n1XhudEm6XhznPrXRx38LGyilCInkJKY0C5ZqGYHrwM7PeLaU1GPgeE5rfDJpSAWtoHj39mqyyFdh\nIJtwVKGQihWrroyNYXEMqLZ424cV44+k3mAFxYg0RHl3hHITsfdzjOnHMOefW3n0jGv9PtufHlzH\nz/evQ9L6za0jxA2N5uqtF6Louscz3Kk6MS4MLj23RWeh4LwCoo2eaN25v27m8dUKKJWgxkFjFlSJ\nSRGkpZexuza4J4dLByTy/sTOJ3FrDU3wNdpF/+peOmOKFYB6fesLVfoPHuWEZ8votdmGahAoTknu\neAt7Jscx+z6dtw2PaKhe2dKh54oV5zNkUucXwKZHXIq9LrgEVipqixm+GydOTJgay1RUXLj46XEF\nZ5yC2wwoLlDccPRKRHgdlg9Hsn/WT8TENM0of35+HV/e8oMn4RmZ7GdfY86faGI5kgl+0bH
NzU1Q\nTAF72IEZC6M5GonEiRMLlsZgM9WbIagBN7AfC2WYKWItRSznKI5nDyq9OdmzTpDWi90Dhng7daJz\nlKGTLvoOz8Kd9QHR2eWs2PA9sXFN2z7uW7Gfhad84ZndenENz0e3PaVL4n/8f8Zx3D2hicZtid1t\n59kdz/LWrrfIrM8M6pwr0q/gnD7ncE7fc/yOfbuviNMX1eC74K0gcN+Rjnh/dYfGVj9jAhZjaJ9u\nNMHXaBMpJcqrWe1XbAXrNf0wt/JIfvTpdzFwZR16H/dtl1GQqCYS5jL7iYRH9CWuaBvPVD4QVP/F\neSU8cu3jbM/YSZ++fTBJC/vW5HZIgBoEtiFQyRNs1Pya3LjZeaqNrNk+0bKqRF/vZOivsWyZ1zz1\ngKpK7tU9hUCwkbVU0jyx2WBG0If+AdMh+Aq+REXimck3uB66cKFHjxEjTpzsYiubWY8LJ4MYziiO\nwoWFbZjJ5VvyWYkePYmkkMpsjIRTwcf0PiOJuLEq+fkJ7F4qmFwdQzgRSCT55JByXTT/eaXp9zB/\n0ufkrMpvHFv9v75At74vxq9GdXm2f1PJLMITQpto7au8rzhrVcfzAMlL2tfC6tp6fsp3c1IfHWFh\nYZTanCQu7tgewM+OHcAtI9o3jXaEYAVf88P/i/LO9pr2K7XBI+vLefAY/639CgsKGbSyDl2LWB29\nQ6JvkYagAU8AaCWyqn1/6vr6eh698Uk+emdxY1lWXhYKCv1Ix9BOIFsgBAIjgYNmdOhIzTCgfKJS\nrdSgFNThitVTdWIMPzzytF/9HZ/vRiKpo4ZKKhrF3oyFI5mA0Sc2wJeWwVcN2y82jE9B8Xr1KBRT\nwB+sYQ87vdG2sJ5yMtnO2VwFGCnBM3Fy4aKWKqrZSqoxgolL8lBP+Q5cOobuSmDUvNsa+xAIetOP\nuldrSZ8/iAhHNKcqU0mqT20cm3tYATLSFhKxB/jysh+Y8fXULrfTQKmttFNi39fcN6h6URFhTB3S\n9L7K2XHvoF6Wnguk7LLgCyH6Au/h2cxcAq9LKZ8XQvwbuAZPZjCAu6WUy7ran0ZoePwP/808OoLN\nFdj5umRbOUorgZkOHJjwT46mILBSjznAMfC4h3721pe88NDL7M/d7z1H53mkRiXCmy2yhlriaNsj\nwldYmwtsUzbLlvUjinWYl2ajw+ExvAiI/bqcv90xHTMWpMXNHS/fwt+umIqt1omCQi01zeTwMMZi\nwtzqzL49GkTfE/37P0rI97qYenDjpppK1rOeLLbjpClzYz11qOxgyN1m3KfsgDAn4MR4x98aF9Qb\nPheBIJwIplVfRjxJ1FBFFVVEe9NDOM7OwPzMqSERe4CSjK59D1tyzx/3dOq8nOk57VcKQP/wtpP9\ntUQAMwckdaqvUBCKGb4LuF1KuUEIEQmsF0J86z32rJTyqRD0oRFiEk0Ku7oQoXj5iNiA5YefMqJV\nKXAE2HRPIqmllnDCOeXv/mH1Ozbs4qKxs7BjR4eOaGKJIBKLd3ctXy+c9kTIszlIvU/+neYIBG7c\nVFJBLdXo0BNLLHbsOL1iD14PPTdUUEY0sShWhUevfJqPX/uMt35+lY8v+5owwhtl3Iip8aYUaEzB\njL0BB3aqWpiJGnDhZAvf+d1AVJ2LsRlrcaUIhMENdh2Y3Dgy4ljNF+xhOyoqfejPiZxJNLHYsPMq\njzfunBVOBNOZRYRTQVSFzpsqflhoty58Y+8bHaq/eMJizk8/v9P96RTBc0f255YN2UHVX3AAFmw7\nQpeXyKWUBVLKDd6fa4DtQGgNVBoh591TO+8eFm+AEXGt5w0xhrd8ZG0QNaXZrFQiceMim11I4Jzn\nT2885nA4uPaUGzln7PnUUYsLJw7sxBBLGOGNtu2GWWnDzxJJPXUUkU8RBX5b/e0nj3rqWp3N55NH\nOSVIVMII894AyludhVd5c9ZUUcmGtRuoLPWkzYgihjBvTnrPImvg863U48LVzB/fF7/EahjQo291\nPIHKJ7xSS0SiHuMVF2EOfwJz+BMYxt7KZ5WfkMl23LiRSPLIZgnvUkM1S3in2WdXRy0f8BrORUNx\nTtgT1FNJMEx9L3DUrZSSN3cVMPf3TPKqrQHrtMSlujo8rgt+vQC1i9sT3jyyD1elt50yQgGWHD+E\nmQP8zaDdSUh9ooQQ/YExQMMqxk1CiAwhxNtCiIBTQiHEHCHEOiHEupKSkkBVNA4Ag2JMXDCgg/Zu\n7wJ/7uVtZ8d8s/ZZUob5frEFKcMSsepqqKYSJ57cMEUUkEsW43pN4In8e9H7eC6cmHA6K35YiSfP\njCCWOPqRjhFTm7PhYgrZTw5VVFJFBXnso9QbZeq9CEopDtiGCyc26okgkjTSiSWeKGLazapZSYXX\nlq5wbq8ZrGUVv7KCfgzCjAUb1oD58N24KaOEDfzKPjKpoxYVtdlNscGM41lYdvDrWSsZzii/thrw\nRPAaEAgswsK4q00MvcqO6YSb4Muh7HJuZ417FQUb3FSrld4duBo+GYkLF7/wfbMxNKDiZuv+PWyI\nXgE6d5dF/28LTiKqd1NEqt2tYnW5eWZLLsoHP3PN2j08vqOQvp+tR7y/GmM73jAPb3m4U+O4es3V\nnTrPlzePG0bR9KM4Oj6cMNHky2MGIvUKz49PZ3r/njPlNBAyLx0hRASwCnhYSrlUCJEMlOKZ2jwE\npEgpr2yrDc1Lp3UKd1fyyrQfKN5TQ2SimUnXDyVtTBwf3fobRbtrMJh0XPTSMRx1wQCMYfqg0xjs\nzKth9NJCbDrFk7bS9/vg24aUIOGRCbH8c2zXIgd3fJnJ1i8yGTltEENOT/eL3l306lLuvf5fje/7\n0r9V+7cvNqzkku0nRAJBP9LRoSefXBzYSWdIszpOnBR7nwgGMrRZX9VUUUR+mwIXTiQppHoXW0Wj\nyUXFjYKeSKJJx7NvgA4dLlw4sOHChQs3f/ArAoGZMIyYiCSCgQzHgBEXTnLJ4tvbv6B4Yj7HlIST\nMycOO76J6gTjOY5a79NQA0q4m/Qra+j79nQ+qVtEBWVM5QIqKednvguYksBAOE7azkIZZ47kctvt\njdfbUWKGRvH3HRcBsGhvATN+bjv/UgMmwHbZxIDHxnw5ho1VGzs8Fh06XJeEPjVDud1Jkc1JeoQZ\n0wGON+hWt0whhAH4EviflNIvk5F35v+llPKwlsd80QQ/MC9N/5aNS/OCrm+JNjD98XFMutY/QVgD\nNpubUyZ8y+aNHhOEEDDgiv5sGh2HceE+HJOSITXMM1Vxqii7aoj5PJeyPH8fZYCXvy/lxsWewCYj\nkBIrOH1EGC/MTMTcRpBWICaaT6HYXgR4bMcp9PZzlwxEKcWUUxrgiCCRJKKJJZdsLISRQJI3eMlN\nPXVYvaFZRoyk0KfR1x08M99iCj2eRK2Ifh/6E4a/bduGjTDCSGcoVuobTU9OnEQSjRETv7OWLGyY\n6I0OGxFUIFoIrkDw6yeePPypd/fhtO3nkkoa2e
wGJOkMAyR55LCTzY1ngcAY4yamMoW1rMKFiznc\nQQmFfMmHfk8eCgYiSaeKne1+3pFEM4fbOyX4N5fPxhJrxuFWMS34pUPnylYEf/Bng8msDc7n3q/N\nIFwyD2a6zS1TeKaSbwHbfcVeCJEipSzwvp0GbAl0vkbrFO6t5N6Bn3T4PGuVkw/+/itL5v5OTGo4\nI05JYeq9o5h37WoyPm3IiAjDvK8fEJRJC3vfzmburUayEk0se3RbY5xNQxqRr1af7NfXmj11THiy\nqFmZA9hXIXnj5zre+LmOMX0U1t/TL6injj9WbUaxN4mthTC/XO++AVO+tCY8wnvMgQMdOqqoJJZ4\nhHe2HUEkEURiwEgNLbeg9JybTAqxxGOljgrK/RagLa3samXCRB21hE/WMTRxBDu+z8RV4cZoNjDq\nguG8M8+B3ZaK78O+J/PleiCjsSzn5izia8dg2HAt8dvfII10BILBjAA8NvbNrKeQPOqoozcnE8UY\nQMFdWclO3muczduxkkIfYoinjBIfs45Ah4mBTGMDj9PaukMDNVTxIW9zHpe0a2rzZU7mDCyxHu+W\nwz9fH9Q5vqhSogT4LkUZDo0UxT1JKJ4zjgMuA04WQmz0vqYATwghNgshMvDsEnBrCPr6y2CtdXZK\n7H2xVboo3FbFDy/u4PbkjxrFHnx3zoWTkYR78/G/+UomH352PM+8OpbICB16BdIHhrMxczLHHNd8\nwemOJcV+Yh+IP/JUrn6vEJtDZVeRHauj9UWyxy96GpuPqcLozTnjwsV+ctjNdjLZQS7ZOFrsIRAV\nYF/aBgwYyScXGzZiiUfvjQrwfBYCJw6s1DXzxmmJESNRxJDGgMYUBw0EsnkDjTenktxSrpt3Bc/n\nPczUV+eQUXcY/33Njt3WkKyg6Z/nz3IcMJbhMYfzVf5SlGHjqZr/PvwyhKhexzfro5JyFvAaf7CG\n/eyjknK28yl1FCJQ0BOH0+eGtJHfqBWClPBbiWc8CkYEOsxhoxnJPwgnhuO51LuFYtvMmHQRiWkJ\nQd3MhV5wU+ks4gbGIKXk66+r2bXFCbaOza4DiT3ApORJHWqngel9pnfqvEMRLdL2IOXG6PexVR+A\nlK8BkHg2avuUMISAKnVGu+c8+20Ft31c0aV+LxwbxodXJzcTi3OiZrKzZnvjrLMv/TFjIZvMAOYH\nHekMavYEUEUlxRQ0XhdIFHSouIkkml6kNtZtEHw7NkooIokUr5BLfGW4JZ7AqlryadrhK44E4kho\n9uSholJNJVHE8LdbzsQUdxRv/Dsj4K5TrXHMGb145PMTiJ+xCWt1PIZxdi586yvSy5pSVHzJR+zD\n3wYeTh9GcTMAZWwmk4W4cZJAMummuSh2V7MZn0unkNU/hml79pBFJFU42cZb1LIv4NgUDORaszGZ\nPTeG4k2lvD36Y796Qi8YPmMgp714HJZYM0VFTtLTd1BfL8HkWRtikgMmOyDOq0ctP3avKyy/6nD/\n9+iAWVv31e5jwGcDOryYrF6sHhKbDLWFFml7CLN/a0W3iT14/rYa5nIxscF57tyxtGtiD7BofT0Z\neblsfyCtsWzwuIHsWNG0+bgLJ3Xehc2WSFSqqSaGJgewaGKIIJI6aqj1vlTcmLHQi9SAAl5BOan0\n9RFrgUTFgR0bVizehVRPnx6f+TDCSSCp0QOogjL0GIgiurFOLTVYCEOnU8jOHsiPz2X49d0ea/5X\nyIIf1+O0RkMCuPQ6rOFh2Cqt7DiiP9tGpFD54VMBNyGpYz8qLu8c/zDiTaMpsf/BkZxBpUv1e7xX\n3CpmB7ixkYSeHaykRuxvlhkYoD/nksIEQHC+ZQF6EyyqupikUQnc6bqGRVO+ImdVAaYoI9M+OZ20\n41LJyqolLG6X/yDt3t/HcqPnBZ4v46kOOM/uWaVtEPuHLbBHj+6NLURGwo8/DmT06Ka9cvtF9GPt\nmWs56pujgv58d5y145AX+46gCf5ByLJHOy4MoeKJ58cEVa8L+1+AlJitNs7/aiUJ5eXc+JQbXb0n\n72NYaiThRFCLJ/VDBeVEePd99WsGGTCYy2OXj6LMZwE3iV6tDqfBX94XBQUDRvaTixsXvUnDjAUV\nNw4cWAgjhrhGl0yBQgVllFOKif9n77zjo6rS//++907PpHcCSQi9Q+hIsyGCiiIgsoq4NuyuZXVd\ndVfXddVV165rFxuiroCCqIAFAQFBQDqBFNJJz2Qy7d7z+2Mmk0wykwKBZb8/PnnxIrn33HPLzP2c\n5zzneT6PCQthRBJFXHosCQMv4sel7Zehbo5/n3cI7awRYAShU9h05jA2GPSUx0fgNuiINpqR7S1F\n4yQUGjT/NZ0HaUokdWc4sd6XSGUQkpMBk8ODk3rKErqxfdzfQHOD5kGuK8JQvI5+B+wkq5kBA6fT\nI5M57CNW/zyNxIg45n59YUC/X39dzdSpwWcJTa/WDzfwlQF+0sHZbi/hf2+A6sYhqrYWhg07RFyc\nTK1SAWAAACAASURBVElJf7/FPzJ2JI65DkyLW8+AVVAom1VGlLHzEr+O7i1n+dWrKdpWhubRQILw\nrmHE9o0i55vGzz99Shfmf/3fcSOdHG3S0+gQwhM6lq59vBBAkQRvfDCGy65I73gHUlPzUhB6sU8g\naRrdCopJLShm6dQJVFuMWOwGzIR5ibdQYjzn+KNkHNRjpy6oZ94bxmjy9dz4o6FRSpE/lLM7vTDS\nUrSt4bimCpnN9+nRIxAUU0glFeRymErKKaOUAnL9ricNlQSSiCaGrep61ovVvFf0HjnFE1n/5fHN\nhhQNYooqkOtdoEkUdE+kNDESt8FrrzlGzEToAhPhJHTEMwwJ2ftE9G6yL/ya+j4VlCTvQgmSbORR\nZCqsTqpju/HR+NGgM4LBCqYotNh+OPpfRxIjWyaDqRpJOXq6PH87TjVwAH7ttbJ2kH0wSFCrwFIT\nLDMFkH1TlJVpjB8fGJljVIx45nqI1weuOY2KGcVLI17CNseG53eeTiF7IQSbX9zBI7oXeKX/hxRs\nKkVza95XQIPavLoAsgfI+aaQR6QXjvvcx4LTPvxTEPU2F7eGf3BSz/mGuLrV/fZaOy8tfIPqo9Vc\ncMt5zNkYz5FKAZIHuh+Cwm6gKhBdDuXxoOpo7oiNrqzm0i+/xeB2IwBF05CFgllTWujDV1PBj3wb\n1IJvgA496fTwJye5cQESKh7qsVNBGV1JQyAwYQ5J+FVUeCUSmpG+hkY2B0MuyAaDhMTH6z6g99B+\nTIlegurpnPdLlSXyLoihpEcvCAea+rA9bqyfPYTh4HpkTYDmIYwEBnEjQmegbNBedl+9GFu3QgCi\n9sQw9ZGnqPbo0Kmav3+nUY9s2MXO0ZdQEhEk10II+uzKpff+Iy13SbBiXjrv/6OAed3mAbB8eTUz\nZhwL2XccQgxuZZ/wu21cLpWPFpew9Ug2F0yPYcrQfsd8zqyvc1nzp/WU/Fp+TMfLBokHnLcc8/mb\nor0+/
NMW/ikIs9XApU9knpyTSfDYweDTS9WjsmfDfi4Nv5I5EQv44cOf2P7tbzw64ylGvvkP0DTQ\neSAtG874HiaugUHbYfjPXs14/+qkAKExY9VaLPZ6DG4PRrcHnaoha25Es1VMCYkIojhyRWSLa2qQ\nVIggklS6B5C0Dj1uXNRRx/XPLOCH/DW8sP6fRHeJCqo9o6FRzlG/dEJTt1HDgmtHyL4B0fFRvPjH\nrZ1G9gCKJjgvT+PlmRXobNWBCXI6PbbL/kHNtW8jD56K0msel8pXMgwX5b/7gC33P+sne4Cq/hWs\nvvd+jOF5uCwG6i0GyuIEcux+7OfFUhIRXCcJSeJg/1TslpayGnVhZoQ7lvUV6/3bbr+9sEW7E4WC\ngtB1ECRJYtmXxchdPsZ41c0sOHgVL2xfzHkTDiEZv+X6e3/G42n/Svq6xzbziPQCH05dfsxkD6C5\nBG9N+vSYjz8WnLbwT2Hk/lrG3zK/OGH93/T5WQybkRp00erbd9by3NWvtnq8dVAvPh53NYz7AczN\n9E7qwiC3K5QlgclGfC7M/nwtBk/LxWgZuYWKpkDwyfxdVNUWEPN5Y+hnEimEE9HCWm+QHojL7I19\nzl2omuCxy2MYnG7k6/fX8NyVr/gHh4bZQAXl1FBFGFbvsSRgIQwNlUrKqaLjrpiE6AR+qljLBOOH\neFoJPz0WnD+/Ow+9ewb6O39BM4ehGZoQr9DAWApDpoKsQb2Frnc/TllRXxxJJRBfAPOfhpScJscA\nRIDnQfy2nwA+uxC0EIlumsaAndlkZDWSuUeR2TaqDyVDDvHMXcX8oac3Attk+g1nByuqHSsOHepD\nRkZwfae5z77Ox+/Y4B8Per+niua1RVxGuOY1OJIGCB57LIM//Sk1aB/gnSk81+NdarKPT1q8Oeb/\ncAnpE7seVx+nLfz/A0gbFscLNb9DMZ2YKIKhFwUn+1dufbNNsgew/XaQH88tZ5RnEKgyaL6+VAnM\nNrj69/DYYHhoHPqwXESIaIhQC7LTF3Wnu9KLg2/3xhIVTzQxGILo3QskdmTOYvWlz7A4YyHLf6lj\nxbZ6htxTwDmPFHDeFWejmT1+K19FJZfD1FAFgBMnEhJF5HOI/WSTdUxkD7Bo3VvM7be008ke4Idl\neez5pRTX08NR7DVIHjeyw47srveS/bCrILwWwuogrIZ8KRZHfSxkD4DN58JtK2H9eYFLLJ4bCHC9\nSUD3HEKuw8gyJYnR1ISb8Sgy1REWto7uS0mXaKSh/2R+6nx/05SUk6f7XlAQXJP75+zdfHxPOly9\nCMx2L9mDl/mMTnjpVl9Lifvvz2Ht2tCf+1vjP+l0sgdYfNGXnd5nKJwm/FMc5nADagcTU9qLpgWq\nG/DD4nWsePHrdvfxxKyn2bhgBMsnnk+fiEgidHrGxCew5JxMiG6Ug7AOegMpxGxSaSabIBDUUo2E\nxBmfJqFzK+x4IRb7Z0N5+D8PkNNjFB5FjwCqI5PZOOk6CjJG+rR/mqaUwZrfnHz6cy2r7EvpMTGd\nskgPOWMTUJsMPh7cfpG248GGkh+4c/ov5O6ztd34GGCvVrlm5DdMNH2I4+mRjEvXI45oaM73YMxE\nCGuyeLliDuSn0/gsJK/V/tRzsN6rUKmX9QyJ9CW5CQH2aKjqBil1oNcIRfplyTH8MGU4X108jh/P\nGUZ5Nze6MQvZetmbxBpi/e2efz456PEnAjExwWckYydu864nDd0BSrP7kYDIGui7z7/p3nuDV4Er\n2l5KwYa2kwyPBa7qEAUkTgBOh2We4jjyW+cWiGgLT1/5UoeP+eattVx47Tlc2KV7wPbz687nK+0r\nECBbjyLJ9Wjom1FyYIQNeGUC9L7KUALBOW+msOqWfN6eeBtbD8ewZ0Qce4a1P6zt/g8qMJoLePHG\nRJAbIjfSCduQQ5fnvAqMDUldTQuhSEjEk4QbF5WE9tXKKDz8ygPEJcRSkusI2a6z4HEJzgz7iO9r\nLid2zRPUhO0EXbO1hjXTwRlM7kGCp19l5UyNmx/dTImqIyqhjqqek6DpwNsHKAeK/RVSQICuLht1\n0E5EZD2yM5wPL5zJ5EE9SDStaXGm6dMjeeONFK699thDUtuDqCiZfv1aRrYVFjog3+ei8ejAEIJY\nexyCgz1B1XPkiD1ok0VnHV/W+6mC0xb+KY635v9wwvreuTIw2sJhd3rjhzuIj/76SdDtK8NWcsd/\n7mbQ6yMwVBnRuXTItEyiVFFxUO9PdGoqewAQXqHniWFXMiKuJxV1Kkihrc8WEIKs/Hou2rMca5VK\nTJHvUEmiblw6+X+Y7A8kNWGhO7180shx6NDhxkUkUf44/Np0CwfnjCO731wKhowl9+xh9OydwWUL\nZ3f4uR0PXA7BpT0+Z9PzV8DmM8DejNzNwYkLBETbuehJiTzjGIojxlAVd5bPZy8F/ouVoI8EXSQs\n9l/I+HIeaWv/QMazbxP33VdoGd8zd/8dJJpC11a45prYkPs6A0ajxLJl6f7Z6qKX3ifN0IM4KYl+\nKWf4Wkmw9kxwBXExeRQoSPG5egSZmeFBz+OsDL0o3BnY/XGQpLQTgNMW/imOwj0txbw6CzUlgQut\nRVnFHTpeAMX05ruC83lfOcDhw8mkpTW+MKpHgwVWzqZ9NUaDVX6SkLj+ruk49ts4a/QcfutaTYTm\nxJU5FkfSHCiIoOUQ0uR4Rznx255m6u7hJObIaDK4TPDlzYLsoRL1Y7qRJX3IYLGCweT7z9lg5Rdw\nhByfbEFlryjKqu6ApRF4XCY8kgoGNytmLe7AU+s8lB6pZ/t7Ndx0eRIvH+oLPfd5FyU1YNqnsH0U\neJqSnPeewkZr1Ol0jfLXEVJo008HhLuJ3/AairNRwTNyXTXCKFF+eSLK+7NQrwgdbZKWpiM3t/Mz\nxy0WyMrqS3Ky9x5Xf7GGu265G28BeJk6bsA3usPzt0F6DqTmef9WVNC7oKALbB/qjTYDXn65d6jT\nnVD8+tYeBlx24s99mvBPcZjC9dSVnxjrIvPS9IC/rVEdL133K79DIIMG6elFTJ5cwXffeQukfPuP\ndcd9jXZsLFi4ABcuVL0M+13oAV1OFvrkn6kd80ZIbRql6iARa6/jUnEF4RIovkVlgxNmPg1vPSmo\nTALFqDLYURAw2DSQfgrdSCKRAmsVBwekwIoocPsWjoXiLRe4fCa/f3Qpbz1w8XHfr4+e2o3n7vyV\nNbYr6fpFL+5/cwWMXA92K2wY73VjINDj8clnqEwz/sJ/zOMbyb65fy0EaoZPIW7Ne/6/ZZcganUV\n5bPi0dqQv168OI2xY9und98R5Of3Izq6cUC759r7AEE4kbgYTcWN34JYC6/eALZwuP41GLAbumdD\nciHIAvb1AoMLVJmoGIn09OCqpycaMb1DhMJ2Mk4T/imO2U+N5J2r17fdsKOQwBIRGPESk9L69FtC\nQo8BDQ03bhxYEc0WXL//vjFRateyfc276BCKdfn86PkGJ/UISU
JyN1qJktuFvjgLHWvxSGfh13Ju\ngrBtj5OkxWDBgtJsv+KGzFUSaxYIMhzBNdQbBgAFI3m2fbD1okaybwqPjrc3r+YtLiZtgIXc3aHc\nKYHorKX4f1y3ib99OJE/zZ0YsH2+9BhWwikgDgtuulOGTTLgZ/gwoCEKsbWRRqenPq1lKQtJE8gu\ngdZGMM6YMWG88EIyt95a1HrDdiIiAn77rU8A2QOUl5aTSk90KOwZYwGnDHmpYK3zEj7A7v6we4A3\ndFUSXkvfI8PwLeStuTvkOXtdlMbB5Scuiey8p4Nr/Hc2TvvwT3GMX9Cb7qM63w+aObNlmcJznsln\n21lzUZWWb7CETBRxWAjHSiRRxJLL5KB9P/64t1RlrzO7B93fHtROKuJ7z1e4GqSS5ZZRGJLLifm3\nJTAkSNETzY2uYi8WApO3FBR06DBqekZ/o2f0pwpm2kPQbogIEZKnKlDr1eOxhrdfFkMK8a+jyNoe\nPJQwBjfROBlEIT04iowgwuEkttYGaF6yb+q6DwVNRV/ecuFVDVPQzO274ltuiSc/vy8JCccXCbVr\nVy+qqweTmhoYc++s9xBBDDp01JOCbful8P4VsHqKj+wbbtK3iqQp3ugdl9E7U9syhsxr3gh53ovf\nmRJy3/FAkmH2Z9NQDG0X+OkMnCb8/wH8edNFndpft2ExLFwyucX27w95KOoxmNwBo1FlBVXW+a3Q\nBq15ucnPaH4hk3VMZjkD+AWDj5wLC72W+PS/n93ha2sQRFv6w39QmhbrDqL9IiQJLTISXM1sZSFA\n0lE5fTkHpz6LLHun6QqKX/JYQkKnSkz5RCGqi5eEqiJi2TZwPFuGTKIkLiXAAnf0WQjDtoKpWYKZ\n4oY++xHfeeup5u6v6vA9Hy/6jQxuEOhDTOAv2bQZvcVDR+YYURuXBbTWDBJHL08AWebQlCcBuCX8\nCRZK93Oj9CDZO1pWaEtJMVBcPJAff8xg2rQw9K3ODBpcThIms8TFF1vJz+/LgAHBXS57firFjBkZ\nmSxmgsPsXaxp2lcoCO9AkPV1Ssgm5mgTv984q/V+2gFZL7Pwt8uZ9spkZi+dxoPqrfSb2eO4+20v\nTrt0/gfgqOucON2YNAvXLJpEn4ktlSPnve7LnJRl9o69gMODJxJVmoeq6Jn89RcoWqBt4CV/jf78\nigR0IZd+bGMlc3j+ee/ik86gMO6m4Wx4uX1VjQSCeux8xjsAOJpa3kLzunWaxvLrDTjPOp/5zu3s\nSJvOjlzfc/L5p4UplhpJYmd6GgNz8rBoLfXtJSRGl9bwaeY55KX2Q5MVhARZ3QeRfmQ/o7d+iwuZ\nqpTBzHv/EB/O/BK+uAj6VcL8QjC6oajRwrZVHl/CVXt9+JriZvffH6Wu92E2I7PAnk2qJTBL1I3b\nL+vcFLG1tYw8sJUNGaOAYJZls4FAU3GX7sOAEaHIuOP1VMyIxTbIxNaJDxAmwrhZuo9kuqH6ROre\nGPoh1QYPLzrvD+hKkiQmTLAyYYK3eLnDofHzzzYuv6qE4lK5oVHAMQ5g6dewtFc+EeGw/NMkJo43\nByQNyor3dw9mHBzjjLgmApfHg0EXnBa7jknmsuXT+fiiFcfUvT5cx1VrZ5IwMI6EgXHHdo3HidOE\n3wGUVy0E/t1s6yhiIn8+oZrahbuPT21xwbvjGD+/T6ttfskJFClzhkVQ0t3rt/WYf4C6lslETe9Y\nQWDEwVQ+ARr1yGe/dAFdhyez5JrWswkzzkyldmgpz/zr5aD7Jbyp7UKnR+j0SJqKfc5lWJctIj9u\nLDvigwyKvs/k62HDqDVbOGv3gaBk6tTryM4IFN/y6AzkdOtDes4e8rqkccnaT0iXevHQf2QefX0T\nWkSTVycxAunjJxCX3dvYZ7SMoVrzRpB2CBrCd5WNg1MDAXsXkuvS8tj9+CMIo8vrnUAj7Zs0vhr7\nFVMTp/p7sqaY0QqEv46uV0lURQBGw1HfQ212+iYDqlywF+vSR9CV5eAC3NR6demLwfjvAozAlJun\nMIBMzmY6Ot8P4A1pdbX9vTWZZKZeUobTqbRv8bgWJp/XGE0WHyexbEkSI8YnIIiCVsT2QsN34sjq\nkGTfgD4XZnDldxfzycyVODoQqjny1sGc98wEZN1/16lymvDbifKqe2hJ9gCbycoOZ/SI7dTVafTv\nb+Dtt5MYOrTzJI4jkzoePRORZOaBg1O5RJrBJO1atKMW+O1KqOiFArw6oBfX9msMl9x4ZwRxfw4S\nAioE5iBkHwwyEEZ9i+07lu/BmmClqrQKCW9h70rKCCeSzLMHM//92UQkhZNmbt3nLwF43EgeD7rw\nBCwfvIciZNZeGtr36j1QYlPvXkzYfwijJzBBSQC7ugWfyidXVtO/xkBmeR46eiIJwZbJTrQIfUty\nkkD68AlG6ruy/w8JyC6Nnq+UoWtDS0Yg0PQuysdupi4jl6phO3HHl6MZXSh2C0krzqXrkouRNIn8\nS1ZQMv1b3DFVQcnx/I3n477IjU72vtbxsQkcLagIEJiTkPiCxaT8mEa/6F7sHdbHN62QwWVHn7UJ\ny7K/ozhtDbfVJg6zjykEuh1lFMKJ5Cb9o7zsfiDksTPmFOA8Fo724WiZYNxZRfTsofDMS4/wl5tv\nJ5w8akgj+OwlFAQTr2yfYdV9cjf+WHEDqktlyewVrS7mJo+IZ8G6S9GbTp7MRGs44YQvSdJU4Dm8\nT/8NIcTjJ/qcJwZPhdwTHV3HM89ex9VX/Zvt211kZubx66+pDBnSOaQfm2pts82tX55DjzHxFOyq\nJKFfOH8Lf4iunt9T/8XN8PMLoFfJSP2Np6c9xrjYA6hCZkl+DOPDJ9Il8lNiY2MhSAFv6Nhr0xTP\njHuNoxurkJFx4/Zbf2FYCcNKEUf4dM0n9Fvcm7PvGE+do66NHr1XE0kGnloboGCJGstRqe0rVBWF\n7/v34czd+zGoXtJvoOIdad284mNSIzFa6+uZ+9NGDCrQpLZrzx0Kkqoimr85kgCdxN6746nrYUTo\nJNxRCvJRD3IIK18gcCQXs+ufD6FafIvTTRhWtdopnPklnvRc+r15OflXftImA2+r3sao6FH8uvQ3\nSneW+5+5t2sJGYXBjGANX9JnmY3K7eFUDJ0IOgPGXd+iP/BTSAmMUKjHzvP8DSNmhjKKMUxERvHq\n8AcRy/PfvxAs//I42L4Jsg6pXHJvJNu2fsnvz3yD7TXJqBhoXKZsSK9rrGrWuB16XHSYH15oXSK8\nORSDwuXLvAOd6lIpP1hFdW4NW/+9G3OciSnPjMcceXJrW7SFE6qWKUmSAhwAzgXygS3A5UKIPcHa\nn8pqmeVVrb9pQkBC7B6EUEDngks+5IuXx3FBXPuSjtpC4b5KHuq3NOi+3pMS+eP30wC4qf4mXvG8\n4t2xZzK89yJ4jESYbPxyz5lEmqtRZO9n7lIVChWV3spRwsO9PkXzLYdxNHlHh3aRGPSXpzt0rYvE\nixRnF/N8xntISGhoO
IJY/gDb2USULprP3R8QJyW00bPEcP6MhMoBXqWHvg+j3nyNx5bbkTWvjz+U\nQFsD+hYUMn7vQSLqHbhliRWZQ1B3PkLu2U+BrnFBcNzefYzfux99s8Vip0nw+R0ahzKDvDcCcIzz\nu5L0VR4y3iwnclc9CG+CsNeLInAklFB0yUrKzlyPZmzDNSDAUGXFFWVrk/C3n7mdIZFDmGWYT1/3\nkBY6RfjO/w4v0JU0qqiklM6TMdahpzf9mcYs6rEz9ZOzGDtrSNC2O3Y6GTqm82UXVi1LYvRIhYED\nt1Fa6qZXLzP6Xjns2OKCwmSamjCmMI31P2aSmdlSivt/CadKTdtRQJYQ4rDvohYDM4CghP+/jk2/\nnMOo4d+BxwC/jOPCvReyY9gOBltDF2doL7r0jeZV11WMemkq5IXRbdUghuiGcdGDQxkx2+sK2eXZ\n1Uj2Alj6V/B4SWxO5ucY9U4/2QMYFJU4IbFfTWSET/e9/sWMFuee/5f2X+eTuQ8B8HTfNzFgREJq\nVVM+lQyyPPvQmhCrO2YA9X2vQrWmoavcg2Xv2yi2PFI4EwMRAAzkT8y6ZxAzZsbx2PI8huUUsiM1\nGY+udWt/X0oX9qV0Qe/x4E5aQvQv85Br6wjf9AC20Y+hEzJIElG2uhZkD6B4IDyYvJEAPFLAgqM7\nSsf+uxKRXBqSRzD85iMoKuTOXULxnC/bH4MpgSu8PbMfGBzh/a7tc/9Gf4KXqxQI0uiBhkoZnSsI\n5sHNfnYzhsnUYA9J9gC2uhNjbE6dUcyOTSnk549tsnUk4J1VFBW5CA9XCA8/NvrTNI1P3jpMzoFa\nJkxNYt2qIr5YnIvLqXHNnX259u5+KMqpGQB5ogk/BWgq2JIPjG7aQJKk64HrAVJTQ2tRn+qQJOje\nvRB/nEWc90U6Z+c5lI4rDXmc3e6NRLFYWvfT59Tm0J3ucI3371//upTlQKp4nRFcC8DF9b5MT1sM\nHE2D6kaNk36JBwgztLSyZQG7tiaReZYIqp551QPf8VX8pQw/uo54Pzk0xpI0jXqZtHAsSak+K72J\n0dq8wElTeC1QQZ/5R7D1uRp99T5sY/4BigEkGZc1BVeXSaR9/zRp1ZMDjv34sV1cs00DcxST9ueQ\nEx9NudUSGOWhaYHVoRp3QNf1SG/YAAlD0XouWvE8nuSJaIqOPoUFwaUeVJn8nmrTddRGyDK4XSDr\naBrxLPReuz5W3ck1P49jdtExyOHqWiFH364zPN9y3apShieZKOeob1fLe9BQkZGpotJforGzsZnN\nfC+Wtdpm+LAgSWydhCGjCxD2lsaLJEl06RJcN78tlJU6uP2yn9j0/VH/ttf/GZhc+M/7dvLP+7w1\nqXv2j+Clz8bTo++pM3v4ry/aCiFeA14Dr0vnv3w5IaFXNuBWx7XZbt+BUZx95uekX/cG64Gj6tGg\n7fLy8jh69CjNXWp2FSaObjkz606zBU1fhMV10nXc9tN7RFz7R0rmToboe+DgRJDdNE2B3FEwkDqn\nmTBjIOlrkuDQOyN4Z8tH/P5P8wL2jb/6Wza9k4IHhbVMZQBPEokbCQ8unJgw04v+6DGQOWMQV78y\nx3+sNd2MK8dL9Ao6PAT35RZTiIZGVk4tDLwBp+oAXRO/p6wDScE+6Bb4aVfAsTJwzqqt6DBilu3M\n3riZRZPG4ZFlPLKMJsskVVdTHh6Ou1n0hapIkLSVppcV7pboltc0flwKIEwVlYIwhbLNsyD5c3wO\n/sbPQ1bBtBNcQ7wfDgqggiTo+UkuOnrz9Kwf4XmOLcOqKRq+NkKC+gFQtIT1wsh66nhzVx2RkxdQ\ntKGCZFeQUoW+fApbiDWb44WKhzG3DmiznckkYzZDfXBv33Hjm9XVTDnn2MjWUe/h8/ey+embYvoM\njKJLuoX7rt7coT6y9tRwXr+VGAwStz8yiBvubfuZnGicaB/+WOCvQojzfH//CUAI8Y9g7Y/Fh/+o\n6UW0ZpEQDwlvUYPsdUd4b2Kj3ztheAw3bJl3zCGU7fHjgy9HSIZZ2bCpzoBzQuPC1IVXbKA018ST\njwrCwgKN0YbPok6DSaMCSV+qDXFuAXRdDrURMNsI8RGgmpo1kAgz1LHp7rOJDatAr3iJyiEgpySc\nT/v9gbA5xTz6cWMUUrndSVLkQTwerxXWh+eJZC+yz1pXwj0Ip0KcK4Xu9EKPngeLbiciqXGB+T7p\nCVQ8/tKDzV079djZgfclClcy+PbiP4ckQsXtYdqyjS1uHQRJ1FOLHnuD/aJIOLqEczQuCpdO4WhU\nJJVhFjw+0td73IiMd/Fk/oOoW2JQqrxuoEGMYCQTAhY6wTuLqcNGPtmUUszB302h+EKNoKuxAkAB\nTyIIK0h2UEpAc3PhvCSqu+by4/N/OS7Cl5HQqmdB2W2gdQneSAhQPdz6xFqiqhskm70F3nM5zBbW\ndbo7pymOaiXtes/++a9K/vjn4ws7bg233yTx7FPduWbhYd5aFLgvo7vEod2BhtQ3nx/h3t9vpLaq\n82c+PfuFs2pP56zpNcepUvFqC9BLkqTukiQZgLnA8s7q/BHphRZk37B98ewvAsgeoHRrBX+TX8RZ\nc2xiZDr57Vb3Sz4XrqJ4gzbujgtnkjSFv2/xFiTPPPMnvv00ltQkPU3FChuP926wdJQMHCYwKRAR\n34zswcssGnUuC+e++B9WHpyAXZOpFrDEZmTlqDsBCVUOnOau3FbqrVwFSLj9ZG8dWUPmzi2MKVvP\n2Jofif14Nc5oN06crHtmS0AfhRzx69rrMWDAiIKCjEIlZX6yB3Co+aCqgbVam0CTJX4d0ZuaiEbX\nV0OyfAlmH9n7tqgC05EaJv22jz1p3dB5PPTPLyS+uprkkjyUtL/gGea1OequqsWIia50p546aqkK\nmI148JBDFpv4gQLyUPEw6uNdQbV7Gi9MBX0hGA6APt872/ItLUTkpyK5jz2NXkKHdnAblD4emuzB\n++XS6XnhT+fy3rVj2TMwmZowI7XUUhCfTffxg/jDhL+2KNzeGVj181ftNqruviOKhPi22x0rtqjq\nVQAAIABJREFUnntZIFlakj3A4WxB156H/X9/vugwN8386YSQPUDW3lp+23rsNXA7AyfUpSOE8EiS\ndAvwNd6v/FtCiN0n8pwNOPBpTsh9Lw5ZxF3Z13a4z8iIBZRXtS90S5ZhTHgtC6QVrHOsZuuqfeT+\n9jucTonUVIEpRLSW90URaJqGHNT3DDiiwREH1jzADW4jhMmghcrTFESlLKX4ore4LuUgikOP5IJ5\nl9xArEOHrHNx6YxrEEKQ48xh/06V9Z9raKIxpE0CjN0cDF6zAyW88YWImXGU8r7LqBsyCdEsxj2J\nRkKSkFBQ0KNHRiaMDJLpRhZ7yCcXNy7+sHQJ/7p0rk8aIWDqg1AU8lPjcesVRm7cG6Br2TCLsZFP\nMRtwUUsM/YjweBcwi2JjKIqJxrLtcUx5XyPjQE6RCVtkxbDbiBMH5ZSSSBd2sI
VEupBIF1RUCsmj\ngkatHg2N8oha73MXUrPrDP5xNYWERObfb2Trwy+23bg5BIjDy8C3cN0uyDI5vRPI6eVlVYPxP7is\n34Pk4WvnDwwfMYPsf3VucY8Ro4f7f8/Zn8tzt71M9t5s5tw+i8vvnB0wGEiSREluBpLlcLCuTjgK\nCqGk0M4VZ60le3/nly9sjkdu3cYnG8494ecJhRPuwxdCrARWnujzdAR1OfW47W70lo4nQ1jN32Gr\nP7NdbWVghEXwrc3BV4530GqvAGDvXqkFpzVACK9rxyM0DM2tL7cRfvkrFE/wWo1CAdc2QEBNqHhn\nAeYqrnjcjP2z89h3XjLbJm7EEV7PO889j6TKmOoNfB7zPtnrsv2HUDkLs/kenPXhaBioJYP0m79G\nMgS6MWSjILLXYfaeO4fxd4xCExqyJFN1tAal2derQYOnwSduwEBvBuLBw1HqKCOG839Yz1eTfIUr\nNK1x2gQgy/Tcnx9kSJMQaOzhdTzUA4IaDlHMBhTHBFRjGHJtHqacL5GEinGdwLgpHqlJ3dl66jhC\nNun0opgCSinyRW5rzc4kUdXXCs2jMISAeou38IiQAsddAUpOrK8/N/oiV/BF37YgFND6duCAphfu\nnem5RBIoXhePMBfyy4BFRHQ3oc8+/sSgaGJJHOgdWIrzi7mo/2yqa73aQjIKL9/9Ok/f/RwpfZN5\n8ctnSO3RGKTxn4/imHl5EBG8Ew2hcW6fFdhtna/XHwxmy383eufUjB06CXDVHdsHbDROJswcPB4+\nGIb4Qrtls8PvrVizpvVxdvXBbJ7NCYzk0KwarP7QS/aaETxWUM0gj4ZxXj16NlWDuylBeUBfC1Mu\n4bVP/ohcGs2mKT/gCK/3ez+ETqM+3EG2OztQsnHBp9SvGE3Y+e+jDN5ELpdh6edENgYpOK4quK7N\n4V5Df+4VOm48msRdOx/0Fw1vQFOyb4AOHb0YSRQXUIkJQ5nKtM9/YtLXv6D3qC1GRYs9eAlBDQ8K\nBhqYVMONgwoSt7yJad8iuqy+iVTRjXAiiSAaxSVacK2Gio0a3+8aYVhbXK+MzIislhnBkkslem9B\no0tKNPlnM8PGSWRnxOOijJGVafTY3A/MQDTtN7uyv29nw1CQwdMsRFh2UD+jfXLObSGOJKZPvYAz\nos9kcrcpVNVW+ktXqng4SjGRRJO7L49zek5jYvI53Dj9NmprbFwyI4K+fU5+DIlO2E8a2QOcO7MV\nN9xJwP+3hF9TeOzTN5NxBtER7asOtbuhRrSlju796tDrBS6XQklJS3e1EOB2w/17F1HkDLR2xi7b\nBfZUL9k3haSD+ZVeyzK3BL4/CiUOMBRBj89g2nSwFuAKt/PGR39B1attW5UNpK9A7T2PIT95HdJH\n86g5MBjV3tL/rDM6GDrjAyKTCpAkgWFbMuZ3wT3xaAvCDAYDAhkdEjIqEooGEbX1GJwt9XGqosOD\nek4EGi4fWTdAw41S8h2xuxeBVk8BedRSjY2qRhXOgD4Ebl88qYKOQdJIEkhG8r0mViIYymgSKsyY\na1ocTtXgKK8vr+H5HbbAQ/3hxn6oa/az2xlNF9lCuMfAmS9cCBa8b2Ak0LyyXsNgAfS29MY2TW3d\nZ99eyM1ChCXwpAYv69cR9FUGkTA0liVPfY5aRYvBHrzPt5Zq/7pBcXExa1auZXjkGK4+43quG72M\nj94ITvoGA+h0EBYG/froePiBKCoLU3n/zTiU41AWDvcEj6I7UXj3ueC1F04W/uthmceDi7+cytIL\nVh3TsSW/lpE8pK3MztCQ5URiIjUqqoOPmUKAXYOvar1ugEiiuPAqC5+/7KEgX8dtt5l5//36AIlY\nlwsuvTMfHlvD/C6BPt5dNa2koFsMsGwG7BkARif02g/BiL2jw7sE6MCtc+IxlLHFvZtkZwRGYzWy\n4n2h3SrklGvUvzKCQfduoXryP+m6zYpO1dAkCUkWODV7yLBMAEeTzMemA0TGwQL2DM5AbZJMdaBf\nKvEllShaI2ELNI7wbdB4fx0WEkgmm51BSSjwdmWMmJCQMWHCIPT0Zyj9fNEtDVmrqhCozTwgQicH\nPu/lyfB+mr9nXAaodfKurg/3ajupiCn3Evrhc2HHfHBGQI9VMPhdEHYm3PggZ8h9+MfB3wHQ/43Q\nei3thx3MzYrUC4iSJvLq+/O45Yrb8dAxZVZJknjnw3cIjwpn4fm3YsREBaFJtJZqNDRkZPQY8OBG\nRWXLhq2UbqiEf//AJL2NFZVfcCBLo1uKQlxcaJqacaEVnVKGeozrrDFq6ByZEwGj8eTo3ofC/7SF\nP3h6Lx4St2KM9SVwdCCPI21S1+M+vyRJRIUHEpnXBw82DaZnmbFgIY54ztVe4Kk7+3LwQD9GTirD\nElHDBbcf4MU33az5XuK5FxUmLNxP8TV/5gzLtQyLSg/o9/UzQml1q2DaBiYXZP4KA/YEJ/vjhNAJ\nylNL0UQ9Lk1D1aDOBT8fgeVZGt9M2M53c58nZZsVg6ohAzohUDQwGEw4lryG29hShE0DCgiucZ52\nuJjU7CJkVUPn8iB7VPQuN27JhkD1Owvy+Y4ifmxxvIyBPpyBnbJWs30boKAQRSwppDKEUf7Bp2HB\nGbxWalFXd+MApmrIThVLoaMxwcumgw/SaDT18f0vgwKbSCQtP5Vub/yVC++fzYLXS5i+rIz4r38H\nS5air4gi6nAP9md5Zxvd/53L3srjjRzRwPICmJskQwlASDw/8SFm/W4WB6r20pX0ds3K+g/uz813\n38Th6iymzz2fl/72KmFYsdO60J6GIJYEMuhNV9LoTi+SSPGF7XqQkLG4rZwVczbDhhhbJXsAq1Xm\ngfs6Hms/eCCcn3HyE/6f+XDMST9nU/xPW/gNuLfsBv/vj0gvtNk+cWgs0d07EOnQChRFITZKUF51\nFbDI53LuzdajbzDGuZ6RWiS395/LkARvzUpZlln3tW9RkjEU2CqY9v4LHBz/M3EL6rk55SEe6jWn\nxXku75HMvHUNuiO+QH88IDkg5tXAxidAqdmsh/nDVMwGFVnyrqcaFCi3+4KDjA4yfo1Dr7aMqfZo\nOsxSOGLmPsRHI/yE4kQijzDqCL5gKAGDdxxi8N5dlEUmotl3IOo2UIMGaNiwU0xOs2MUZHRoqKQy\nkeF0o5Zoishv4cZpLGGooyf9GM0kTJipo84vC9EUDf7orrkGpvyrkt1jVXRuyPzByP6EQnZfn44w\n6+GX6NARO04TWUQwSBfGRR/Vo6iFyEBEnY30kkKWTpwARd5SeyqSN3KqpjPCBD0gNZFn8F3f4hGb\nuWzwUAAiIiNYkbuUS3vNpdRVjIN6oolFs6iUO45iNBl5/s1nuWTuJYG35HByeGMutdQEdZU1QEHB\nSiTRxASEg1oJRyMRJ050vu+C0WVm19bdDBzedrLSA/fF8uAjoZPIbrtF5pLpEUye1JiEVlvrZFjE\npjb77mz0GRgsEe7k4f8E4TfFDbvn8
u8Bi0Pu73FBKpcvvbDTzxsb9S7wrv/vS6Pg0r4TgrYVQrB9\n+3a/fsxbIy9g+PC/4PF4KCgoICsri4SEBP66czHvut6mQtkJmhE5diJa+b2gKwY0MG+H6LdB3/kC\nVM0rcYzu6iX9BvUFWfbmGgyIgYP5gALuFhKSDX1JCBmULwYGkKhCgzun8WSBUgACHRruoXUkbt5A\nETvw0iCARBhm4kikhiqf711Pd91E4jwmdDjxIFNILgMZRhZ7A9wVMgrRxHEhl2HA6CcggcCMOaSV\n27B92PZohm1v3FZ3aB+5Z9dg6xmB0FobcQXpVKMKl1+103s9IKsqE3/dyZr+U4A9mFHJzNwH4QIO\naZAkwRQjxBzLxNwA9deBeTGggORiyYQlzO4RmKuTktqFn50tZ0utoaSwFIMwtCHTIJFIl4Bn3QAZ\nmQiiyGIfYVhJIgUNja+WrPITvqqqLBx3Gwc2H8JGDRbCUNBRRy0DJ/Tntadv4/q7WsqTWMPguSfT\nW2x/48njq7d8LLjpz63XpDgZ+D9H+In947nPfgNPxr2GZvdaGz0vSmXmovMwnQJSpUIItm3b1mL7\n1q2NVaGqVCd/DrudT5WVjeoIsh0tYpWX7EufhC4LwZADcj1okpd9oXOse19XsktBM6jIbpmesVpA\nJKLLA0vWQ1kNNLznX/59IT3uvpu4ssAptixpeM7Zg64ucMFZh6Av1X4fvhMJEyo6wIaeWhRSqEe/\nWcc69iKaEYqMTBSxROCdPVVQySBPMvvYgYZG0agzWT1hOu6wcOJ+7oZh9b8QmhsNlTilB32i5mEs\ntweQe0P5w2AItV0gsGlVDHh0KyUXDCZ7THpjekDzBxtRxRk1JTg8wZP/4qsrMdc7kRBstvQme7sv\nacsIXBfmXeg9VmjdoGw/6Laz+GIPs7vPPo7OmlxzUhwSMgaMuEIWIBHYqCWBsKB7G8pn1mHjCDno\n0ROT4K1cVZhVzPxe16FDRx02EkhGRvavBZSuq2Dpuoc5Gxd7jDMokicD8OHbkVx+WfDqV8lpx/Mg\nO46IKJk7H808qecMhv9zhA9gMBt4oO6W//ZlBEUwsm+O97s/yWr712jNDSbZCZbt3DZ8Pzd3XcX5\nW+dj0x3BIMK5IupaHq+5tXMuUgJsEj12DsARZiOuNhHrpEBZg/X7oLQa1CbroKq1nHfvXMxtD9wI\ngIa3zlLeU98SI4ugJKgH9D4itzax7o24aCwCJxFOZFCJ5ab+9Xji2Md2NDSOTLqQojPORzN4B5my\ncRegzzyXS95ZSUqxnfUpEaQfqUEK4U7qCAQCPXrcQEHfa8FpgsxK2GoPaIXeyQU12f7gVAUdBl8m\nckMt33q9joGH8phzVVeWv+sbOFVgisFL9p1QWe3ZCROZHh/D0KE72bEjeJgrwJVXxrBoUc82+zMY\nDRgxEU8ihRwJ6dappZpEkoMKunlw+xfVXThx4aTfkD6oqsp1/W5BQUcJxdRRSw1VWAgjjgRMmP1R\nVCaMDHWuYM5Ze/nX6idbzfa97JqePHDdiZNiT0wxUWfzoNfL3PvUUGZddfLq1raGE6ql01Gcynr4\nnYWmlnww2HGycPB0Dh+owhFMhlcNI0Obz6GzWpYClNZ1vvN+Rb8VTIubxsKyeMJjy/x889JKsAcx\nUmWPwrC/fEl0nYxRc1M5rIwzli0Amx5T8iNItmObZamoVFHBNja26icG0BQdv9z7HJqh5bnSD5dy\n/qKlRBOLHTsWwkL66ht+b2sRUyAoJI8tPWT2z70VTe+LxMhzwtpqryJeP1iwYQex7noUr5YyOnQB\nfQsENVQz9Nbe9J6ewcSpVvxTtofDwHz8n69BBvmBMhyheT4AYWFgs41qs92Z0jQshGGnjgLyQp8f\nI6mkIzUpuygQFJFPXbMF3x79MzA4TLgPq+RwuEUEkQ4d6fQMcBFpaNipQ5M03tr2Mj2Hhibaj/59\nkAcXdi7fREbp+bl0BvrWK7R3Ok4VLZ3TaIIGKeTWYELPq7uWkqiPC+6ekVTGRPUKeuyz6c8e5xU2\nIqo7pIyF66On01WVMFtnonkUfxSSFpJzBRce3s/ZudmMLc4jYsB+pFV9MMz8PcJmao/6QFAoKEQS\nTTLd2mzrtkYS7OHJHpWwqhqsRKIhMPoKfDcdQEIt7LYFBw5izL3QmsphpBphQQLclMyUiiIS3HYM\naCiIFmTvP5chmvdWmVhw8cOBJzhew6zIDV/YcN3bfrIHqKuD6OjNuN2th7TqFS/BWQjD3IrfyYWT\nXLKpoRonDmzUkk9OC7IHyNt/hPLDFdiwBY2yUtGoIzCfpoH8XcLB7Mzfoaqhr/vyG3qx29E5bq3U\nHlYeeHYYWytnnXSy7whOE/5JRH5+fpttZGQsmpUXa99qoU8vIYGzJ+8PuyPosbd3u51Ijl97O2k4\nWFNAUkBWQJLhM9NrfKJXWS7gJyAsGdACvz6SJtE1Kx2jw4RmcpEVHU3Vtz0wTVuI8m1fFCUgn6jD\n0KEjjQyMtD5L0Ne1jNiILqvmmn/9h8lfbcaNCwf1ePDgwIEbVwurvjlC7Wv4W0XFFWmCILH+lhoH\nQ/bk+ix7L0INJFY3/DRgCHlTX0QepOL3a21yt0H6Ifa5PHBfGfyrGtY5aEd0agtUVUFycuuW8BnT\nxvifRSJd/KJ5weDGRQmF5HKYIvJxEHwEklUFFdX3+QRL5NJwNlsz0NCox04F5Qgh+PiZT1u9bqNR\nx71PDWy1TXuwNutCFtx+jLIXJxGnCf8koiPusySSWWtcS7TFJzcsQbRnHO/0XtSqb7JqQhXrBq5r\nlPgVIHmMRNWneTnBYYXCfvDF32Dlg+AMjIE3RHgr/TWVsGl6unoZCiQwDADFqiH5/OeypkOv6Thn\n9bm4Ix380k/h+8v2sHHLNTys3cHD4g4ect2ONd50XOvKViKYxHn0on9IpUfZ4yHp52+QXY1kMP3j\nHzDXOdC7GxlPRUVB8YdgNiX9Bghfu1pq2M9uaqn2t2tou5rlbOI7Kqv2kbDpO1Cb9KFCXEG1P7ao\nsd/g34Uaixk1WodnsA5tZiTcHAb3h3mjdGwiBOl7QHb4/m+85yPXpyA9WBVsDOowysvhqqtCZ4ne\n9vqN/u+lDh0aWpuut1CQUehKdyxYceJAj97vp28KCSlg8Be+BLkan86/hsrm1VtaHNcc1901iLkL\nW8pltBfHk+l7snHah38SsWPHDjytFHVuDoGgwJjDO0nPk7D3eub1msGkpPanwf+wv5C7FuwmzpDP\ngfhoqnUWqqu6oLq8U25Z58JgrUJbeAEus1e2NbY/mGPatzaoeaC+CFyVoLeCpStIevA4oK7Y+8+o\nmvmg7wdcEueN3dY0wcPKc4CXSJvr6zSQRDDrUCDwKB50qtcdkk8ue9hBc+tWQiJFTmfH9TM5mJiB\nVcpm4tYPiD0YTey+hGZtZSw+F4Tm+2mI1NEQZEcLius3c9hxgHqffnwEUQxmJLVUB0g8JyaeQ0
RJ\nEaUDxnFkwiW4LRHojtrpuWEVFx1qWV22eUSQS1FYMWoYe6Z0a2jgv3MEUFEDXSN8H04TsR7rEYjd\n643UEoIbu47l5XFed64kdaxoR1vQtJEhDY6SnFIWDrwDZ53LpzR6BGcI6701jGA89vAYNk06jyrF\nTurB31D2LMZD4KKRDj3d6ekfrGupoYxS9Ohx4kBG4f4n7uWKP85t570JVi8/wt3zf8Ze22gYjJ4c\nz66t5dTVBh85N5RMJyGhc/J6jhXt9eGfJvyTiLYWbENBCIFOp2Po0KFtti3LruCbW9Zjq7GTu6mG\nV8cMoqo6DkXTcMuyN5A+QkJndpAx5jssUWUows2YC6p4qfglontBWOLxB4MIHxdVZYMtTyFnzCHS\nzGn+/WfNm0Z2nyxiC+IZtWICsUXx1FvqKeh1BIPTQJe9PVrEz9ToDETf5SJ5XSIFG0sQQnCA3eRy\nGLlJHH8sCXxS+y5m4bP+NDDUGtH0GolbuzDjsnkYbF7/vYTk9znbYmt47a1ncRhMSIu+xFRlxWlQ\nMOStJOzXx2nLGRUx7Brid2xB1lTyJ15A8fBJyJpG8sZvmLYtjAiPp4Wd2kD4miTx5Yjh7BqXClEE\nWYIQYM0HWxfm1cGHQ7/1Je86fPN0vfcgITDLKqUzbsSqN3Y64a9Y0Ytp06JD7vd4VDIto3G6nUQT\nSyRRFFGIg/YJtBXze2ojR8PFCaCTQCcjuR0YbBX0+ewBHKrXMAnDSjxJKMgIIJ9cnDiQkEggiXrs\ngMRm9cfQMuOtQAhBWYkDk1khPNKbwt9T+qhFu9QeFtZmzehw/52NU6WI+Wn4sN6znt2G3WS6Mtu9\nENgASZIwGFrXjdBUjTstb3DEZcCN9/XP7p9AdW0swgqyLEh22DgzJ5uoAbvRf/IEQhZIsoaseMj7\n4BLOungo63K2E5bY6qnaec2ABJHp4KqQ6f/ow9T9/S3//u/mfAVAzpCDbJ22IeDYsJJYkp5ayZVZ\n25FwIyEAHZGajn2vJJL2aiYPr++O2yOoc2jIbgd/GvlX7NV2Zt89k2n3ns1DtQ81diiDK9Lr6ige\nmc/ap1YwdeFMANwmN9VdCtg4ZxN7Rxdj3jmb65/oT6TjByQV6oSB3eZofmuHe0KN7YKm17P11n+i\nhjVafLnnX87rZ7m58fEvCGsW9dMwoymKimRXWqpXHiTU10P1DlIf9l0DOt9ipb+tG5zx4DJTL9sI\n//hV7uk7ms722h4+3LrFvnmznSFzPmb9stc5altLBWXt/r4XcTk2hsMZ0b5wIt8grjfhjEwk65yF\npHz9KOCdjTlxUIcNGzWoqEhIhBGOlQhcuIiKij4msgfvOxefFOjuzBKXs2VjCb8/93uiYg2sy70k\nxNGnLk5b+CcQmqbxQP0DPK497n+x1x9Y74+97ggyMjKIjg5tWd0Z8TqHa/VoTfrd2GcwZcmReB35\noDfVIznh5iVnoIsNlHt0O0wcevQGfv3bcwFZtg2Wfij9/rYgBNhLoeLxBey67hUG9Pda3fJSOaSP\n17z8IRZ+2JMEZ6BVqAFVBhPfJ6eQO85G+cE+CCA24yCTx1hZfPuZ/oXukCUhAcWpcFPS/SALdMKA\nyWOiICaaj8eN5dr1K7HYBDq393iPIhCazEfiLRyE1mvX9CYq71nlXekOVs7Mh3v+uhglSDjmzxmp\n/DR4GJ4uOognyPE+C78+Bnqv9H4+Lou3jq6kQl08lAwGj8Wrm2+sgqgs+lWGsff+YxcJbI6SkqEk\nJAQaH0IILrggm5Urm7srBeHhEkueK+K+a/6IXXilHRQUwonEiIlwIokilh0MZQ/9AAmuS2lM6W4K\n1UP/Ny7DghUZBScOEknCTh0qKhbCMGFGQ8Ohq+dbx3KUEA52R72Hn78rRZJg9OQETOb/bdv3tIX/\nX8Rnrs+Y5ZwVdN+8tHkszl0cNCwvFOLi4lole0edk9xaBRWJ+pGVKMOKMe4Mp1YbBQjGXfUco+e9\niqJ3IzQZobW0evQmBwmXr0LTfLIJzS7tWF08kgSyHjgwmAcfqeQ/i5MByD0rl9S1qS3a6x16rLsm\nE+s82GKfDFjdLrJMiVTu6U7DqHR0/wCWHqnj7KJdfPfkoDavSdVpHNDvpq8yirxIAz0KXSRVVDC6\n9idM9kayB9CpEh6dxhme+azlWX+0iJAk1MSeIMkITaX26lf+H3vnHR9Ftf7h58xsTe+BQOiEKl0Q\nLDSlqCBYsF6vvWK7tmu9+rv2Xq+9VxSxYQVUQJEqRToECCGk92STLTPn98duyia7yaaARufhE2Bn\nzsycmey8c+Y97/t9wWz1nnCwQZSus3no7wzeOAQV1TexqbOIrzm45wDxe80QM5y8C29ARiVRN3yX\nIDzgiIFYX2UoRyzYyrwGHyAyG8IKYc8U8JjAFQmKZFuvbfCPYngnjeCvDqGTmNg45HDOnD18/XWg\n8B9BeTlMv6gz4eHvsH1TImbVTXL3JKSUXGC6FumL782qVx2tViqqIYpKMinYfBO1Tlx48BBJdD1p\nDJ3kHom8sfeFoOfw0Wvp3H5JnatLNcH/PjmWyTPbLqj4Z8cw+G1gavlUvuf72s/hhLPEtiSosQfI\nsGYwNm0sM4pn8J/8/6Cqaq1vPiMjg4IC/1HksGHDgo5SAJzuKq479i1+75vK8A/uI23IWp/MgiAu\n4x3SV03gqLNfqou4UXVkoOLbgB6mBRxYtQUpoTpfheXT6HllnbFIjUqli7kLWe56OkASkvalEFGi\noAuBGsBwVqlmiq1h+BsvgdsRzobMbUy+MY6HL41ncpfJLJFLAto4NUdlX/VuYoaMYckFOj9JQXiJ\npPumUizOxhsICfGKjeH67WznDcpSoig762GkNcLbaZPN/ykZ7Omo6Cy+dQHfKZ9w/FnXkMVy8shG\nrwktlEDxTyQ+sRrHuS9S0aePt//mStAUiM3wfnbbwFbqX0RdAIoH4nZB3hCQJihK824zNRs2JMDv\ngWUGQkVVaTRhu2OHk/nzm4/1rKyE1N75VFR4M3eFENy5/AbuP+5JdE0SRzGlNSHF6Q7oHeb14dcg\nJTElJYydPQaX08mGRb9jdpvR0IhOjURUqyiqYMpFkzjvv2cG7ce4Lh+Qd9B/meaBy09Zzq85s0hM\nDqzc6ncuVToff1/J/hwPowdbmTLW3iiEuikcDgc/3baabsd2YvDphzeUs00uHSHEo8AMwAWkAxdK\nKUuEED2AbcAOX9OVUsormttfR3LppJSnkE12m/YhI9vmTnvz2ye4ZfM8CuMPMi11GAMnfYVQ6iUR\n+aL4QnFjuqrsLHn2bopvvA0RYITfWqSEkkcupOLeO3GXdWPvHgennfgT+9K9MgkSSfWk79HO+oyE\nBBt99P4MueASiszhHFGSh1nWGTWXUFiS3JNlnXp5fd0mvHHlArBJv9H1hSeG88alnetGijWBLTp0\nOakHQh1OzmVj8Ph85orbaz8vvgnicv0vmNMmKcFGf
nU0TqvCoptGI1uYXCM8bkzmV3APeZwu88bA\nh6VIwEmgotaCzhxDN6aRnVrCzLcuYPZAO8O+/gjMVeAOA3sRqAEivhzxsG+S9/+mSq9Ou/R0AAAg\nAElEQVT7RwLLkuH5gS3qc0N++CGNiRNjaj9femk2r77askJCigJjxphZtqwbJl+dgw0Lf2f7qn2c\nfZ9v3sOiwMmJEOMbj5oEJreH0xN28sEjrRc+HNvlA/IPBl9/3tV9uee5pr0i2/a4OObCg7jcksoq\nSbjQGF6Zw7k9K6nKqaB4TwWaU8NkM9Hv5FRmPD2WiIS6h8gD4S/gcfj/3qY+fyxjrmo+IKMpDkuU\njhBiCvCDr1j5wwBSylt9Bn+hlLJFGQ0dyeA35SMOlbYY/Ec/v4Y79Bdw65I0c3emn5CDxd5Ya6ap\n2rm6pqKaNFyOMLK2jOCT21+h66NnI4duaLHBb+o4Wd1WMHtkOFvW7uFgVhOFXICJqTqjDpSzMzqe\nMJeHrlVleITALHU2RSczv8dgZLTvQAp1gTP1j60DTo2IqDzc5mw8Fz6EHLYR6yYLibekYN8bzp5X\n5+AOb/DmJGHoEpj2sqiN+tGFpDoc9lbEo6lmvv/XaNxh5hDjVnXwOEEoRH3+IMdtcbGs93d02X00\neaz2nUDgty07yQznJjxU8UHBP3hpdwm3bvsawoqgIhnC8/1H+N7OQml3yD7SezKx6dB5PQDJlghs\n140jM9NNdLTC2WdH87//NZayDsbll8fz4ot1MgWvv17CxRe3rXjIJ58kcOqpdXLBU6ZsYNGienNL\nKRas/U10iinix8eH0rNH66MJAkXYNGTI6DgWrJraZJthcw6wd0sZx2Xton9pvl+obbBvhGIXHHfj\nUHZ8uIWS3f5zZ5XA6yRS4ItJS0gwk58/s9m+NuSw+PCllN/X+7gSCO7LMGjEl5VfMiO85SMWl7ua\nlyyv4nZqoMKorydiOvmNlu1EwuoPL8UWWUb6yknsXT0ekES9fhVlz1zW4j5Bneu6vhu7+v1ZdCo5\nyKLPQ8s/6FKoY5aS/qWFKFJSbLZikhKbR2NB6iBkmPDeWTV3V4C5TcoAXaGiqjPQGeW+twgr/Y3U\nvU+go5PQZRB7lHpVtnRJyvpqYve7KU00kdctjATfSDA/FQ7Y7eztkkz6UX1BCTIh2/CJ53Zi3v0r\nll2/Ytm+FMVRwm66cvLuW6hExUUZJewMeh2sPveGQGHR62tJnT4MbEXeCdnwAu/krb2ogdFXvG4c\npNe9k7Ct9pocFdWTz/YN9zvGzJklXHbZPvbvD6zcWcMNNyTyxBP+iUnXXNP2SlGnnVaAlHUG//vv\nvaPcRx/dx86dlbzySvNa+KGQZm7e2AP0SItocn1uoQf36n1cmeGVVg51TKRVSb64bwPgov7MTB4K\nz9PJ98n7ClpQ4EaIBUh5aoh7bxnt6cO/CJhX73NPIcQGoBS4U0q5PNBGQojLgMsAunVrPIn3V+Z1\n/XVm0HKDn57xG1nOupu092d9KX4oClt0qZ/NkTo4KyIx2aoxWeqEp6SEHcum8fPrN/vt12R1kBiW\nzWYk3Rp8ne2EE0kUt/MwS/meT3m3dt0kTuJNsZCJ7oHs0neCSUMrjuLoXVfhemsqP1bkhnxuHzlU\njsNCL7NCgquaOHfdG0FaeQHb4hObHl278A2a69ro0k514gjCx05k4I9xuGKsrLB4DaWlQmfq3bmE\nF2ioLoluEjjMJbx/UwpVtlhKswahd4pEqjWyvk0cu+Yp56rClLuLyE/uIcHTj3hOQiKIo5+voquZ\ngfyTUjLYwosBdqSQwgTf0RT6j+1O736RqL/0RrMVgb3YG5lT0QkifLWVPTbIHuUtlai4oNciMFd7\nH4CuMF4cdUKjo0ydGkN0dPP+vobGHiAEWaiQWLGinHHj/JMJb765R/vs3IceYq7juEmdm1y/8f2d\nzGihsa9pGwlomKmvbfEaSfVa1P9XsnhxDscf34n2plmDL4RYDAQ68h1Sys99be4APMB7vnXZQDcp\nZaEQYiTwmRBikJSyUelnKeXLwMvgdem07jQ6Jrept7Vqu+jwZGy6hWqlmkg76H1yKb3vXOQdH2C2\nVWG2V+OutqJ7THx81RsIoTH51vvoNHAzADt+mEr6h6disnhdQLquoig6g/ou58eHvPHr+5uIO39l\nxh7KfhgPWhXW6FT+8eE9MBF+NG/lnmcX8vqnGznpqJ688sA5xP/0UYvOzYVgMXYipMplwoWtng9/\nSs4utnVPbPpuC1aSVUj0YWG8HTkWJq0FKVE8ghHvlRCZ46l1hysuCS6NaQ/lUSmy+eW0ODK7dKI5\nQ29P387I74qJKtKR0kW+vo4wLiKCFFSsAVQ3TUTRnWTGkou/9HR3TiSGvmi4cStZDD6mBwC/nHQy\nRy1/GqojvUZfL4OKBKiO9xp/3QwJWyF+F6hu35yFSoqnF50iA885lAfJHm2K/PzQs8Wb44cfqhoZ\n/D+KqJim52V+vNabL9IaZ64AVAROVBQEem2GeeC9TZu2Ao+n/Uf5zRp8KeXxTa0XQlwAnAxMlr4J\nASmlE7yhB1LKdUKIdCAN6BgO+hB4jueYS9s090eHNS872xC37uaceZfSo7InR/TvzIblU/ngiARE\nlcbB2f9k8CkL6HzEBgp2p7Fx/jlUFiaAVHj3vAVYostRLE7COxUw7+4+zPt0IZtyFHSpkhhRxFf/\nvbfJYzsqHPTveiYVpRtQcSIAT34Wp085k+c+fJmrbngeU9bPgGDBUo2PXvsIjVkoQWrWBkdQ6dZY\nmJDMtNIC7B4PTlVlzeAIr0FvKjlJgUblugAhdDTdBHtSoc8B6L6f8AIr3Vc4As59unCiSAu5vYPV\nEq53SJebSe/mImpvYjPJHOkTTlB8ZxQg+gcTAziJGKwUUYCZaDoxHjN2dNxUsodKfS3XpT3Af5Zf\nyZiusWw/8VL6f/sKuGygmyAyz/uTtM1/5xIo7USXnQnsf3h60L6feWY8Dz/csuCD9PSmXUAt4aqr\nYppvdJg4ZkrwEX5Fnndw1JaZO6/Tpqb6gyABjcwgiXGHSnCzrZO204AngPFSyvx6yxOBIimlJoTo\nBSwHjpBSBlJ4r6UjTdrW0NrJ22pbNVaztfmGDbhm8bVsnL+J8L4DWfzx1XicVpDBZi/9CYstoP+w\nn7nrquHM6jux2WOl79rDjq07SN+dyYO3PU1p99lUdT4TqYZh9uwh8sCjqO5yrMnhlJemou5dgtDr\nMjF1xYreaQz6wdZJ0FpUeGt5Z95YtpnkC25n6w+n8tPTd3h14eufpqz5S3jfmBu/R3pjK6PdoFih\n936YsIZhC0Yw+NNPa4uFNUQzq3z0r9Px2FMIel2l5IjXv6B7ZmyLk+kA7HjoTzlFmMkgAomLQn5j\nL4uQeCtfCWAox3Lj/Ms4/rSJZFWVM+Crlylv6KuoiUQChj8RQ9dVJsx4MCF5aM0seo5q/KLuculY\nrcHvObsd
HA7/gcko8QrrOI72iOuXMq3N+2iOa878kW8+ymmyzbMfH83004O7lCvyqrg/+b2g61vD\ndsx8RQTuxl9mysqmExkZelWuw5V49Rze4muLfPG5NeGXxwH/J4Rw4/WoXtGcse+o1ETajK4czRq9\neWW+toZivlXwFtesvI5H1p+Jx0/psumbTwiN8JgCTLsTmdl7QpNtnU4nF8+5hJ8WLcWjSTwunYq0\nG3B2mg6q95hu0ZuimP95v58eQUzmaX7GHkDRnYic1UjrKUhn09IQgXBpcNLY4/hAewF7QjnCVAK2\nQKcqQRMNDH39Ub4AKaDU6tWpQRC2zcqgBZ82edWK0g5in3EW5T09UNgXvn4Cqn16xVKCx4n9h5eJ\nz4xHEKg4deM3jfoIJLE+eWYrFSTiIBuF3XyBjpswUoiiBy7K2MgKXj59HG9bdvBS0UWUnX4jBxxl\n3Pjbj/yYm02F7kCvNtPtR5VB7+ikenL8xo5PH/kSil3wYOFtWO11w0eLRcHpHBXA6HvFkByOo/yW\nnhN+J2lUkENXsujV5Pk1R3I7SHiEwrPzJnLyzi/ZvqGx5v7Q0XG8+vUEYuObHnxFJNkDuOVaRsNv\nQz/c7MPJZqy+B7tXBi4KN+ZDlCLV1iidgPXPpJSfAJ+0Zd8djQetD3J8VZPeL05QGk+ctQQpJSll\nEWyeVonys7uRtHGD1tT/eqlWFz36bERunBwwSSRrVymOMjc9h8Rx//UPs3ZVV/L6v4bHFEXYnmdx\ndj4JsKLmbMdS/CtSCcMZMwlp9vrUFU+QED/p5pLbbSz90M7ObZUtOt8uXb0ZleFqJJrbwq7lJwW2\nL0IEyMwMFEkD4CYxJ5OprwaPkAHIHZ3F94s+q9tVRB5cOQY+fA8OjsG6/GUilr4PmhMHc4ggNURj\n4H3gK4AFjXjcmLBgxoIdjSR00kkgnuOJpb9vCx2JB4Wd2F39uDT2Dd51XU7XsCjmHeMV7rqz0yNU\n5pb7zbw07I1eJfl32APcc/AGojvXaf3sWXSAxwdksHKbi71Ek4KD3pRjRXKb2AxmuDP7PNxuF4qj\nFAXJJL7iC86i1KsDEcJ5NyYnp250//nnOcya5f872b17KL17t72+A8DC9a2P3wfYvTx4Fa9QaXiV\nBDANByfgYB8qGgq9cGMCVjyzhUl3DA+wl7Zh6OG3E5NNk7mewIVJAHrSk+/Dvw+6PhSEEAzdPZjw\nM5citSDZtyoImwcsAiIkhGtEp2Qy86FriFmfwIvP+ksP7NhSRPLYj+h6azYTFyzh+DdP57sB+RSM\nHElc+V6S837DHX0ylEP45oeI3n4F9pxXCMt+nthtp2Mu+QkAj71fwO7o5jj+e/NJrNx0IpfNbb4+\nan3enDcOgEt7X4WUgqGz3greuN7co2IK5mOWmC3VTFm4EghupiTwwydf4deo5t/ZF0CvRaRc0h17\nv6OQZjv7WIS0udGVugiMumxmiYKOgg64ScBBMtUo6CRSzR4i2U04HrzVec2YmMRlJDAQFQsqFkzY\nMBGGg34k4cLtVjmwo+6F+dN/fUtlbp2POdhUYI3H56rjn61dtuiedbw183vyt5XSmyqOJ4eBlGGt\n/+hww30J77L4sV98DiYvJzOPeHIQQWfKA/Pyy3F+rpw9e0oaGXuAPn02tqiGxKHko8sWtWq7GhHr\nplCB3mik+Yw9QP720HMkWoJh8NuRJyOfZDWN5Wiz7dnsidzTLseITowieeAmwmKKCPhVCgNLfBVE\nABZBQv+dXPTxVNy7U3jypfMY0a/urUDXdYZfswaTK5Z/zjmHObedw5EXf8qR017lmK630zvvbbpn\nzGP4npfosusxrMXfIWQ1Ah0hXQjpJHL/PaA5cKRcgxQ2P4MgFSukTSbMHobJpPDYsyP5YdXkkM7z\nost7MmZcIgCjkobRK+Mheo7a5J20bWgEJOCU2GKKSRr9uzdWP9C1USTWoqomYiO8VMSV4IlyBx6S\nWd08+HsE42/eiunLr5CvD4ZNGpv2/Yuymb8hVQ1p0iibtoldHz9CsbKICn4jT/2BjTxEClWYfM6B\nLMKpxEQXqlB8V04gKMJGw5dv74SwHReCRJz89t3e2nXLnlwVyiWtxZqrsWPTAYozy/nh3vVIvXmj\nBLDy8XREvZYKkiksYCirsVJB8L14l69cmYqUaVx6aYLf2j59NgXdzm5dTkVRy94MDwWle6pb5c5p\n7rsWjIGn9GjFVs1jaOm0M0dGHtnqSj+hoNp0VFXn9Bcv4IPzP8JRFuf1TwPYwRTh4Ii+S1m76WRA\nJyqyAObcxucL726kg3LvXd/Rc5+TwYO/J/akDZgiJFW77WwYMwKtUgW3UnuDm8qXAwEyeYWKuWIN\n7ujxlPZ9ibCcZ1Gr96FbEzGnHUXx2of92o8ancCB0ln0Tv4MZxNKu0/8z3/+6dr+1+LQLuWno+ax\neeWxYPa5rCTeh4ATooYeoGBfP6Rq9upDu8GbhKSDrhDRPRvL3uZvv7gZTYcJ7nxhDUtKnwcFwnt+\nXZsWsO/T573HFAqYdKSER6vu52TLbDRN40jbiegejVJMvgpYAisaVnS/kVewb4+gRihaMnpG8OLc\nTSGAqELJ3St+ZtL74SFM9ddhwkoXjiCHLUhfrIkJjQFsJJtUjjktkk8+aTCmNUuUgTqD5kqOGGEi\nb28+j059nrw9hZhtJiZedjSKjPHFqDfuRbw7i6vivaG9U647lnOfnNNkxbdDhTla4MnXCVR5qz2p\nccSmDG+b7lEwjBF+B2N6/1loHjNx3fdx9btTOGHuA8T1T8eU7CCm1z4mz3ie37d4I3BM1mqOXlrF\nvx+/NuBN8v6Kb+i5rxDruV9j9k0m77uzJ1qZ19h7Eb6/ZRNGwbtGC09DT7yekqGf88LHCyloYOxr\nyM91NWnsAWLUj+mb8hmZ++u0WsJUO78/dgHXnRUJbg9USdAkkV2zSDpyHYX7+6J7LD6fO95sF7sg\nslsOaicHVs1J1I7mk8Aevvd+hFs0trwScAu+Wv2zNzBKBMgBMwOmOv/SbZ4rAVBVlVe+eI1M7ytK\n7fpAItGxuPxG0jWoSKzoOJF06tn6cEYpoMzpInOVN7AuVPNZU6EriX5oKLgw40FlO4M54dxezJ/f\nFynT0PQ0XtsRB2+Uw8sV6Nc7+N1URczLq7mh7/+Ru6sAqUlclW6+e/InUshrcBydFA7Sk330ILN2\n+fdPL+fy6Bspy28UhnXIOf4/jQNgDsXArubdtOZ3094YBr+DMfOEf+CYPwV3tQ09pYwh577FJddc\nwVVXXMqo3ktY/NlcpEViCStnVOwKXv7lIpL6N5ZWPuLu4YzM0ekpHVTVE1wr/TGmUXFygEhioFEN\nKhBSwx3h1W7plP8dA/Y+gT02l7OnBE5T13Wd4Wlfh3SuudlOBnX/mmce3+q3/Km5Scif+7Hj41S+\neH8Lc9+eDkJFrR9QL3zdtYMa4UY16aTPH82KTUcH9KtKoMpmZ9mcs
/jH59WomYl1K+pvsPhKLGkb\nQ5LSEQJK7Lm1RbqHTOrMfpxI6h46VaiNHqVJOLGh1Rp9gURB0p0KNODJqsv92pvsLSyqKuCEYwag\nay1PuhIITFi48PnTGTm9D7ctvorf5CW8826v2jaKEFy6IqNuIylJ3pLFKbcuQNEaG8nhbKAvu5nF\nV5zFJ5zLfCbxM0ezhl74T5ZWl7uYm3Q7v30RzA10aJhw9WhGXtsNTbh9k+g6Yd0E5346mTFXDGDs\n3IFcuXImA2d1RzG17Q1EAlEpoYdktgSjAEoH5f6XX2GVZSmVZiu7vj+R6FWJTN27C+3IA+xIjufm\nG2cxcVxgfe+ps2Yx4puxmFxmNGHmkzPXceIrb2GOgLWDjqRqa3ijbSSSHAooF8VezQbh9QaW9/gv\nnqij6Jv5MnFF69FUK+FTZ/DjV1cHPPal561g3nuZAdc1xf7iWQC88MxO3ngxnfAIM69/OIbBIyK4\nVyYx755b2LdmFp5q/xtFMbno3G8j588p5IFjrwLg20d+5P1b64LI3CYTKyZMJatbbzSziRrrfuL5\n8/mmz20gBFLV4b1nsFQnkfSvcxBq6MbyAV7kPLxG+r//fJv3336F7szBTCQCQTROeuCo9fdqgAuF\nHUQhgXhRTbypGqe7hEeK/409xua3/x8f+4Uvbl4cUl8ksGmyhUXf38qrExeyd1leq3zM/9p5Ool9\ng79liDe997Gt1MHsGz9G1WoeXoEJJoHfFG/L51q4Rdtxuzxkrs8mLjWamJTGdWwr8qp48egvKcms\nQHO2/IEqAbdF5dGqC1okuWzUtP2b4XF5KN1bTnT3SEy24FMzK9avZOGY5Zjd3tF6oSWcN+LzmPDv\n++h3STV57yWz9/o0dEfgUaO3rFw5blM8Jms/rLqL8KqM2htZCsE9r97GtIsCh6DGmuehtV9mPgB3\nvOOm9OzLePeml8jePhyPs16R9rAKTNYCyudNq22f7crhtq3PoT2skb5/LOuG9cGlWWmUzRVeDkf9\nzJzk7rx51BTCrs9Ajc2h833jW6Qm+iwfcAreQtq6rnOEOgsbJqIZRRhJKAhsQDwuLOiUYaIIKxJv\nce5tx29EX/wNv1esIzy88cMY4LaYh6gubVqJVAK53QXjF8/iij5DOLiliGcGLwBCd+vU8J/y87FF\nBM+tEG+uRbg1zr78HRTZHilajbl3zc30HNX9EOy5beiazne3rWHZo7+3aLua6ahbtp5K8oBAeR3B\nCdXgGy6dvwgmi4n4frFNGnuAD96fjxR1I49ITzW6OYmV18XxxahY9m+pQO+bi0T3VWTyHxBYsRFL\nEqUplxNTlUFEPWMPoEiCGnsgtHCQFnL/P8xM+Pk33nywlPge20jq+zsxnTNIGbAes83f2HfbPo2U\n4v681fUJ3n3+CX59+lFcWiCtBgGOCHCZ+Sg3g7DPXoE+v6MVdkUr6hy0qFV9atrMpK4gh6IozNv1\nLEVD49jWYxnb7PPYbH6fLNNOsrCxlwgKqYt2EsCyN29mj9we1NgDTLhpLEDtDEHDH5cKa062ctKS\nM7iizxAAMtYXURP816Jfi0KTxr6G6f/98pAZe4D7JzyN29WykNDDgaIqTLp7RItOXOIVIzvjp5kt\nNvYtwYjS+ZsRFRWJVs9nb9E1hri6sEu1U7zNTck2E1CMQikWrESqaURqNXHkXvJjjqKo+xjClGy6\n7FtYF6ouBC8ufZamOOXUriz4+EC7n9fpEzZSol1NzgvB77JZWVeRmbgULPVmjHutBnsFVDV+PQfq\n5IcF0DUbNIW8J96n03+mgdkZVCm5hku4EYGg1O3kxB8/YkWpbzLujoHAwNqwjAPFcMJcgeKuyeb0\n7qTnSAcpXVIaHsIPl8PNBlsZjjCw11OxlEBmp3i+OGkyagS8crmV83rX+dqT06LQUClGI5bm8oLr\nuPCbac22eTTewsbMkkNm7AFclS6uiL6Z16qeOoRHaR3WCDOX/Hgir04IPl9V/yFr7h7JzStmEH+I\nfPc1GC6dvxmFhYU82uUVrM46P7AOfBtvYXfZCyieEkBHSDDFnIrDEkevvO+wYUMTVvZ3mUlZryMY\nMSWOb24cSFVeIZ8+/QVpo/oy6ezjmj2+x6ORZJ+Pp53dOgDPvDyCCy7tG3S92DYYUrZ6X0Nq+O1i\nWHk6ZPT2hq+EV4DbDE4rxBbBiHp5FTV6/78cBxYHsac9SNjoLxpZSd1tpjpzPKeYT+H1bnM5WFVB\n6sKXg5Q6qYcH+nxuI2GTGY9d47p/pHHe9ZOa3GTp06t44alvScwEVYPKGIG1UqKr4A1bVfhq4nHs\n7+4Vges+MIe914yrjdq6Nu49HMXeRLVoX3Ckb8uAXLtpNp2PaDpksLywgqsT/h3yA6St3PL91Qw+\nYcBhOFLL0dw6T4+YT/7mwJFFs189miMv6t/mUFPDh28QlMfeeIr8q6tqDZiqqSw8/2O6/NqH9Tv3\noVu74DZFgWMN4e5yBnQdiX3E6cQmxfLkI+OJi22p+qU/UkqilZbJJoeC3a6S6wheg0ek94IkX8KS\nMxJKu8JvV4Kuwv7u0NUXESIkVESCrQqsDbJ2pSRRaAiPmz6RsaS6q/iq826snX9Dq0ymbOMl6M54\nzOE5nNfdgkvXWF1wkF1VrQslLJlxJdG2wNd769e7eOWMebg8GmaX1z+fnNE4fNZtFrx6zhk4bV43\nzBd3K8xI6QlA/r5y7hv1JZWFXv+/CY1oJBXJxZSlFhG9P4HIvBhie4dz866zQjJMF1mvw+Nqvs5t\nezH2vCO58p1/HrbjtZb0nw/y5dW/oqgw/dEx9J3cfkXTD5d4mkEH5KYLr6fsjDJu/d+dpBemc8E5\n5/HAUK/wm8ej4Sh3EBEdjhJKMdxWcKgSZ6qqmjYy1uoEnK6DsPVsb6Fv3dcPRYcee/0bR5UG1e2x\n2KI4MOOi2kVZrhLG//w1e7JSEUInKm41ZdXVvJHR9vOM+fIF5Bn/8lt2b+enceQ4EChodhWLS0NX\nCGjsAaSQ9N2fzua0gYDk5m/3MeMir8FP7BHJ0wXnsH9jIXm7y9FOKGScGI1Og/RbAa6KdO6MvLPJ\n/hbsLzqsxh6gx/D2M5yHkt7HpHD9xtP+0D4YBv9vSlREFC/c8kyj5SaTSlTsn6MgRXuiS8ksz+XM\n27occoeCbOar34StrvT5o17/JYOLF9cYt6NQgIWXCE5csS5ARlbrqXA5ibB41Rzvjn0cT4mOCTMC\nQXiVRjUg9OBdVj1gEXVKkbqncQRWt6HxdBsaj628v9fYQ6Md3sVd3FV+V5OKrwc2N1El/BBxwjUT\nDvsxOypGlI7BIeX9Zz7krMHnc/3Jt1BZWTejePLs9tfGjYkLHEoqpWT68i+Yt7MKckY2b+yboYc9\nknd+rW/svbEwOpIT3y5s074Dce+WuopYzhJ3bcYrgMcnXNbU40UAGV3qCrk8Pj3wiHinthMnTYd2\nQtM1IFL6t39ZvqaYedcU
TOYWJp79jTEMvsEhwe12M9E6nXevm0/plkp2fJXOqRHn8t5THwLw/oIJ\n7X7Mb5YGFmZbWZTDotyaZK+2j7zfHD2Z87/X8C864/txxYK75dr/TVHsq+tbWejAhMlPxEtvZipY\nAgUpFoqtnQBJp66lnNy1Z8C2BbIg5D5FlQeOakrqlRBw+aFg9r3TOf3/Zh624/0VMAy+wSHhvGEX\nYXeFo6Cg+v6YMPHmDR+wbl0xS5bkkVV2KvZ2ikKbMTuFQYMbS0gALMrNbLfw/wGRsQyNa1h8ugGF\nvQIvbyVz+w4DoLLA0SgvQmnmFtYU2N25N+FVDu47U5B954igcyijldBLbpZTzjuudwKuG3fekSHv\npzWcfNcUXnc9zey7Tzqkx/krYhh8g0NCwdbiRsZIIDBjZuKotzj++J+JivqS4cd24eUPxvLLxqks\n+mUyB0pPJTqmZS6Xbt3svPvJMUHXZ1aWB10XChahEKmaeWDwUWyddq5vaRNK5+3ovzcBw2K97q+3\nzvsIgaCEIraxiS2sp6JxPcfanmkKqDqMW7ONL0dXc8fEph9EJsXEBeoFIfftDucdAZef8/ipmO2H\nbnpwzv/NNNw4rcSYtDVod5YuzUcjBvAPaXRhIYvuuEigxlh+910u332XCwhefRs2Sn4AAB7HSURB\nVHUYY8YlsCt7FkP7LCQ7qxlJTeCY8Ql8vmhi0FHrxpICXt23LeC6UCiddQlRZlvjFaoDtCCvJ3G7\nWn28+kQqZjJmXAJ45yEK15aznz1kkI7ukyfOJ4fOpJJKz9rRv4rKgTSV3FTJ8CVeH/+B37O4t88T\nCLPg1g1XYbUGLun3RtgbmBwmXtVebbZ/4SJw5m9UUiSn3Dmd+Xd82eJzDgW3y43ZcoiqfP/FadMI\nXwhxjxAiSwixwfdzYr11twkhdgshdgghpra9qwZ/RjwevbYq0c6d5QixgAkTlnHAJ6FbQwFJ/MBM\ntjMUJ3b8azN5jfUll6xHiE9ITf2GzxdP5EDpbJavP4GMolOYPC0J1Tc8sYepTD2pE9kVp/L1T5Mx\nm4N/jW/e+HOrz+2yrv0CG3vg61NNeGXOGogYJG4Bc9uzyp4fNoGy064h1uKNwV/y6K+4cJLB7lpj\nD6ChcZBMHFRgJww7Ydiw0WW3zrAldbIDG17dRnW6h+rtHu61PcPV4nYutF3DZdE38vMb/gVUXgl7\nBS1coy/Bk9gAPrN+FnC5rumU5pW09tSbZfvy3Yds33912mOE/6SU8rH6C4QQA4GzgEFACrBYCJEm\npTy8AboGh4w5c1by8cd1IXi9e9tJT6+iJkF/H2lomLBTRTIHWMcxaCF+3QoKXAwYsJglS8YxaZI3\n6uPTbya2qp/rS0KfiGzIS2ODawJNHzgA88AnceemQUkPUDRI/cWbrNUOXNnHv57p1k+3U0Q+geYN\ndDTyySWB5NpRvklXMBGGk2oUVBQUpE+TUkfHghXVaUI6JfMuWshbl8/jyaz/EpHoHbUrisLOSG/Z\nwajyKMrxd4tNUibRz+xf1rIsv5z7Jz5F9pbmaw60hS79DlP1878gh8qHfwrwoZTSKaXcC+wGQp8R\nMvjTommSuLjP/Yw94DP2UFsMBTP7SGMbQ1jKiXhCMvb+BeEmT17Bt99mt6m/vcKDaOQ0w6qjm4/+\ncJ1xA126HYB+30Hfxe1m7F2nXefnovr1+fXkrixCxeRn7sMIJ42BHMEo4vDq99dF8EhcOBEIPLhx\n4cSJk2qqcPr0+Wsm0k2YsLsjuDPpEe7o1rhoTVlkGe9Y3yGVVPqL/my3bWdJ+BK/NtUV1cxNuu2Q\nG3uAuK6BxcUWfvg14xOO56xjz2fV0tUc3H/wT1MT989Cm6QVhBD3ABcCpcBa4EYpZbEQ4jlgpZTy\nXV+714BvpJTzA+zjMuAygG7duo3MyMho2MTgT0Ri4ucUFBzeF7XS0hlERbXOZ/trQTbjfvyk+Yb1\nmB7Xha8nzwZg9e6tTP7kTqpTw1ArnQzd2YVVjzYW6xIfP9Gq/jUk++TL6GSvKx6jeXT+G/kMnmoN\nN25+YTEaGvEkcQQjECi+0bv3Pvb4yqF7cKMR+Pfkrajr/6agoVFJOdHEIoXkSf0/Ler3vWMfJX3l\nob93b1txPQPG9qn9vPCTzdz7z3uJq/RgajCo8OChiAI0q4fFB74iLuHQqVD+0bSbPLIQYrEQYnOA\nn1OAF4BewDAgG3i8pR2VUr4spRwlpRyVmJjY0s0NDiPvvLPvsBt7gCuu+K3V245N6BxyWwU4eOL5\ntcb++S8WctSW16noMw6P+SicsWNZfVQqsbcF1+sJSAhjqlOSe+I67To/Yw9QvLcUze2NtTdjZgij\nUFEZyDBUTLWRUDUGXKLjxlUrbV1OKXvZyV52UkZJ0Lh9FRUPHixYMUsLz0x5rUWn2F7G/p7VN/O2\nfI635XO8qT1D2rG9UK0qR5zYn7flc37G/pMPtnLT6ReRWCkxYao9t5rENDNmEknG7DRzTNIkY7RP\nCD58KeXxoexICPEKsND3MQtIrbe6q2+ZQQelpMTF+ee33vC2hc8+a1u6/sDwGLZWBp9EjFBN/Gfg\nGG7q7+83vzH7TWTCONB9iVTSAhJKRo9g07adDBmQ1vzBJeBWwex7UPoG1rM69aBvZDzXpQ2nS1hw\nt9Nj4/+HqllrDXociRzN8ajUhSXWj803YcaMt7872cxBDtSL6Mklihj6MtBvewAND/lkE4c3cWrf\noizeP+czznl/VvPn2E6c9egseh1ZV9BEURTuXPavoO3vnvscKSQhkbhxY/bJTdRHQSGOREpkMc/c\n+zzX3TP3kPW/I9CmSVshRGcpZY2TdTaw2ff/L4D3hRBP4J207QusDrALgw7CW2+1ZATXvsK4WoA6\nqC1hy4nnIT4OXA5PnhHcADg79QZXvaxZTSd8XQa2nQe56OdlRBWpqBHw3Ion2TLpbAb98EHj0XxR\nNPx0LGMm5fLLxeNQ1dDix9877zM2vrcNFbOfQa8ZudYsk74/fueEpJIKDpLpN6LX0SijmHyySaQT\nqu/219CooLz2DUBFRSJZ+8Emptx3HAm9mneFmMPMuB2tK0bSb3xvbl10DSZzy8yRo2g7Nrx9sxI4\nzBS810xF5asPvvnbG/y2Tto+IoT4XQixCZgI3AAgpdwCfARsBb4FrjYidDo2u3ZVtqh9TeFtpakE\npRCZPDmp+UbNIM+YyymdetR+PqdLnyaNPYCQ9SqD7S5lyNUL6ffUWo78wkViUTg27Fgqwpg75GZW\nvb4Y16xrsCqKt8BARRisGEnUrqH879owVl52bEjGXtd1zhdXs/K9tbXlJCsob+R3F4iAFclqcFIV\ncJ2OTjFFpLODEooop5RsMtnNNiQSK7ba0b9E8ua0j5vtM8DV8y4OqV0grltwWYuNPYCieGsCN5dt\nLJF48JDYqe3fo45Om0b4Usp/NLHufuD+tuzf4M/Dsccm8Pzze0JsLWrHnyoSvY2j/
+ …(remainder of base64-encoded PNG data omitted: figure rendered by the source cell below)…\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Draw bhtsne result for a fraction of profile - profile_small\n",
+ "\n",
+ "from matplotlib import pyplot as plt\n",
+ "import matplotlib.cm as cm\n",
+ "\n",
+ "color = profile_small[\"color\"].tolist()\n",
+ "mx_color = max(color)\n",
+ "# Scale each label into [0, 1] before looking it up in the colormap.\n",
+ "# NB: matplotlib >= 2.2 renames cm.spectral to cm.nipy_spectral.\n",
+ "plt.scatter(profile_small[\"x\"], profile_small[\"y\"], c=[cm.spectral(float(i) / mx_color) for i in color])\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "# Run PCA on profile_small\n",
+ "import numpy as np\n",
+ "from sklearn.decomposition import PCA\n",
+ "\n",
+ "# 'data' is the numeric profile matrix prepared in an earlier cell;\n",
+ "# fit the PCA and project onto the first two components in one step.\n",
+ "pca = PCA(n_components=2)\n",
+ "pcaed = pca.fit_transform(data)\n",
+ "profile_small[\"x_pca\"] = pcaed[:, 0]\n",
+ "profile_small[\"y_pca\"] = pcaed[:, 1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "…(base64-encoded PNG data omitted; the hunk is truncated inside this payload)…"
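The source of execution cell 41 is cut off inside its image payload, but given the x_pca/y_pca columns computed above it presumably redraws the profile in PCA space. A minimal sketch of such a side-by-side comparison, assuming profile_small carries x/y (bhtsne), x_pca/y_pca (PCA) and a numeric color column as built in the cells above; this is hypothetical code, not taken from the notebook:

    from matplotlib import pyplot as plt
    import matplotlib.cm as cm

    def plot_embeddings(df):
        # Assumes df has 'x'/'y' (bhtsne), 'x_pca'/'y_pca' (PCA) and a
        # numeric 'color' column, as in profile_small above.
        mx_color = float(max(df["color"])) or 1.0  # avoid division by zero
        colors = [cm.nipy_spectral(c / mx_color) for c in df["color"]]
        fig, (ax_tsne, ax_pca) = plt.subplots(1, 2, figsize=(10, 4))
        ax_tsne.scatter(df["x"], df["y"], c=colors)
        ax_tsne.set_title("bhtsne")
        ax_pca.scatter(df["x_pca"], df["y_pca"], c=colors)
        ax_pca.set_title("PCA")
        plt.show()

    plot_embeddings(profile_small)

A non-linear embedding such as bhtsne usually separates clusters far more cleanly than the first two principal components, which is presumably what the two elided figures contrast.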
"iVBORw0KGgoAAAANSUhEUgAAAXwAAAD8CAYAAAB0IB+mAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnXecFEX6h5/qnrizEXaB3YUlJwmCgAqICTBiDocieofh\nzGeOd3hGMOKZM4o/D0QFMXCAICZAyTktYVmWzXknz3TX74+ZzbMBWCT1w4fPznSoemem+1vVb731\nlpBSYmBgYGBw7KMcbgMMDAwMDP4cDME3MDAwOE4wBN/AwMDgOMEQfAMDA4PjBEPwDQwMDI4TDME3\nMDAwOE44aMEXQnQQQiwWQmwWQmwSQvwjvL2VEOIHIUR6+G/CwZtrYGBgYHCgiIONwxdCJAPJUsrV\nQogYYBVwKfBXoFhKOVkI8QiQIKV8+GANNjAwMDA4MA66hy+lzJFSrg6/rgC2AKnAJcAn4cM+IdQI\nGBgYGBgcJg66h1+rMCE6Ab8AfYFMKWV8eLsASirf1znnFuAWAIfDMahXr14tZo+BgYHB8cCqVasK\npZRJTR3XYoIvhIgGfgaelVLOEkKU1hR4IUSJlLJRP/7gwYPlypUrW8QeAwMDg+MFIcQqKeXgpo5r\nkSgdIYQZ+Ar4TEo5K7w5L+zfr/Tz57dEXQYGBgYGB0ZLROkI4ENgi5TylRq7vgFuCL++AZhzsHUZ\nGBgYGBw4phYoYzgwHtgghFgb3vYYMBmYKYS4EdgDXN0CdRkYGBgYHCAHLfhSyt8A0cDukQdbvoGB\ngYFBy2DMtD1KkFKyx+kjy+U/3KYYGBgcpbSES8fgELO6yMXYn3eR5fIjge4xNr44qys942yH2zQD\nA4OjCKOHf4RT4gty1rxtpJf78GgSrybZWOphxNyteIP64TbPwMDgKMIQ/COc/+4qpq6uS8Cr6Xyz\nt/Sw2GRgYHB0Ygj+EU6my4dbq9+T9+mSLLfhzzcwMGg+huAf4QxvE0O0qf7PZFYEpyZFHwaLDAwM\njlYMwT/CubB9HD3jbNjU6shXuyoYlhTN0CTHYbTMwMDgaMOI0jnCURXBz+f15OVNufzfzmJURXBj\n90Tu7t2G0CRnAwMDg+ZhCP5RgMOsMnFAKhMHpB5uUwwMDI5iDJeOgYGBwXGCIfgGBgYGxwmG4BsY\nGBgcJxiCb2BgYHCcYAi+gYGBwXGCIfgGBgYGxwmG4B+hvL01n36zN3DBgm2U+gKH2xwDA4NjACMO\n/wjDr2lYP11T9X5jmY+E6eu4oJ2D78/rfRgtMzAwONoxevhHGNE1xL4mc3NdfLQ970+2xsDA4FjC\nEPwjjMacNzcu3fun2WFgYHDsYQj+EYQroB1uEwwMDI5hDME/gjArRjI0AwODQ4ch+EcQe5tYoLy1\n+ej8ucoCPlYW55HvdR9uUwwMjmuMKJ0jgD1OH2MWprOx1NvocTnXDPiTLGoZpJTcv+433tixAZDo\nwHlt0/hq2AVYVfVwm2dgcNxxdHYZjyGklIyct61JsZd/HYxZObp+rinpa3g1fR0BqROQEk1Kvs/d\nw2VL5x5u0wwMjkuOLgU5BllW4CLD2fTatD1nrv4TrGk5XEE/D65bioyw73+5e3AHjfV4DQz+bFpE\n8IUQHwkh8oUQG2ts+7cQYp8QYm34/wUtUdexRp4nQHNic7a7dXZXNP4UcKRQ4HMRPfs96i+9Xs3K\nkoI/zR4DA4MQLdXD/xg4L8L2KVLKAeH/xnN8BE7Zj3Vp39qSfwgtaTnafjO10f0CaGO1/znGGBgY\nVNEigi+l/AUobomyjjdSoizNPrbsKIjTn7JtTUQ3Tk06REXTK7bVn2KPgYFBNYfah3+XEGJ92OWT\nEOkAIcQtQoiVQoiVBQXGY35jPNw3+XCb0Ciby4t5dOOyJo9bdMalf4I1BgYGdTmUgv820AUYAOQA\nL0c6SEr5npRysJRycFJS0iE058jljDZRzTqua5ztEFtyYJT6fZz245cMWTgTn96Y5x7aWOx0i47/\nkywzMDCoySETfCllnpRSk1LqwPvAyYeqrqOdhef2bPKYd05N+xMsOTCuX/4Dy4pycWvBJo89P7nj\nn2CRgYFBJA6Z4AshavofLgM2NnTs8Y5JVdl9Rd8G91/fKY5beh6ZTz8zM7fzbU5GoxE5lcSYzPzz\nhMGH2iQDA4MGaJGZtkKI6cCZQKIQIgt4AjhTCDEAkEAG8PeWqOtYpVOMDfnXwczKKOKptdk4Azqj\nUmJ5fEAqHRzNH9j9M3li3RKe2h45nXNNTEIwJrkTk/oNM9w5BgaHESFlUzEVfx6DBw+WK1euPNxm\nGDQDrxbAPuvdJo8rueQm4i1H5tiDgcGxghBilZSyycdnY6btUUyBz8MD636jx/8+5dRFXzBzbzpS\nSpZpyxjuGk50RTTdnd2Z5p/WovVKKZsl9lahGGJvYHAEYSRPO0op8XsZ+MMMCnwe/LoOlHH98oV8\n4/qJ2an34CaUmXKH3MFtvtsolIXcZ72vRepWvnyzGUdJRvfUmRecxyh1FCZhXGoGBocbo4d/hPDz\n9N+4qdtdXGK5hlt738Pvc1YQCATIzcrF5/XVO/6tnRso8nnDYh/Cp2t8ZnkFt6ydhtiNmyf9TxKQ\nB78Y+vSdm5t97OKOj3K152rau9qzRdty0HUbGBgcHEa36whg0bSfeeu29/G5QwnFsrZmM+mql8k3\n5eISTgRw3V3juP+5f6CEM2b+kLcXrx5h5m3svlDugjoECZIrc+kgOhyUrdeu/rEZR0nAg0s4AXBK\nJ2M8Y9jh2IEQxiIvBgaHC6OHf5CU+Dw8sPY3Hlj7G7le1wGVMe2x6TjdLoooIJd9lFNKIBAkyhOD\n1+3F4/byf69/xptPv1N1jkWoEGnA3d26wXoSReIB2VdJjqcZn08CUoBpD+w8GSRIJPkyn/X6+oOq\nv9FqpeSn4E/c772fJ3xPsF3ffsjqMjA4WjF6+AeIlJLrv/me//NlhDYIeDl9LTd16s37Q0Y2u5xg\nIMjefXvZR2Y4B42knDIgu9ZxHreXqa9M445/3YqiKPj1YEhc63aYd1wIA98DU7X7Jooobjffjl0c\nXMIy2WSWHEBI2NMKdv4DzD7osB5umoBiUnBxYA1iUxTqhQxxDyFDZoRMQPCC/wXesr7F3yx/OyR1\nGrQ8gcBWyl39gMoJfNE4omZis5x/OM06pjB6+AeAruk887cvQ2IvqCW6H2Rs4beC2mItpSQ9389P\nm3OYNn8Fv67ZWbVv4Tc/ksO+sJg2Lqgupwu/L+T28UtJr5+99Xv5Rb1QVl1HXGECJkzEEMO9lnuZ\nbJ18EJ84RIo9upG9YfvTvoe/3Qjj7oGgBfYMgF9vAGCw0vKTrjSpMdA9sErsQ5ZIvHi53Xc7JbKk\nWeU4qcCFs8XtM2iasrJ3KCoVlLt6Uy32AE5c7gtwe946XKYdcxiCv59IKZl21ix+z87l/KmSbmsk\nQq8tuves/bXq9R+7PCRO+ZmeC9/jrI1fcUPZ75y5bS5p
r73JrqxCpr4yDY2mUxIAxMTFYLVZAbg8\ntQsJhZC2JoDJq4MuUfwSk09y4aPdmP/zIoqjiymJLuEZ6zOoomWWFHyt37AIW8Of/9Rn4YT5YPFB\n78VwxgcQtCNWXsVU21QsouUnkC0ILiBbZje6vzF2k86lDKM/relHK67mTLLY09JmGtRB13WKSjtQ\nVCoIytsaPdbju4NQhhaDg8Vw6TRB0Kex+J/LWPX+Bn4tW0QxRcSRgIqZbvRg4OI2ZPWA/3tUoptC\nXf1if2ihkkKnxogZfxDosgmq9Fagq7A3WeP0uZ8wxNf8lZ+SO7RDCEGBz8PPBftYdpUdoUkSsjXa\n7ggSn6vR+xc/XVrHM3hc/wMS+UJvgI0lHtKiraTFqLzLS0zjTVw4OYPzeLTXZHQ5jHs2Lg2fEYBT\nX4b4fbULsnhh6Gew+DaSg1243Nx1v21pDp8EPkFvILGDFy8btY38xfyXiPtdOLmUYZRSVOWuWsFv\nXMYwlrAbC0fmDOejGSkrKC6L3e/zdLkP9SADDgwMwW+SL66cy84Fmczyf4aTcgSCUkpQUMhkJ718\nfTlh8yBO/BnWjAQkjO3QHYBPlpYTSEsHNZKrRmFfK4Wbx53B5tVbaM6M59ROKehScsbiWexwlgEg\nVUFxexOuVgq3PxfgvHuHMvKh4ajm/RP73RVeLv1xJxtKPJgEKAIS2u7ActZzSEsFAN/xOb+xkMW9\nt/CP3idR4HXz2i95PPPOAHDHQdc/4IIXICkjVKjFA8DwTs1f5GV/cGseFmiN9+BfDLyIXdh5zPpY\nvX3fMRMfnlpjExoaTir4gW+4kCtb1F6/9KOgHJdzEopK5xN5jaTmIcT+NxIG9TFcOo2w4fPtpH+X\nQaY/Ax8+TJgB0AgSwI8fHzvZhlsr5+yZoXMUHZ7pNxSAb3cUhNwbDaFo5I8qpWf/Hs0KV7z5oRv5\ndM8WtleUEqj5iCtAjbXQ8buRnPP46fst9ptLPZzw9SbWF7mwbd6M/cfFyMwscnPTKFjyaNVxEomT\ncqbxNgCTv/Pw8rdAUUfwxMOmUfDabChJBk2FLWehCHhgdPPy5+Swj5f4J32IpxMqvYnhZSbipfbS\njtOXV5D06BYcd+RQ8tRcWHZNg8MfPnw87X+aYlnMOlZyN9dxJj0ZThfe4nncEQaSfXjYy+7mfXlh\ndmg7+D7wPZ8HPmd6YDq5em7Vvo/8H5FYkYjVacXmtHGl+0qK5bG/XpCUkqLSBIpKBQcj9qCiiLiW\nMuu45vjrajQTd6GH2WPns56VrGcVccRTTlmt3qBE4sLJelYytPRs7GWSe/d0QBGCXK+LX2J/bryS\noErmnNXcOfFWvB4vsz/5Br/Xx+qla9G12m6KHv264+jXkb/N/29onLZO++DWgmwqPzAReXBFFoHC\nIjpPnIjqrB649KWlsXfiI+in2VHMod56AD+rWUqxS+Otn8rxBmsorVQhYIMfb4VzXoP/3c/InjZO\n7tx4eoUNrOYGLqCQvFrbXTj5D0/zBs/xEh9xGdfx9RoPEz7NxxsIjWVQ0Qa+fSwUCjrsvxHL1wnS\nX2+LojZvrMSKjT4MbNaxM3wzuMZ/Ta1tJkwoKFxnuo5sPZt5+ryqfRoas7RZ7HDvYE3UmmNyXoKU\nEpf7X/gCz7ZIeTGOphP0GTQPQ/AbYP79v5LFHoooxEEsUTiooLxeaKIMh1FqKlz+Jlx5Z8idc8eq\nn5EWX8RJUJWIgGDzM+n8I3A/rdu25qvl02nXvh0rf1vN4zdNJGP7HlRVZcy15/PUO08watl3EcUe\nQNFVhrRqe0Cf9df8Cjo8/Qyq01mraGtmJm0/nIq8JgrCgg/Qjd4szXSiKxr1HhI1C2w8F9ZdxPCU\nNnxzR7tG6/6KT7mX6xs9RkPjXm7gVZ4i79v/4Q3UeYIJRMGCe2Dof+t9NwrQ2hGkubpqxUoXejKc\ns6u2SSkpLCwkMysHTQuw2ZzB+k4/8XX0B+RLLxYF/DXa52B4EP6j4EcR65BItupbWaotZbhpePMM\nO4pwe/+JL/Bci5QVH5OJqhq++5bCEPwG2PDpNlqTRCzxOHCEgw511rOSLWyodayCwBSU9FgrSDin\nHVN3b+bbnN2Nij0AAQ1vrxTUdVkU5BRw/7iH+eznTxh82knM3/odPq8Pk9mEqoYEblVJQeQyJeia\n4Nq0Hgf0WR0rV2AqKa5XtABil6/EYyuqtfVS/985Z8P3+INDIhfoTKJVlMKsv7fDZo7sNcwlmzsZ\ny3J+jbg/Erv9+8h1FwARGhF3PGgmMNXuxcdaQRU0KvgmzNiw48GFDx8bWEU/WjGFaZzDxeTm5pKZ\nlYNPreDOfldQagl9HxJIsIb++jUo8DRcR12CBEmX6Qzn2BJ8KX14fS8edDlWUx7R0W3qlC2ZXTSb\nqXlTCcog17e5nquTrm6xCLTjAUPwI+B3BUCCDTtWrCjVITb0ZwhevOwmvWqbRjjFQaqJ3stnogCB\nZgzCCiFq+Z5XLVmDs8JFdExokLMyBLOSKNUcOZ0CQHovHCZz8z5gHWLeea/hnZrOl074ezSUCBgv\nb2LOjjLK1DJIKISSRNBr3nCSs3rY+WB8Em1i69+IFZRzI5fwOz/tt53On65DczXgy40qxqQKutKT\nbWyrarwc5sbFHkBFRSKrf0eggjJu5lK+1n+nIEPj74PORDOHI6pErT8gobT2MEOTCAT9lf77d9JR\ngC7zoZlhxvW5iNbx39Tb+r/i//FU5lOsc67DL/1Vv9OvZb8ys3Ams3rPOiZdY4cCY9A2AmWZ5VWv\na4o9gBkzfTmp6r1A4MePROeNf+l4dQ13Q6IcAfvmGjHkEmQja8I+3Gtg5MFJKYgqbd/sOmuyZ8ce\niJCcrZK4VhoddZjsgrF+E48FhjA3OzPU8PTeEBJ9oYHQEDYPk8ab+fG+FLokRW58xjD4gMQewL3i\nIghEmi0sYcgXnFV4E5/FTKV9NCRFQduoULRRYwgEybTHRUW9fboOZ69+lBtPHUbQ7K83ya4mgf1c\nVmKAMoCT1JOaPK6IAmYylV/4oXkznQ8zimjD/sqKKqbROl5GFPtP8j7hyi1X8nvF73ikp1aj7NJd\n/FDyA7+V/3awZh83GD38CKz9eEujt5ad6kXHJZIAfuwLh1BWug5qTsKShKJVFK1aKCSgaQhNJ/nl\n+SjBaoHv0KU9MXExDdb7UK9BvLdxDzv1mo2EgDVD8Hd2sbvCR+cYa4PnR+LHbxsbWJY89Eo2ZmCw\nBq32dubML05iRfQeKO8F+zqCCNvfNgdrny1c0u+qBktbxbJaT0b7i7A04DNRglhjS9mcNpXLeQch\nwNqMp3wpBd6fXiOj/a/QbUc9Mffv7k/0oFCyuJbsQJ6rnMvsqNmN2CW5vfhhvo97BdSwwIlQ4/Qh\n3zDEfx5L8pzsc/uYsjGXXa4gJkVwYfs4Xhjcno7R+3cNtCRCWLGYx+EPNL0Gg8O2BpttQIP7Nanx\nwK4HcOvuBo9
x6S7mlcxjRNyIA7L3eMMQ/Ah8O203oTRjke/yYgpqvf/o18+4w7uuOlVxSQLs6QLu\nKPBFgU5IMRxOaJUPmonUt1/AnlM9WUlRFN6Y9WqTtl3WcQgvLc0GXzm4rOBKgk4ezHFB/revjNt7\ntWmyjOYh6dHPw9mXhHq+ui4Y+9Hb7C5OAMUBqGFXTlhZ85OJj1PpHduqwRL/x1cHZokM/RLJp89l\n355BBIM1e5ASdBOiqAME7UiTq/qcBkRaSpCamcKFr+HLHYxdLaZ1t9q/tgyYsHRZ16TQSwlaE5NA\nBRCrmnjL9AkXmy8mWtRPUeHHzyQe5kP9VaQAWlWfK6UgWNoBJaqIv6qXsW/aElCVWgb7dMnMjBIW\nZJez5dK+tIs6MPdeSxAd9THlriKCwe8j7E2jVdxWRDPyOuUH8nHqjae7sAkbieaDSwp4PGEIfh2K\ni/0sz5WcS+jBtJYIhP28a/gDAAsWlrqXMODHmeytzCSZkwJb+4bFUFCV4UwCztjQf0C7+GliP78b\nvz/Aiaf05/mPnyUlrea675FxmBTM8SoBmRDeEqpXFQpR6v576EZdejYvP/oyfl9tN5Siwr/fq26Q\nssuS2V3cMfRGjzADVVdxZqQgpWzQn5rIgUURAYjPriUh4WL2JORCQeX3VOljkXiXXEfOxlGkPHM2\nwuxHiMiir2sKpb88i2vvGaHfSATRvHFIzQqqr/p4RW92r764Cf99rAVuttzAteLaWts1XbKm2M0K\nlvJcq/MQQg/35Kl94ekqnvRzKPv5EbC6IcbdoGup3K/xny15TBrUsIsvPz/I7NlOvF7JhRc66Nat\nZWcUCyGIi/4OXS8kGPwNXYIQbTCbOqMoTV/jlSSYEhBNRD4oQuGapGsaPcagGsOHX4dv5+xjPVG4\nUPmKTvxBG7Jw4MREITZ8WOnNOAYylJ6pfZiVl0G+z0tQ6uBXYGM/0E1U35GRL9jzh3ViZcky1rtW\n8umPHzVL7AHGdW2NKYJjWpdwSVr1BKdSv4/tFSX4dY3teX6e/r6Yf31TzOrM2v76Dp3b84+nr8Ri\n01FNOooqsdh0bn40n869QoOUbr+du754gaYuF5dPNtrbvYq/NnkD10VqAucnV5FZehZrXTnQby0M\n+Q16boQOOwk/PgECvawdpXNvrTo3kmD78wbh2jMKdHPo80gTgbyTcKWPQWrmUO9fglCbzt1SOS7v\nb+TQFAe0s0QzTJwFwIoCJx2/WI/4eCWmaas4+bvN3DHPQvbn8wiWdEOVar1LRqhBok+cHvqcvihw\nNuz204Hf8hruFc+aVUGnTru5774CHnmkkH799vDvfxc2+VkB9jh9/JpXQZG3eYOyipKIxXIpNuul\nWC3D9kvsAWyKjQltJ2BX6j8N2BU7cWocX/X+inaWxkN/I1EcKGajayNurWF30bGIsYh5Hdq3+ZZ9\nBQGiSMJdlUul2gGvINGRjGYeMWIPji/P51OtAHba4Mn+4FDAqYd86ykmGOyA1rUfpCwqfHdHO0af\nEMWB8NH2Au74IxNzWPh1CV+c2ZXz28fh1YLcsnIxM7N2YFYUAnvao+/ohZQCXQebSXDbGbG8dGV1\n3nwpJevXdufHr8uQEs4YU06nHn5AxeW/gvPfvp3NOU3HQp+QbGbTE40f9x6v8CwP1B6AbOASDO5q\nT9lX1+HulAII8NjBHQ1RLohyg6ZAVkfY0bv6JMVP+9f7NCjYgZIu5M75vN52NTqbdpdejXfvcKI6\n/dhgD7rm7aJ5bBS7VXzWyGmf20WBQ7XSno7MZz2vrivikTX1E72NiIErWkFqYgb3970aKep/IVIK\nsl7aUfkOEksatHFCt0Q+PK1Tve2lpRopKbvweGqXb7cLFi5sx7BhkRsSV0Dj6p928WNuOVZF4NMl\nt/ZM4pUhHQ55dExAD3D3zrv5OP/jUFoKTExoO4GLEy9meOxwLErzn06CMsi84nlM3juZFRUrsKpW\nNKnxSPtH+GfaP4/qSJ/mLmJuuHRqkJXlpqTQB1jCYl8/Ml0P//2DU7lIFuBYnw8fnQB7w35Zb6XQ\nSMgIQFYpXBAHySGfqtUEfZItjOx14LnpJ/RI4pK0BOZnl2FRBOelxhEdTqdw26qf+DJrBz5dw+cx\nwfYeoFd/DndA8vYv5YwdEs3gjqHBPSEEffvPI63reeh6HmADVKJsL1Aa+Du7C7NoLHWzEGA3C94Y\n27Qv9RbuYzhnM5XX2ctubFouSb4tDNbhjCBYqBbVGRUp3NUpFbJToahN6L/QQSoQXwT9V0NqJuzo\nRc3fyrdrANZuq+v18KUmCJR2jmiX7otFMfmISdqJVuO8Wv0hKdB9dnx7TqD4w/8gnW2g/W64YQLE\nZVUd1sGUQJLVhlkxcRF/YULgEc5ftJvFufV73ipwSxvoHSUIBFLp6ezP1uh1dX2JBIvr2C1FaO2B\nOijAPSck8X7O+7yT+w5e3cvYxLHc2/5e5s7VMUW4430+nTfe2EPbtnY6d+5ctaraikIXr2zKZVF2\nBcW+IBrg1UJ1vretkB6xNm5rsTGjyJgVM293f5uXurxEQaCAVEsqZqX54xObXJt4P/d9FpUuYrtn\nO5rUqiJ9/FroCfb5rOdJs6ZxQ7sbDslnOJIwBL8GXq+OVQG31vDAY+Wd6CYaiWT3wrSw2DfQOwgC\nv1RgvjoBRZeom3ay+X0nlontaNvWzJNPdeLGG5P2u3fR2mbi2i61V7dyBv1M35uOrzIstLANkYTa\nG5DMXOmsEnwAVe1GfEw6mrYSXZZgMp2KImLpaoOebc1syPbXc9ekxKlEWQT9Ui3884IETkprXnRI\nHwbwEh8SCCyl3HV6vf1CQLY7hnvSr4XSeNh+AiBqDxKXtg5t77UxJHwy/P0pQbyLh6NvjcM2chlK\nVLWDXfqslP8yIYJFGtZ2qwDoHdWGTFlGiSeAd9MZlH7xL0xJGaixhfj29EavK7xZnWHx4wy87mlu\n5wFuFPfUcltJKRkwdzPrSyJHGGnAv7Pg8x5glhZu2fswj/W8iaDwE1SCmHQTEsibV2M9AyEjij3A\np6e34x9ZF7GsYhlePfTZn9v7HF8UfsE9/sXoeqSnBwgGoaysjH379tGhQwem7yripiV78Gh6nSvI\nC6Yi3MEkXtmUd8gFvxKH6sChNj8Jn1tzc+mmS1lUtqjBbKqVuHQXk7ImGYJ/vNG1qwPpsEK5mcan\nyUriKcGHZP6SZvglS3X09/aQxLvkcCeS0IBrdg7ceccu8vICPP54KgBlZUE2b/bQvr2FDh32L7yu\nyOdFrdlwNCAKQoA5nMrZEwyw3VlKh6gYWllsmEz1Z8/Oub0do1/NIbssiCLAH4S7z45l8mWtDugx\nWNeLqXBdQlCrHz/t11Vu23Ad3xcMREOBmIpQttFAnV6drkJuKrTPqNWmCZ+dwP13UBEMEv/iizgm\nfI6wBAju7kDJPU8RyB6E6U4nQVMwlPtHBBAmH/GDXicKB+cXPMALc3Sy
1w4MNzAK/vKwqCk+aL0a\nivuDDN86Eihuy2pROz203+8nGAyyrDSUbroxdvogKCUmIejs6cF/Nn/Od22msytqGx3dXdmx6Uwy\nsgdVn6Bo9VY7swr4x8krmJB9L75NXSA6EeJLIScZr8XPrq67yEr+L4HAOdS9tq1WOOecYFUKibYp\nKdz+eybuyhZezQNrOgTagX0FtA4lz9tX8VekfPOIdIU8uPtBfi7/uUmxryQ/kH+ILToyMAS/BkII\nzrugCzNmNL3yUR/Ws5DLml12HAsp5WwktYXL51eYNCmH++5rx3PP5fDSSzmYzQK3OxQlomkwaJCD\n11/vyKmnNrbiFKTao7GpKm4tPKiWmAfb+tQ7zqIKrjjJzskLP2dFSUFIPH5Igek9wGUmNlYwcWJr\nJkyIJyFBJa2Via1Ptmd5ho/cMo1TOltpF7f/l44udRZqC1nhuYsTtHRGEMlpJjmt1Q7m5IfdkSYt\nNJchYoEKbO8dSqcQDH2vz412YesDixapzH3ocUofeRxh9SM9IReaCT93FcSya5ibH8o2Y0ncSHzf\n6Zii8xhWNoF7Z3ZH2+WoM3u4sj4rFPeFHtNgW40nBYufjRs34vP5EEKEkocpTjZGryTDY8JuGoIr\n2LivueYZ0IKQAAAgAElEQVRweBt/MhOy7gMgoEvGrI+jVqtm9YECZgHjuiTw9tDO/OH8lQs2PYBv\nbXfomQ628OB8dDr4rXi3nMlpJ57II494ef55G5oWurasVrj44gADBoSeCnVdZ2upF58WBHRIeg6i\n54G0gAiCvxMooQbMH/sx7+b259bk6oHyIwEpJVPzpuKXzVtrQiAYFhNpYZ8/D79foqqgqoe28WwR\nwRdCfASMAfKllH3D21oBnwOdgAzgaimbud7cYeTqq5OZMSOdyAvGAkhUAvzBhXgJRNgfGQfrKGcE\nkSJdpK7z5pv5TJmSi9cr8Xpr98xXrnQxatRWVqzoQ+/eDfv+TYrCyycO547Vv4RE3xKA3uthS39s\namWYKPx7TDy3bfsuJPZBAbcPh1J71f7ycnjggSIefriI779P5dxzHQghOKWJrJeNUSbLON19Orv0\nnfiki1ECBkqoO0xoVnR6OnJrb4wvguIk6v0eigZlCaCbsJtg7h3x9G0Th9VqpV32QtYsiCFXdkAL\ni70ig9hxcaZSziunn8XkrV6eL1pCxfYRuPI783HueUizDp5KsY/0+wtI/TEs+BJUjR5t3fh8IYGV\nUjIrZi7/1+VZND10e7XuJWDRS7jyBkUoDzpaQKnRS5bIKrdQTJSdfbd3xBUUFHgl7RwKed4g7R1m\nokzVjdLr2a/jDnig855qsYfQ5WbxoemFXL7VzpujA8wcpLFggRmvF848M0jv3tW94MIyyWUrhuDR\n34fYWRC9ABQ/EBZP646qYzXh4bm9zx15go/EpzeSlrwGAoFDcTC588EvAXogTJpUxNNPF+PxSBQF\nLr00mk8+aUd09KEJoGypHv7HwBtAzel1jwCLpJSThRCPhN8/3EL1HTIuuCAegWhkGrtAo3V42Kf5\ngh8kDpUSgrSut0/T4eOPC3C5Gn789Hp1Jk/O5tVXOzJtWiHr1rkZNMjBRRcl8Omn5cycWU55eZCR\nI+OZMfE8Xty5ij1uJ8OHRHPHFa3YuMNMQJOM6ReFxeHj4e/yYGcMPHoKDeUM0DS48sps8vK6EhXV\nvAvQF5C8tLCUD3+rIKBJxg6O5p8XJvCQeIit+lb8hFIUrAHMEb5in6ayoqyOn7zHFliREO7pV9oh\nQ8nSvMAPIL2wuqfCmXeHxl8SUuIZrX7HukA/0tX+6Kh01LYxxLaWNmmXMyNzO09vWo1b7xUqLpgQ\nKruwCTeaNEH8doTViWKvQO+1meeTQ1FCq12Sz7x72XniswiTDwVfpaUkjLoP14z/gVYzMiv0BTze\ntRiPYkeRKhZpRSCw2+306NEDU3iU1WqBVuFT46z1b9uCQAFUxIA9gvtIAVK3U1YguHM3zOut87e/\nhQRchtOv6jr4fPDg7F/wXLwbrJsgfgYodSYZiNohmfn+I88VogiFobFDWVK+pOFjwgvRXNb6Mp7s\n+CQ9o3r+iRaGeOSRfJ5/vrTqva7DrFlOsrOzWLYs7ZDU2WJhmUKITsB3NXr424AzpZQ5Qohk4Ccp\nZaPf6pEQlgkwd24pF164vYG9MYQGDv3A/sTwVjYONWP0QVUC3HpbKrNmlZCT03gD0rOnlby8IGVl\nGlKqgB1q5fqpLnfsWAfTp6dGLOe3wmxGLJgN40bVO68uDgf8978pXHxx4+4kCInHOf/JZclOL55w\nchmrCbokmsn6xwAqTLXz9f9Hg0uByqE4TRdUaDaGL32cPH+dJGluO6w+BXz2kE6WhFuNGq5zm02w\nbV0H0jqYqCh0cn/nJ/A6a/f0rG2iKPhiBHPy9tQuP9Am5K7IcEBuIxFUtnws155PQtv2qCnbOC/3\nGm7Pv5v38ySfFIDlxHeJ7fcxok7ufd0fRcmyR5HWPKLarUf3x2OKy8CSuAFFkXR198aq28hx7OVT\nZR5f8xkVlHEul3E2F6A0MQdiStYUHt0xEV8gULuHX0lGN9Cm41DgxY5wcnToN5dSkikz2VyYzwcP\nDWDP3TdBr02hBlAEQG3cvTkkegjLBy6veu/x6Ggah6yH2lzWu9Zz2rrT8GieqnTVEJqZOyJuBNe3\nvZ6rEq/CqhyeNBRut05c3A6CEaY0qCqsX9+RE05ovm1HQlhmWyllTvh1LkSeZimEuAW4BSAt7dC0\navvLBRfE8+GHnbjxxow6e2r2MPf3gq7pu69sZANcekkcr77akXXrXI0KvqpCYWGQ0lItXFZlT7HG\nbN4azJjh4vLLS7jqqgTq0iM6Hr7pWOP8hpFSEAw2r1OwPMPHsl3VYg/gC8LekiCBjWfBgNqpFe5V\nYJuEv0uIkYKfC/ryVPpl9cUeSCqMp+ADe3U0TiRbdcm337u449Y4YhKjeXDBHbz5l49wFbuRUuLo\nHM8H/26Du67YA5gKwd8RYoOQqxP6fSv/hlF8MOQZWvUrxWwP9czcUbns80umFoBfgs3sBCXCXSx0\n4oc+jRLOuFl3nHOnY0vV6zFU37ff8jlDOZMPmINKwwmCbkm+hfdz32fLUhv02gz2GqLvtcK+G6uy\nSrvDQVxSSvbKvVzhvSLUMj/UEXxhkTGFva+RPJsFbUBCVDsnr3R5BYDc3CATJuSycKEbKWHgQCtT\np7ajT5/DI6j9Hf3ZMmgLb2S/wWrnatpb23NV4lWMThh9RKRT3rkzQEN9bSkhPT2wX4LfXP6UQVsp\npRQicsiIlPI94D0I9fD/DHvSN+1g4ZwfUVWV864cTVrX+g3NhAlt6NXLxllnbcPvD5llNmskJQXI\ny7OgNT8hZgQEITGx4PZacLk0fv898uSdqjMEFBVVVlrtb68urz5jxxZEFPw2tijaZSWTG+Gcuui6\nZOTI5k0QW57hQ4sQ9uf0SXp
mXsaOAV/XynaoC3hLwFuAIiX2giG4PEn1zh9d0p+fn01tVOwBghrk\n5VcP1HUf2oUpe54me2seqklhfPZS3PlZDZytg1oMCa0g2gZOQZXoqz4QEsvw10i84nvUGuOvW827\nWeEEkwgJvifrNBw95iDMtV0rQvEj9iNdQyVuXCzjJ37gG85rJEjAoTpYMXAF7yS9x7PffUtJj2Wh\nOQs+G6aiEcRcvgzNvRN/xkhOjO6MX+gskPN50ftSqABzEJJyYe5lkLa72jUkBfjNYPVDUIWABZ59\nHrI7smCzneFxp6JpkhEj9pKREajqsa5c6eO00/ayc2dnWrU6PAKbak1lUudJh6XupkhOblx6+/Zt\n2XQXlRxKwc8TQiTXcOkcEc6+1598i/ef/5CAP4hQBK//+y0eevE+xt85rt6xw4bFUlY2iNmzS9i9\n28fAgVGce24cmgZRUasjPo41n1DPcd06N3PmlDbZgAwbFs0vvzipn+GnYXSdBnPbXH9ae15YUhrh\nrNp88EE74uKad8N2bGXCbBK1lz0kNClrfNszeBKlluDXslXAPwa1o2POmXyyewtuXWNMcif+2Xsw\n/Qdm42/GdH4tKHlxcjq638kzz4TGAYQQpPZuhyZ1Fq/b1/DJAhTbbvDo6CdokOuAAguWoJlTO8Vi\nvnk82+suWampbF8zns9rZCbw7RuCd9+p2FJ/RzF7kLoI5ekRGopoXohgXZxBF3ft/hfe7PdJs3Rl\naGIydrOH0QmjGRE7AiEEmqZRmFXIGYUjOGl4f55Lu58tcetCDQ3fV49I9f+ICYqOlOApAteWGhWp\nGuztDBsHQr9VoWigb6+C0lbQZy1kdoEvrw/NPYhysnBBOcP/AosWucnLC9a6H6QMRZ5Mm1bGPfc0\nNq/l+CQxUWXMmCjmzKnvFh42zEbXrkef4H8D3ABMDv+dcwjrahbbNmzn/ec/xOsJP+5qEAwEeeHB\nlxl1ydkkd6gfU2+zKVxzTe2BVkUJPbKuWHHweTjatjXx3HP7Gny8q6RadPfvIcjjkURF1Rf8886N\n5oXnyxo998UX4xg3LrbZdZ3fN4o4m4LTF0RW9cYlqiq4bWgyr+pxFNJw3pYELYHXNo1lc/lmFBQS\nOB1fj6/xNzU2XvnlBYrxakFeeWUfORfPYHbvF3Hi5DT1NJ5Xp6A38SW/fdLp3NK1Lwt2FrG7MMjp\naQn0Tg7deBqLeJeXeJNJuHDSJpCMY/njZO8eTLqUsCgI3wSgAooSn8D+zDKizlmE1Cy40i8k6dy7\nmvkt1ibog7zVoAe3gLKJLL+FpdkmCCYxxTaFi+Mu5jEew+fzIdFZHfM7z3S/G6i9FkDVy3CjIwRY\n63rOpAI7e8O8K2DK9dBnPezqBbPrd4YImti6K7RuxK5dgYidH7dbsm1b8wMbjjemT09h3Lgc5sxx\noeuh3+SSSxx8/nnKIauzRUZWhBDTgWVATyFElhDiRkJCP1oIkQ6MCr8/rMz/8gcC/vpXphAKi775\nab/Kuvvu/U/YFEJCeDKI1QqdO1vZtq3pELJLLql0zeyf4NvtkZ8Ghg+PwtHIxEVVhQce2L/slmZV\ncMUlBYjYspCwCC2UEvqkZTiFm35KvwbPtWHjwe8eZFP5pqqspIvzF5MyK4Wrr4zC2qA7U4egE7w5\noIXcEB5/kE++zKCEEgIEWKwtZqR/BGmOhl1THe3R3NK1LwDndG3N309pWyX2EFoV63YeZhOlZBBk\nuXkv15SOQZvqg3+XQ958uHEKXPNfCJbiuXsYRa8+TfFvTxLcNwDNXT86qxY1J49JQSd3D7q4elG+\nU0EPhj8nhEIkhRsQXK5czr3ee/H6QpE0QRHklS6PNRR0Vb/KGlpsEVY6aX148qrRLP+lJ9efFc5T\n32ct2CO4GxWd8n2hzsCAAVaUCEricAhOPvnAQ3mPdex2hVmzUikv70ZWVmcCge7Mnp2KxXLoYvFb\npIcvpWwoP+nIlii/pVDq5BCvSaXbo7w8wDuvbefrL/cSF2fh73d155Ir2tdzi1xzTWsmTdrH5s31\nxXro0CjeeqsTgwdvrnLVqPg5gcWkshkFjRJSOOW6a/jPR027VQCuvbY1t92WQSAA1erQ+IXx6KMJ\nDc6CtFgEX32VykUX7QuXWU1iIuTmdmuWXTWpCPh5L2cl+uBgyO8rBVj9+ITCC1tX82z/ZxnhGVHP\nraOg0HtNb9Zoa+qV6dJcxFzyCV3nXk1mZhCnS2IOBzpdcoGJ7+dk4gnWcZUIHU2tPenGh49T+u8k\n94/21aknwnSNimXNOX/Zr8/66ltZ3HvvLlDd8NYdkJwDUR7wWuCGaXDfy/BVXxhoQgkIlMwu0Lv2\nmsQ1HzisOSk8WvYY/dxDakXkOKWT+5T7WKPX+G4EYM7kb6a/oYjqYzdHr4mYeC0iEnqVnUyFshmz\nYub6pOt5vN/j+Pp5+eijcma9eycEJkC/lRBbAgETBMOtrsUD7TNY9OEJrL/Fxymn2Bg0yMby5R68\n4ShOsznkthg7tuHMngYhHA4Fh+PPiWo6rtIjn3flOZjN9ds4KSWjLj0btzvImUN+4KVnN7NxXRlL\nfing1r/+wb8eWlfvHFUVrF7djyefTCEx0YTdLhg4MIqlS3uzdGlfBgyI5qmnUkDRAckQviKVzaho\nCKAV2Wz7+A0ssulZvYoCdrvK2LGVvcSmXEkSCKKqPkpKGvZ9n3uug8zMLrz0UiI33RTLa68l4vV2\npaCgB+oB5NbfWlGCuVKALIHQQB8QkDo/F2Qz1DSUt61vY8eOBQsqKp3pzC7HLjbv29xgufOLv2bN\nsvbcMXMNbVZdiWXHKQzfcys3/Sc7sivMrMFlP9Xa5MdPbqvfWD36L1zfsSe9Y+K5OLkzG88Zy44L\nryfG3PyIiPLyIA/dvzsUmXvNdGifFRJ7AJs/9HriU1CgIzQdmzPAmDPOIHpdYlX65Uq7vUu6c82/\nfmJG9jec6D4FBQVR41+MiOF16+vEERcaNP11JLz2KEy/Eemq/Ygmkc1+ADxbXMii5N/JHZDLslbL\nGF82nswdmUyeXMykSQGc2XGhdQd+HAOFbeGvr8NjD8FfPoQTl8OergQDoXTLQgjmzUvlnnsSaNtW\npVUrhfHjY1m+PA27/biSmCOe4yq1QrcTunLnE7fz+r/fCg9mAggmvvEYbVPa8MHb6eRkZqN4N+FA\nx0sP3K5E3n09nTvu7UlySu34bKtVYeLE9kycWH+xie35Xp6pyINb7cRsyyFucT5qnZ6l1IKksY50\nhjdqd8+eocfiSZM6sGBBGaWlGj5fkLox/eFSUSinFR/w2uThfPZ//Vi7rj+xERYUB2jXzsT997fM\noFqq3YE/wnq+AujiCPX0brbczDjzOFZpq0gQCZwgTmDM4jGNzozsENWBL/gvrw++BXe4sfuJH1hu\nX8KDM77kxbExgAyJqJAEH/4/tN4ZtcowY2aAMoATYlvxycmjD+pzLllSjqzUsVGLqhq2
WrQqgT65\ntM1oxQVvrcbktGG64QWyzjoR+rtD/vLfTQzL8HP1JFvVtRgJCxZGa2P48s6LYXd38DgQFh+v7hNM\nnFi9aMsJzoEN5k+qyUX8hSn+aezM3ElZWfU4jssFH3xgxeerEwEmJJS3guvfD82HcMbBLV8iypMw\nhXMy2e0KkyYlMWlS/SirgyXXn8sXBV/g0l1c2OpC+jkadg0aNM5xJfgAtzx8I+dddQ6L5ixGNamc\nc9lI2rUP+eNnvDeLaO90QCLQiWExPrrh1sawfFkhl1zRdE54CD0xnPFmOp7SWJAqDrEbqSrUzeOk\nohHbjOClN98MxcynplrYsqU/H3xQwNSpRWzZUlP0Qzd6DItJ4gMUvBBcgj+jE++9+w4PPHhgi5zv\nDyn2aEa17cAPeXtruU3sqomHe1WnFYgSUYwwjeC/u/9Lv6VN37wXpF7A/f77q8S+Ejdufh31Ajt2\nzGP27EICAcnFF7fmvuSXWaDZ8FI9S9SKlXss97TAp4SYGLX60TjQwC0kJENdGZzyzFb8tgCFlkQ2\nTOgH0TooYb/2yXDH+KZ93AJB9ldnw86eoYlngPRbmT9f8vjjPlQ1NOBnkVbu2/UcL3Z9mCBBdKHV\nLIQU0viW5cR449i0ZRO6XvuC3LVLwWQKzbitRdACa04JvY7ygDkAt76IecqLXH31oXHZ6FJnfsl8\npuybwqLSRSgoSCRPZT7FjW1v5LWurx2RSduOdIwFUMIU5hUyPHkUUoZcIAoqEomCQhQOnOhsCvxa\nNdW9Ju9+upJFU9bi2FuOA5123RP5KLEDu1NCoYHRJXmc9tUbqFptZ7mGyg5OZQdDG7Sre3cL27dH\nXuh5374AL7xQyMKFHoK+QvSdEyL09yGh23iWpx+6rBa6lMzet5NP92wDoNjn5Y/iPFRFIdpk5s2B\np3NVh+61zrny5yv5Kqt5a9z2jOvJ7jN2h9Iy1CGWWMpiakcbeaWXh3wP8WHgQzx4OFk5mTdtbzJI\njZzLZn/RdUlqh9/JzQ7AlV/Abe+E4tgr0QRUJGIatxCtwoKqBAkO1+Faf2ipgRr81gdsEVYwq8s5\nf/FSvLN+7/n883089ZQfRHX+nWJTIT+1/p5dUVtYEr+QBJHICrEPT9DDHxV/4MxxkuZOqyeY2dmC\nq65y1OnhA+hw2o/w3B3Vm1wOpmzdyz331J/ncbD4dB+jN4zm9/LfCURIX+JQovi2z3ecFX9Wi9d9\ntHIkzLQ9otA1HamDao7sU5z/xUKQWi0fahANDZ0Kyogmhn7mwWyRa2udd+u4zzB9vosUTa8S25xS\nH5zdGaHrSEXBmdCWvd0H0iF9NWo4k6WOQMNEJic2aHPv3jY2bWq4B5ySYuKii6JQFD8bFs8lwvxR\nBODKnsfW3Hvp0caM0gxx2R+klFy9bB7/y91TlaUzSjUxvmMvJvYZQqrdgSpqf+dri9c2W+wBCj2F\nmDFHFPxUUT99hE3YeM32Gv+x/ifUaIuW9SMriuCH+f05c+Q6in47G+54M9SyaioETaFMm35BsPsa\nWD2UoG6B9p56Yg+h9XJszTCvzFQI1Bf8xYstFBb6SUqq/l1bBRO5PO8GJJIHxfN069aNN8vf5LE9\nj2ERFgJagESRyOvW12mvVD/5paRI+vTR2LBBJRCocZ3YvDD2o1r1Crv3kIg9wHs577GiYkVEsQdw\n6W4+zfvIEPwD4JgfUfGU+phx7Y9MtH/Mv2xTeWf4N+Rtrp+0c8V3q1FRSSWNVDoSQxwxxGIOL3Po\nxo1GkIzt1bK6dtNezJ/vwlxD7IvjY3n3hqvY3ak9skas2qbTL2PHSSPwO0QohTrtWcI4/NQPFUxJ\nEWze3JfNm/sTCAR565l3GN39Aq44ZSwLZi9ESomuSy6/fAeXXprOq6/msWF95GgfCbgE9P53FnH3\nZvDd+sZn9O4vvxRm811ORnVKZsCtBfl0zzZcwUA9sQd4euPT+1XH6W1O53bz7UTV+a5s2Oip9OQp\n31Ok6+n1zivTyijXyverrqbI9+TzR8EfpHT3kZc9lL99mA07+8Bd/wdvPgJPToHLfoG/fhca3Kwk\nS4UIi51/XViZwKxh2rVrh8NSHc5biRCSNm0kiQ0sNCYIpWpeXLKYf+75J17dS7lWjgcPWTKLu313\n16v7pZc8nHSShsUiURQJjgq47wnov6rWcSfFRH7qbAk+yf8Er2x8ZfiAtumQ1X8sc0z38CsqAlza\n43uWF+hADH3wM2xpPu8M+5b7068iOsmOp8TLzMvmsvLnNXSgE25c5JNblS2z8jFZR0MgeP+ZD3l2\n2lMEg5KJ96wnU+tCKXZ0FNpRzsaLBqFVOlVrIgQ7Bo1k55hYTv86h6vPO5k/JkX2365c2Z/kZCtu\nt5sB0SfXiry48/J7OOfyUZx17UR++KGsKsNmqRxGLAvrlSVNFnLOfQwIpTi4+K080p9qT9c2LTOT\n7/PMXfg0rd54o1/X+WLvbib2qT8g7Aw0HZlUiVkx8/SJT9PL2gsNjXcC71TF6WtozNHm8L32PZP9\nk5lincLfLX9nh2cH47eNZ5UzJFInOk7kybQnOSv+LOzq/i8t6Ql6mLx+Mi9vehmX5kJBQVVUbu5+\nM+cMHsHUUY/CxpNgfY3FY4QGi8ZUv//DDJf7QilCK8fPA/CL18rflEBE0Y+JiaFTp04sXuzEv7MV\n4ANC0URRUQKLRfLii54m0zVMLZ+KW689/iGRFMgCtslt9BK9qrbHxsLrr3uYObMts2b5sfUqZts5\n/6sVSBurxjKv77ymv7gwQRnkm6Jv+KXsF9KsaYxvM54kS8ODuybRuCyZgavijvhM60ckx6zg67ok\nufW3uAMhTzzACmzsxsyNbicrP9jGmY8OYNa189m7LIcAfiC6ltgDtV7HkUBpbgVFRQF6915JYYED\nSWUWSUFWmySk2Vpf7KsKU5HZQznp0VjuuyoRLSGbhx/OqgrRUxT4+uvuJCdbCQaDDHCcHLGYBbMW\nkl4xvlY6ZT+dcDKQaLEeZOj2lCYr5T1H4k/qQmgxPYFE4fFvSphx0/5NqmqIPc4ANQeNqxFsL4+8\nAMXN3W5mQe6CJsselDCId095lz7xoUVcXra9zLPWZ5kXnMc477iqQdxARTKBgkHcZZ7D6V3O4cx1\nwykMFFatdrTSuZILN1+IyWlisHUw41LHMb7zeOIsoammLpeL3Nxc/H4/UVFR6LqOx+PBbrfzbv67\nPL/l+Vp26ejous6H6R+SZG8DG6+mOmwnjFRDqQgq8Qt42gHjvNA/iAAu65DAB2d1xKx4Wbl3JRaP\nhSRbEp07dq41VjRjRgFeN0AZYCI21sIDD5gYOVJrZEJaiJiYGEo8kcVRRcUlQk98ZrOZ6Oho4uPj\niYuLx+XycvLJGqedlkZSWzdT9k1hrWstZ8SdwU1tb6pa97YpXJqLEetGkO5Jx6k7sQs7T2Q+wQ99\nf+DU2FMjnnNTu5tY71yPR0ZeKeyyODg95kA
nPh7fHLOCf/LJi3AFaqf60xHko7ImYKL9t5kMubEH\nGYv3ofl1QODF02gufAfRDBt1KuPGbaGgIEBdj5gs15uIgxbYzYK0VqGv/cEHU3jggWTWrHEBggED\nolCU0GP45UOubrAUHZUVP3wOXESoQhUwkZtyL/YBGcTsCOV8qehxJp6U/uHvQIW228AfxZrMzg2W\nvb/0iWnP3LwtEff1io58U17Z8Ur6bejHhrIN9fZZsbJ49OL/Z++846Oo1jf+nbI1m95IIYQOoXdR\nr0oTBRTFLnIt16ui13vt+rOiXlHsBQVR7BUFQb2K0gSpUqSXQCCBhPSe7Ttzfn9s2mY3Db1F5fl8\nAsnszJkzszPPec973vd5GZkQeiHbLJlZpa3CidN/6Ttvg5xJgMAr6ww4+CZqnCO4tJ0OPpePjWUb\n2VKwhRk7Z7Bm9BqkIgmHo8H6dTgceHUvq6tXk+XKItmQzADLAHY4g3Mx3LqbufvmgnRpM9974MAf\n7lW4JSyZOybHEB/vfwa+LP2SK/df2aDgWAkLYxYyNnps/XFGo4Qk1cXu+7DbfZxxhhWTqWXTPjo6\nmvT0dM7ZfxHrP07Dt3kgJBbCxQuh62E0WWNClwnEWeMwm/2zzUOHPAwalE15uYYkgccDt90WxcyZ\nd51QVMzzuc+zz7Gv3kXjFE7Q4PL9l3Nk2JGQbV6beC3fln3LV6VfBfjxFWB6HMxICsNi/nu7+3IS\nv2PC37q1ed/tUsJwbKjkoiIHTreGhpkoEvDQvKvBiAkdnXUvbmFFYSIhlz9crUc8qTJcNaIhlE2S\nJAYPDtSaf+2fr7N/e2g9fh+RHONufDQue6f5f8LB2XEgzo7N+FeLusItlxDx1b9a7WdbMa1zCi8e\nSMerNF4yljBoXbi0U/NW2PYJ23lx/4vMOTgHXehc1fkq7su4D4uhdZeLAYN/YC4YCUfP9ZceBNDB\nKxXh1eyhaifWP+0+4aPMU8Y1a65hTtqcgN1qtBp+rPqRY55jfFH2BQ7dgVVuXpKhwluOUVVqFVWb\nJ8QJE6x89VVygGV83H2cy/dfjlMPtGQn751M7vBcog3+RdGrr07kgw+KcDj8g5imQWamzqBBckjC\nTE5OJjExEVmWKSnx8ur4YWjFvf3Sx7IPlo/FOGMWz/75IlJjGhZthRBMnJjHsWO+gIS2V16pYORI\nS+g0tscAACAASURBVJtqIjTFh8UfhvTHF3uLyXJl0c0SnNGtSAoLMxaypXoLnxe/xUH7e/Q2ebkw\nykC60YfJeC1Gw+Xt7stJ/E4XbZ3OugXEZkrUAWuxsGn2TjRMCCCC+FqrMPRLG0cCBlQq82sQTeKX\nA7CtErx6Qypl7f+qDOmxKstuSyLO1rz6pMPu4PUn32z280Km4iMaQmmjH3BDYQsFsyUd4o/Qq09z\nEsHtR7/oMG7vmYHF1x/Jm47k7YzF14/7M3rRI6J58pZlmTsy7uDg5INkXZDFowMebRPZA1xpuBIz\nZr9lrzUhY080IdNvBTQO8hEItjq2oonARDGDZODc6HO5Ov5qFvVYRE9zT8p8Zc0WIBkZP5Jt29II\n9dykp8ssX56MED1YvDgRh8OBr5HK2NLcpTxheIJ5pnlcql6KqdY/LyGxsLQhiunUUyO5665UzGYZ\ni0XGZpOx2wlJ9rIsYzab6weWWbOOUVzkQ9Tp3OsquM1YnnmEGxOnBxy7Z4+HvDxf0O2z2wWzZ7dN\nAqQpDJIh5HaBaPazOgwNH8pTXV7j875VPNJlGX2j3iIqIhOb9ZWTMfgniN+lhe92tyZDK6EDG17f\ni8CMBHSkF3vZRAwJlFGMQKt37UQQRTWVpNCJ3fRDb2mc3GGHYh/0CQOTjJpi4tUr4hjd20LXeLXV\nB/Vo1jFUgwIheFsg4yCDFjV0FpfCjaGSrAR02gqyTkJc24o7txWzBnfm4rQ4Ps0pQQKuSI9ncGzb\nrEEhqvF4v0SIagzq2ShKl1aPGawM5n7j/Tysm5o4bhaA6xV/cTEDDeaMDvgIipKpC8FtjLoKSHX/\n/zPtn0zcPzFkP0yyieeHP0+fWDNC9ODuu4tYudLB2LFWZs1K8J9a18nK8me01hU4j4uLQ1VV+lT0\nQVX8r2BvuTfnK+dznfs6vMJLpS8wt+DRR9O55ppEPlySz2ZHPvahLnTCgp5EIQTh4Q0zyC+/LK2v\n59AYmkfmwAEHGRkN8gzV1TpKM7ZIZeWJFYC4ocMN3Jd9X8CisYREV3NXOpk7tXBkAyRJxqCecULn\nP4lA/C4JPzy8ZcuhDscxUyeIbMLCedyAVkv0+9nEQbZhJgwPbpJJxUACu+mNQEaqNRcFKhK+2oXh\n2tt53A3H3ZhjFbbuHEZGctsjYhKSE/C4Q8cfe0igTTKImgAlhE9D9YGucHHy4Fab+H7xMv5+yZ3o\ntcJkkTERPPDifVww7fyQ+w+LC2dYXPuyLr3eVVTZ69rTAIHZdDthlpmtHvuA6QGs3fZwb0kRXn01\nSM8BNf7bUwRE4i8KJgF2/OudTZCsdOHV44u4OfnCZqsghcvhdDJ2IscTmOVwStwpvHP6O/SMaqja\n+cwzCUHHHzt2jMrKSoQQ9ZE4JSUlCCFQG71+FslCmpzGeGU8y8VyxkePD2rrgKWIJ9O+x6frfJMv\n6JTSly5GK6ba0Fdd1lnRYQn3qdcwmSu5lluJigr9ivt8EBkZ+NngwaZaWesm9Qws0gln1E5Pns6y\nimWsqFiBLnQMsgGLbOHz3p+fUHsn8cvwu820laRFrewhuI5y4glNoQLdn0uDl0NsIJxIcklhHcNR\nOUwSr+OkBx5SUCkijH0c52ZcdK09P+TmDic5uf3ysFcMvpqtP2+j7sWr8w6XMoEyJjbT40YYbINh\nUcHbO+yjW698Dl5+fYuH91L7o2uhZ0n3PH0n1999bavX0BqEcFFemYig6VpLGBG2f2FQz2y1DZ8u\n6L14ItnO5fjaUVAeUfvjUwGVB1Lu4oKYySFnXy7dxeWZlxOhRLDXtZdeEb34aeJPhJtaJ0AhBD//\n/HOrcfaNcUA/wJqoNcztPjdg+7ZCJ6f/+D5O0TA7MyBxQWQif03oTFVEJu/Hv8q2cH/hbhNmutKL\nqZ9+xfS/ZAdEdKkqjBgRwdq1wWs9CxZUce21hbjdAk3zSxx36WJgw4a0X6TouKV6CxuqNpBiSmFS\nzCSM8r+nwMcfFX/4TNsPPhjCVVdtbXGfOgMwFCTkWllxIxH4ydOEG4FEPAtQcGNjF9AQaRLPAvLV\n++nQwcAXX/Q5IbI/sDYLy/4o4kmkhMJ6t5Jf/7KNi2YdQsTqST56W1PZfvGEZg9zuVz0t7T8zLzw\n4Mv8+e9TMZp+2Qvr9ixAhKh+JYSD7cXPkS/SOLdDGga5+fWONUdXketa1T6yh4YvXfIBPl4vmsuk\n6ImoBLrcNKGR58lDQ2Nm2iyGDhhESljowvChoOt6u8geoIfcg9NNp6NpGkqtf+XhH0t5avUxvClA\nuE
ZE76XYuq5FUt2sKurBjoLxiC5zcDXyA7pxkc0hIi79kelbRjB7dh4mk4zPJ+ja1cLnn2eEPP+l\nl0bQp4+JuXMryMvTmDQpjCuvDMfclnTgFjA0fChDw1vloxOGR/ewunI1XuHljMgzsCntX2D+I+B3\nS/hTp3aiulpj+vTtQZ+ZTDJut48SFMJq5YrbgnhKMeHC2Ew1WDPH2Lq5J/0GxJ/wotLKOT/ic2l0\npRed6U4NNRzjCFmMwRcqN78pZCA1eD9VUlk9vS9GNXS/fD5fq2QP4PV42bntKBu3Wigr83HDDYl0\n6ND2gU0XVdTYL8XrWwlNiFrXJLKWdWflWhsL+IK/nRnJmgmXkB7WUHXL5/NRU1ODEIIvMr/AK06w\nolIjHi7xlTDt4I3M6zKXcNUItYldHt1DobeQL3p+gdloaRfZAyiKgslkwh2kRtY8JCQKi4o5nF/C\n7GWRLJyh+b1dWEA6E3XofsK+3oxi9keUmTvsxZ5wACvBbkwHNayVlvPMM1dw112pbNlSTXKyiYED\nw1p8Pvv0MfHKK79OnsZ/Aj9W/sj5e86vD8X1CR9v93ibS+ObD23+o+J3S/gAN93UhRtv7My2beW8\n+GIW27ZV0KdPBFeMjuaq6ds5iIFUtDbfBAkYw2r2YEIi9EscEeb8RREENWX2ejKSUYggkj4MJMpg\n4Qff8FYLeXN1UsjEr5gwmfjw5q3lZ+97odW+eYmmgjMYduox6qKEHnkkh6QkhaNHT62Xym0JNfar\n8Pp+oCnZu6uNvDfxWsoOx+CxGznNXI32Xg1TX1zMuuv+DEBhYSF5ef66tEIIcu25zdbIbRHBKgUc\ncmUx+qeNvNRrFMOjNQyyiqqonBp+KgKJxIRg/3xbkJaWRlZWVpAyZbNdE1BRDuedZ66VwK6fjoAA\n3+ZeFJ3+IcmZ5/i3yv65nydEIoABI0n4FV4TE41MnNhK1a3fIGq0GibumUi1Vh2w/ZrMaxgWPozO\n5l8v5+T3gN9lWGZjSJLEkCExvP/+MPbsGceCBSM4mmNnKC4OY2Q1ltrlwgaI+h+Bq4mP2Us2Eu6Q\neTZmq4nUzu2zApsi7axBHFF7cZR0tEahl+lqIQsX9MTQzHr0gAEKBwtPwWwNJnVZgnG9m48lB1j4\n9hctfu6iEzk8RAVjaRoSmp+vMWDA5haPB9D1Ery+7yHEYLn22TMoyYzHazchIWF0galGkPbEMTaX\nVFBRVUVObl794udjeY+x2LG41XMGQYAs5BCJwT7wjOShfV52V0m4NPDhj3OPjYkmMfHELN6IiAhS\nU9suTS1JEB0NM2a4aS6sWDuYjmd7w2KxJPvXnJpGHMmaTJ+dwzh06BBFRUVkZ2eze/dudu3aRU5O\nDt6mpc6awO3WKSnR0PX/nXW+plhSGrpUtiY03i98/z/cm/99/K4t/Oaw66iLcAThCMqReZ1IBuGm\nBx7MtV7zMHQEOuWNNCh9qKwP+z80+xY68DICUf+KybLMzDcfDymf3BRCCDx2HwaL4i+7WIs7/36E\nl2ZbkMQpte0KxvINcRQzeHI/Lrg4ifxRcTz9dA7z5hXgdOoMGhTOxx/3Ij3dH8P+wXUJ/PmdYpwe\n/3UYFLCZZB4/vzVlw5at80KmIlpwKe3d68LhcGO1Np/rL0Qp/njJQMI/+F13Nr02Et0XOJDIAuJz\nNM5YsA1XuBEJyDBLXBi9iy+rv2zlekLADETCdbbrePvAu2i6hn/wkqH6UW4aaGPOWH9dW6fTicfj\nwWKxYDS2fb2i1OVjW5mdDhYD/aKt/plIbui8B03zy2mEkF3izDNbnhF4tvTDONAvR42A7lJvZF1l\nSP7pnFU6EUWoCCGI0eKorKwMKHQC/kih0tJSMjIy6rNs6+D1Cu68s5g336xE1yEqSuaFF+K54oq2\nF7T/T6HKV4VPBFd18wgPZb6y/0KP/rfxhyT8julhbJZVLtGr2Y6RODR8QBYGeuEhHB0HZZRzrF4l\nRkNlHWOotIcBZ3CYfsSwgDB+xmBO4KVP7mL85GEtnxjYvegIX9+2kep8B6pZ4dRb+zDu8SGs+sHO\nK7PL0YRKYwt6JedwMR+wbfEuqktriI21MWtWN2bNCl1z9qLBNlKjVWZ9V8HhEh9ndDPx1/4yHUwa\nhPDzAhzfX0CKsxOV7Az5uY4JT30Aa/PYty+fIUPSm/1clrvQ9JHb92Vvltx0YRDZN4aQ6oY/2OMS\n7CmY06bo1DooKGidNejrTwS60nwlN3e/mc+zF7GpIJ8BkRdwz6BzSAxrFCZpsWCxNCSCVVVVkZub\ni8fjwWAwEBsbS2xsLIbaKZcQghnbj/P07gJMsoxPCLpHmPh0QAQun86yQiPndPDUR8vqOrzyipF/\n/CN0TkRrUjWmUwLXpp4U8wjbH43daW82SawphBAcPXqUHj16BGy/9dYi3nuvCqfTb9kXFmpcf30h\n8fEKY8eGhWrqv4Zx0eMQR4JnIDbZxnkx5/0XevS/jT8k4V99fRfmvnCAaqeLIXjqX4+6x6YCE+sZ\nQQ29SeUITizsZggu6kLxJHRiKOEmSgBcMOMpM+Mnt3zerJXHWTDtB7wOv9/ZU+Nj3Uu78bo03tgf\nhVcEk56OQhEdSDeUsWfZfk65vPWF1RGdzSy6qQObF23n3Zs/5cUqF0IXDLmgP395cypmW6AV/toV\nbxPhjCGBJDy4qSDQMpLw0Rbvn8nk4L5Fpby4shK3DywGeODcKB6Y4FfMlCQDWZ6HSVLuwyx7kCVY\n/tDZ+JyhLWhdgqJO4bhtTQeq/Bb7oaJikk1IQuKhpIdYlLSITSmbAPDiZY5vDgtiFzAodlD9MYeq\nnFy9fD8rCiowyjJTO8fz/NAumCXBvn378HgaiFnTNPLy8sjLyyMsLIzu3bvzZW4Vz+0pxKUJXLWV\n63eXO8k67uLmbREUuiSGRXuIr9XW27hRYeFClb/9zUPTSaEQcPBgcyOaQIorR+1zCCH82kNPSfPo\nXTOAA84DbSb7OtTUBMqJVFfrvPtuFa4mMiEOh+Cxx8p+FcIXQsenbQbhRFVPQZLaH81Wh26WbtyS\ndAtz8+di1/1CcGFyGGOixjA6avQv7uvvDX9Iwk/vbOODxX/ilqnr6V9eTbrmRQKOyFEs1XtSTR0h\nJnOEXi01VY/t293k5XlJSWk+6Wv5jK31ZF8Hr0Pjp7n7qBo4jOYyAhyEUerVef2NGuavKODKKyM4\n6yxLi4vDWZuyeX3ae3gcDUS1bfFO3HYPd3x1U/224uwSju7wL4R2pket6LCPvWzHjat+PcOvS9C8\nu6ZnT3hyrZmPtje4DpxeePDLCrwazDjPT/pV4mLu35HHdR2X0tFYRuXR0K4mAbjCDSyd3ifEp71B\nFIe8XWbJzD2J95BoSGSIdQhe2cu8uHkB+yz0LaRCVBAl+cNty9xeRny7gwqPz5+Uq+m8d7iIovIK\nHkyQWrzPdrudXbt28Vq+Cbsv0A2jCVhw1Mxxp4xHSMzcH8Y
LA/yktHq1itOp8NxzRu65x/8d1Qmk\naRrcf3/zay73PFPF+KrlDIwYRLTkv6+F9sJm928JTa+tqMjXbLbtkSMnGBHVCD5tB1U1ExGiqlao\nUMdmfQuT8ZITbvOZzs8wPno88wvn49E9XBl/JRfGXXhSfiEE/pCEDzDm7A7sKbiA/XursFhVOncJ\nY8iQo1T/3PYQusZQFKiubnlxq/RQM4JussSlU6JYu6EQXxO3ixcj6zkbyQX6SgVWVjF/fhWXXhrG\nJ580v0D89axleJyB7gKv28ee5fspy6sgJiWK/asP8sy5r9VPbSQkVFQUFLqTwW624SKFIqbSEtkD\nvP1uGKe/E/qzJ5aWMXb4Pk5LOJUz4pO5pKonK7anA3BDRBnWquD75gxTmf/8aWjGYPYxcB0+aQOi\nUaSPhES8Gs9jSY8xNMw/C/LiJduYTZYpK+B4HZ1PPJ9wk8k/8L11qBCnFqivmW6Eu+LbRhiapjHU\n6A6qRiCAbwuNeGojq9aWGZl3xMdfO7sJDxcoiuCzz0zs26dw220uEhNh+3aZ554zU1ERbKk/91wY\nd9yRAvQM+qytcsVNEdekekpqqhrSnSRJMGxYy8+AEBpe3zI0/QCKnIGqjEbX9wASitIX8FJVMxYh\nSvz71x5X47gaVRmAovRorukWIUkS46LHMS76lxWn/yPgdxelU1bmYuKo5Qzs/hUfvpPV4r6KItOn\nXxRdutqQJImnnorDaj0xq8Bmk+nRo2VJh6SBocPiJEni2ukdGNBbQq0nMb9+PRgRqOj1xcr9vtdP\nP63mzHMPNxtBUZRVHFKyVzWplB0rp6q4mmcnvIbXGWy1SUiEEY6lr5Wqi/fiscbQnO6zyQSffmoj\nPKETzQVz+DSJc1dMpuuSruQ5jrLktAnYVANmWWHTRRa8TXnErLLu8m4hyV4GIgw9eW/kAsLkWveC\nOxZxbBoZxz6hl2EoAgmf5GOHZQd/Twkto/u25+36338uq8HZJLP41gQDJim0QFkojI0EU4jykTVN\n7skb2VZGrYniUC+VWhkddu9Wuf56G+edZ+OhhyxUVDTIG3TuDNXV3RCiRy3Zh4bN1v5EI6vVSmJy\nEsVuJ5rwX7/JJDNjRmzQe2C1Sjz6aHBprV32XfxYOIj8MpmySpVq+7k4nLdRbT+b8iqVypoBVNb0\np6wyDKdrJkKEMqh8uDzz293/k2g/fleEP6jnItJjl/DjD6UcPuRg+rVbiJA+xedrW6z22WeHsXBh\nMn37GjEaIT1dpV+/1idBZjO8806HVuvFnv34UAzWwPYMVpVRDw7EHKayadcgZj8dzoj0EiTMgJGQ\nqphIgMyalS6ufyG0Fn3PM7qhGIKP9bk1UjI6sOGjLXhCkH3jU2RfdYiqC3KxPHMLE6dYAnzNVivc\neWcyJSXDuOSSQXRLjQn0sBhKIHEBxH4DajE1eilH7EcY8s0QBkVGcHzStcwbOopr7htH57sGYog0\nohhkrDEWpjw7jgceGE2KxYACGCR/oW+LInNexxh+mjCQq7pcQM0VNfx8hhPztk1w+GF+ONSJMe9E\ncsnHNn7Y25ElnZZQroYu/uGWGohnYIwNixL4KmRYZOR2uATiTCoJZhVz7aqsDFgVmQt7mjHLgazv\n1KAkXGHUWV4kqSEIGATR0RqXXOLhyy9rqK5O5vDhHthsrb+mjRUyW4Pu0wl3RfKBr5yYJW/S8et3\nSPhyPnOz/Fnjd9wRw5tvJpKRYSQ6Wmb8eCtr13akXz//yKxpPo6VD6WgTCLJ05/exu0YJBFSpLTR\nVeN0P0pQAgQAXnS9qE19P4lfhn+7lo4kSdlANX6T1deS3sMv0dJ56emdPHRvaPIDqBKXoekCu08j\n3KC0arkVF/sYMuQoxcU+XC2U17zoIiszZybQo0fbQveObizim7s2kb+9FFsHC6MeGMiQa3oE9Of2\n24t48cW2ydFKvcop2TGAGGPgwlfpsXIe6D8TZ+2CLYDRauTcO0dz0WOTePP6D1kzf0Oz7VbHVbHk\n2Q/oG9uXeafM45Q4f3Uij0enstJHbGxwQfRJs/P51/5cGHwOGJuExDXadWD0QLaduy3gmnVd4K52\nYwo3tavQ+ujn81h1wBUU2yhV+1jyeDnnx4euqvSC8QVuM90G+H343Rdvpdzjq5/HLOxqopOp7fZQ\namoqpqhY5h4o5tu8StLCjPwjI5FkXw1/XlrO2lIDEsIvUe3x4XrVC/kmsHgh3Ev/VANPPOYiKT7w\nnIMGDWozkdfU1HDgwIEW9/E5ffz86gEOLMqhKgoW3R9BRbLfMLAqKm8NG8NlHbvX769p2ThcM/D5\nVqALL+BfK6ijjfa4yYUASVIgKFkuDJv1HUzGi9ve2EkEoK1aOv8pwh8q6hx3LeCXEH6E9GmLn0/f\nOpzZ+4/j0gQRBoVZg9O5oUfzoYa33VbEnDkVeFpQEv7kkwQuuyyESNkvRHz8IUpK2paZKXev5KMV\nUQEvaR2KDpfw+YNfsW9lJuHxNibeM45TrxqGJEl8+cRSPn/w62bbVSwKEx8ew8X3hVbHDIWZO57i\ngV3/5/+jBSIwySZWj1vNiLgRbW47FDKrnPS8Lw/0EISoCx5MVfjpzhtZXbaGge+OoNOPXSnvWkLB\njUfZ3m9rAJEeqnJy809ZrKyN0rk3QWFiVOuGQR0GDx4cct/i4mKOHj3Kd1VHmFn2L3zGPJTIjTiF\nF1F0LYgMxttiuT++K2EhdIOSk5NJSmo9JLYOXq+XgoICKisr8fl8aLURQ7om0D0aez84zM7X/cV1\nBFAVL/P27Kh65u4VHs2+c6YCoOm5VFb3R4j21Y/1CfigDJZWQ4IC18XBQAvoAhS5U60P3167txVV\nGUiEbTVSK7VsT6J5/OHF0xpDAM/szq2P567waty4KYsNxdW8fVrohaLFi2vw+IAI/DrqTYh/x45O\n9O9vQghBYaFGWJhMePiv4yFrYxY+GDUMZxZikkOvDSR0iePmj0IrW/7pupEsfPhf9dZ/YygmhYtm\nTGLi3WNDHBmMUncpY5aNYUfljjbFx+tCZ3fF7l9M+Bet3gcGG7hDE35poY/Pqxfx3Ih5uCo8qE4V\nDALDuwayv86ly6i0+t27RVj4vjbpCmDPnj24WpraNYKkGpodGKKiosjMyeRJ9VrsCY1CICUg8U0o\nfpDepk4hyR6gvLy8RcLXNZ2Dq45waFU2pnATfSb1oGPfjnTs6JdUEEKw4Jav2P31AZxFLrRGtSIk\nwFKt0+Ggj4La9ac8p7+PmpZLRfUpQPuLhesCHskHe+2j9XEFnGWDWUnQMWwaMeYRuNyvI6jBZLgS\nk/Hqk2T/H8J/4i4LYLkkSRrwuhAiIEZOkqQbgBvArztyIigqbPnFlGqljpvi3cNFXN+9A0Ojw1mX\n5WL9YRdzVleRX6khxkOA/zxbwEodfBAWBgcOeCgp0fjLXwrIz9cQQnDOOWG8804HoqObTyJqCy66\nyMb8+VWtEL8Ona
swjDvO2YnB2umtITopkrG3nMGq19fh8/gzFWVVJjIxnCd23Y8tuu3x1tPWT/OT\nfRshIdEzIjjSpD3IrnGRVe2CFCA7oomVr2M45ODcO1NYOWM9egmo3tpH3Svh9fpYdO133Hnk+maJ\numPHjhw8eDBouxAi4BinDmWGSJqrMGAwGNgfsx89L/jLlBCoYZs57svAqWtYQpB+S+6cgyuP8OaU\nj3DVeChLLebgWXvhEFydcDXTZ/oHekmSOLgsm4qCSlRvbZIYArfNhcHlrxJjdtSAZAAM9I9MoMY+\nHbd3brPnbQ2KFGgfCWBVDYw5BG90781l4ZMwGiadcPsnceL4TxD+6UKIPEmSEoBlkiTtF0Ksqfuw\ndgCYB36XzomcwNiir1UQSx7qIZXy7klNPoFHNueyeZkFj0/gapyhLdX/40c6MFGGJTqKIlNZqXHN\nNQU4HA1dXrrUzsSJeaxff2IDVx1mzozn2+8c5Ob5gt2dtT033bIf5cwClpw5CavatoIvTXHVSxfT\nZXgnvn/pB+zlDoZc0J+J945rF9lXeCpYUbCiXeftGt6V0+JPa293A+DRBbIEdHCAR4HjNpCEP1sr\nzMVwXTBhQgzP3pSF7g0m25pCO1V5NYQlW7D77EQYIgKIPDw8nPj4eIqLiwE/0Qug1AsRikBI4NHh\nrRKJqwa27NbzWrwISQQFOgnJxwWpifxJ7Ypa4wsaTIB6DZ8cVw7P5j7LT9U/0TesLzcapvPZpO/I\n75DH8qeWUJhxvP6YTaxmy5r1zD/jDdZWruXF5/5JyorO/On1seQNyGHlHf+iJr4KSZfpubwvx0/d\nB+H+RezhCXG4vFXtSWQOgC5ghxO8Id7kagFflH3JZYlXnmDrJ/FL8W8nfCFEXu3/RZIkfQEMB9a0\nfFT7ULmrmJFUshELAgPUCyKAhEZ3duCef5A1T00LOnZVTg26qw2ZfpIEiYAJDAbYutUdVDrO44Ed\nO9zs3u2mb9/AWMOqiip2bNpFZEwE/Yb2bdE3POWdfHLHCSiXIVzyB+voQKEORXDpOI0Lb8hgUvIE\nbOqJ69JLksRpVw3ntKuGn3Abdp8dWWq7KyvWGMu6s9f94qSY7uFmYowqdp8HOlVDSg24FZA1zlOi\n+HxmPxRFwhRupLreX9wAoQuu2XwNSyq/QCAwy2ZmDJzBvf3vBSA/P5/S0tL6/XM8cNVBf516VYII\nBap8EGtReDupZY2ZUZGjGj+S9bDJNq5JnsKEmEH1vv7GiIqKIioqij32PYzcMRKX5sKLl601W/lI\n+4ge0/uy87wt/pCgJrfzLfEmB7cfYF31OvRonfLzykje3ZHl9y3BZ66zbHT2jd+BUBs6dmlUSZvI\n3r8AG7h4KwS4BdyR1/xxG6qbDxQ4iX8//q2EL0lSGCALIaprfz8beOzXPMc3f/uBza/uYjBOdOMa\nCq2dia4sxCYq0FHQkTDgQy30otrd+MIaEbEA3dU+v7tlkMzSV1K5775ifMGaTRgMkJPjDSD8t55/\nlxceeBmD0YCu68QlxvLW9/NI69Ix6Pi7Fpbw4yGPn1UaJ/7IQIoCyYJOg+O5PC04Jvq/gWRLMgmm\nBI46jra676iEUXwz+hvMyomn0tdBkiQ++lNPzlmxB00XuFSBzazTLdzKx+N7YTT4v9dTbh3Ed/es\nweto+LKEqpPZ/QCLKxuqorl0F/dtuw+zaubWXrdSWFgYIGmcbpJ4NFXw2HFQZBmPgI42la/HUhyb\n8wAAIABJREFUdkdtJaqop7Un1yRcw3tF7wWk/58ScQrnRPtljuPj44mKiqovfxgZGYnVakWSJG4/\nfHuA/K+GhqZo7Jy8pcU1kx+rf6z/XTdpLH1oYdD+jcke/ANZa6j0wesl8K8qiFfhggjoZIJMN8wu\nhmMh3os6JBhOTGb6JH4d/FujdCRJ6gLU6e6qwEdCiCea27+9UTpeh5cnw+biMik8/8BwVK+P02fN\nRg0RWmPESln/bpR3iiZzXA9cURa/xZVtg+PhtFWNa87lsdx0ViSPP17KzJllQZojJpNEVlZ6vcTC\nph9+4oaJt+B0NFQjkmWZjl1T+f7Av4IsXemmw23qhz6n8/9M6viKghVM/mEyDs1RX6GrMXqF9+LF\noS8yPrn9aw1Nsb9yP3dtvYs9lXvoE9mHW3vNYHtZIkftbkZ1iGJyxxgMjfzeui5YfP137Px4P4pR\nwe62U5JYyMd3voEjItjyt6k2Si8tZffu3SGrVemygie5K1ZVYWBMy/IWjSGE4MuyL5mXPw+X7mJa\n4jSmxk/FILfujgtbFxZQBLyhUdolItcWvJwCl0T7cx/A76KpG8+EgBoN/nQI8rzNpeI1DxWVj3p9\nxCXxJy6jcBKh8T8TltketJfwf5ixkdWPbubRF85BGGT6fbiQ1I1bkUNcUyQxKKj1D6kj2szaG0dS\n1DvR7/fdFg+e1ic8FS+kE2mRKSnRyMjIpqxMozbyDatVovd4iZKp5dgMMvf0SeWne57k+0VNk+7B\narPy4ep36DO4odRcSbWP+Ltbt5QBPrw2nitHnFhh6X8HMqsyefnAy2RWZdItvBtjO4zlvJTzUGX1\nVxmYipxFjPh2BNnO7IDtMjLvjXyPqV2mtnh8eU4Vc7+fx9PlT1IRX9YiUWp/1tixY0fIoiU2m42e\nPX/ZgnN7kbQxiQJviCpr/wbCT1JhVXeIUiRUSbCgDAaHQaIKP9nhlmPQxojhIDzU8SEeS/9VJ/gn\nUYu2Ev5vOtNWUWVqzArC4BcVj8jND0n2/jgdvfY3/09YuYtxs1YRfSQXFAEDiwmdBQh1tsyD50YS\nafHfsrg4he/WJdFngowhShCeKnBdVMXWyQXkONzsqXRy9fqDLNsfmsBlWaayvEFbZ+nn3zP8yvdo\nJV2xvjuvbixj8dFSHG3MIv53o0dED2YPm833Y77nteGvMSVtCgal+XDF9uCSNZeQuCgxiOzBr4vz\nl01/waM1nzDx+ZGvSF3ThfvVu6lIaJnszbI/YzUxMTEoQkaSJJKTk0/0Mk4YtybfilUOFFMzYfrV\nyR4g3wejD5rIE5ex2S5z23E4JRM674XLck6c7A0YuCLhil+3syfRbvymCf/Ue4bgtDUsWlZ1TEYP\n6U8VKCEkCiQBfb85ANoroLggtZrmJqrzr43m8cn+ePcvcr6g9xdDGLJ+ArsvfhfvS8epnlmMPs4Z\ndEcLBw1AMQWLTnk9XvoP7wfAh699zN3T7sO1fSXobSFwwaZCO39en0mHz35iZX7bsnJ/i3hp30t8\nfuzzFvdx6252VASHhQohuOa7jVyy9EMc3uo2EeStvW8FICkpieTkZFRVRQgorbZgjexMePivO6va\n/eUBZo96h1l9X+Or+5ZTUxLsurm3471cEX8FJslEpBKJWTYzIXYCU2KnhGwzjLB2yyTXQUJiWf9t\n9I9+nxvzUvH8Sg4As2Km0lfZ+o4n8W/Fb5rwv314A7YSJxFlfv941rgz0UO
EKBoxI4cifCAq1w3y\n+SCegRQHWHz+9Pc6CAHOMp7Yuo8Kj4/p66czZdUU9lf+jBCr0fVnQb8FRGiirj79VJyRMej4B6Y6\n1RS3y8vEPpNZ+fUPPHXnM7hdHsJyt2PJ3d6ClS9g3Atw17loI96gWrdT7dM4/4e91Hj/Nyz9XxsP\n7HigTfuVukqDtr2/p5r3dkWA6SuQW0iZrsW1Xa9l1tBZgN+aT0hIILuiKxc9EscF91oYdEUFgy7L\n5WDOL5cJBvj+iTW8f+VCsn7IoWBPMatf3Mizg+ZiL3MG7KdICm/2eJPs4dkszlhM5tBMFmUs4rPe\nn/FClxfoYOiAVbJyunw6i82L+dT8KR2ljliwYMOGSTLxRKcnuCPljqAyiI0xMGwgmUMzyQjLQJVU\n1g9YH9JQOlEMsg1qfaeT+LfiN+3Dny69QgSwpUsiK/82CAwykcfy6PPWYqJK8hAoWDBhxRryQdeB\n7JGdWHfTQD9py6+AboNCCxRYoVzCtPU4hgNF6GkGLDOiKPWNI9j1Ywb5Q5AaIhDCC6oZ+cZG4g+W\nokkamTHVVJYeRMLh11SphclspMoQg7Ha76P1RCRx9NLZoDQduAQMXAhT/WGDeE1Qkw5r5hKuGJl3\nSncu7xxfv3dxQQnP/d+LrPhyFSaTkYuvn8LND9yI0XTiYZz/DUgftm6WS0hkX5BNWlhg/kPPN4+S\nWe6DuB4gNT8gZkRmsHL8ShKtDXVrN+51cdljhRwtrD1OA2r8s8KEGIWjS9MwGk7cp+KscPFw0nP4\nXIEhLapZZez9pzP+oTPb1V5NTQ0HDx6sX3cQQrBf7KdCVHBJ30vqr63UW8qC4gXkunNJMCZwVuRZ\nDLANaLbdJ44+waM5j+LlxAc5i2zhje5vMDXBv85S5ClifuF8dtt3M8w2jGs6XEOUGpjLsLV6K58U\nf4JAcFn8ZQwLb72a3B8ZfwhpBTvQAVh9WIYXD8GEDlSKONaXX43fftdJ5SgD2EIMwfUtNYPM7vMz\n8L/NNqASlDDoYEc9VIzx3WJS+Yk4CtAPSxxYMQDOmgUUgVgM1GViukBfAMrfiM4pp+/i3XTamofH\nYCSvUxcUTaPH8Vy8GNnJTwF9yO92DlF7v6kfjkxV+aR9dit5Ex9Ds0b7K1TLAm6YBt0bHWtwQ/hR\nSFqLXjgaeyNfvr3GwZShl1JSWIpWu33+M++w66fdzF/6+q9w5/1YudLBvHkV1NToXHZZBJdfHo7h\nF5BgYxwuO0zXb7u2ad/+Uf2DyB6gwF57T7wjwLDBz9aNMDR2KJsmbQrII3D6nMzfs4A7PtiK19wL\n5DEgDP6k63AQleBw6Xzzo4MLRp949afcn/NRTUoQ4ftcPrZ9uKvdhG+z2YiJiaGsrAxd15EkiQw5\ng+Tk5ICBLNYQy/Tk6W1u976O95HjzuHtgrfx0UK8ZSNESBGcHnE6Dhx0MXfhb8l/q7fu99j3cNqO\n03DrblzCxeLSxTyZ+ySbB24mzez/Dh/IfoBnc5/FI/yzslfzX+W25Nt4svOTbe73SYTGb5rw44At\nWNGQ4KgT5h4BkoG6WqQyuXTiOB05k++IoRQrTgTgiDKz+h9/ojIlEoQdyAZqLXSvwPBdLqfxJUpt\nQRBdaIx852dKN6Sy+a7z0I1n+91A4vvacx0naWc+Z728FsWjcaRbLzaeMQ65VmccIThr6RdEFGRS\nhd/n7g1PRAuPQ9IDXyRT5XE6f3Q97riuoGsce2owdAohXaA6IXY7WsEoxic3VI5a8v5XVFVU15M9\ngNvlZvOPW9m3Yz+9B/ireOm6aJcyZWPcfHMBc+dW1Xufli1z8OabFaxY0RFVbWhTCEHhvhJkRSK+\nR2ybFnGHfzWczVWb29QPCYmvzvqKnQv2s/rJTdQU2ul8VkfGPXY6wzqYWHHUCTWPQtRFgBskNwgj\nFlXh/T+9H0D2OTU5nPKvUyhzVuPNsIM3DJzxsPRzcEf7bQiDv8h3blHbyK85RHSwoYXIAAYoO1TJ\nwe+P0P3szu1qMy0tjdjYWMrLy5EkidjY2IC6vCcCRVKY130eT3R6gtuzbufDkg+b3TfRkMjbPd5m\nfPT4ZpPx/nrwr1RpVfXhuw7dgVt3c+fhO/ks4zN21uzkyWNPBoT3unQXz+c9z9SEqfQN6xuy3ZNo\nG37TPnwnsA4bgatx5iZ/S+gorGICX3JR7RZwR5gp7RIJwgXiJZD+CpIKAuTNDoYc+wQDxnofpoyM\nhETMgWP0/ngtSGbQ78K2KR7JbQHpfM58eS2qR6M6IoqNZ45DMxjwGk3+H5OZledOwdBo6mpPG4qh\nPDfktUmAuSQLc1k2T/ePI8xlxVhWu/grgOxJsOxj2HsDetxWelYOpHNNZ15yv8S29T/jtDuD2nRo\nOmfN+Zo/P51Dx45ZKMpBkpOzeOut9i2mzZ1bzpw5VQFLDR4PrF3rYsmShgSh7A3HeDTtRV4Y/gbP\nDp7HE91e4fjO5kvxHak+gvqh2mayB4gzxJH1ynEWXvct+duLqM63s+vTA8we+j4PdDdiVWUkLR3K\nloFjOrhHMyBiMocvOkyvqMDyldevu55iVzEe7LW1Z+xgy4VBT/t38JchQJElBvcy8cjbZSRfnEP0\n+dlcNbOIvOK2DwKJveNJ7BU6eU7RFJY/vK7NbdVBkiRsNhsdO3YkNTX1F5N9Y8Qb4/mg9wfkDc9j\ndtfZPJX+FDd1uImOxo50MXXhiU5PkDM8h7FRY/m0+FOu2HcFfz/0d3bbd9e34dE9bKreFJSroaHx\nbfm3AMw4OiNkLodHeDh1x6n039qfOcfnoDWzZnYSLeM37cM/X3qVVcRTUz9RkYAuNB+OoTONNwEo\n6mbluwfNIDb7p+2SXzdd8QriH3mZQbkiZKSDQKBLEkvfvhXJ7SJ26T+xT/sZ1whB9EEL/Z8aRknJ\nVHYPHEZQcVAh6L5zE9JGvwVT2Xs8QvMRmbmi2R6rqPz5nOvYvWIfAqjuUsmGuyMosVwEmgVkN9hy\nwBcGp92OGlZMl+e6o8wy1rsLNJMNR+pABBIVCWfj3poBnoYzhpnLuPqKKm69rRM9+/Vo1Qo3mTJD\nyEYLMLsYdXokK5elU1Pi4J+dX8JdE7ijJdrMjNw7MFoD1yjK3eXEfh4b8mVvCZNizmPwxaMCMmkB\nZFVi8DV96T5zNI+sLWNLoZtOERKPnBrLmE7WoGv0aB7CPgjDJ0KQtscGn+you0RGD7EghcPK7S6c\nbn9/VQXiIhUOvNuRiLC22VHHdxby3MA30BuRlwEDKgassWYeLLm1Xffivw237uasnWex276bGr0G\nBQWjbOT1bq8zLXEaPuHDus6KVwSvB8SqsZSMLKHLT1044j7S4nnMkpkpcVP4sFfzs40/Gv4QcfjD\nh8bTFRcNoZSi0e+hINXvYSkXKN7RoDxQT/ZooBULInIzW4xmkIVA0nVEzGFK56zFOboaEV5D2eBi\nfvjoW3KvWUpzhUEP9xlCWkQ/Yh
NiuHCwDUfHIS1eYx95EAdWZCF7FRSvQtSBGMbeYsJapEPXj2HC\nJPjTrTBmGnhs+PCRNS0Tj8GDBuRMfprDV39IwZi7KRxzF+6M/pDg75uJw6Txdzq4pvP92/dw/oCL\nOD1lFLnZzYuhFBR4m6kRIEFUNT983Itn3M+w7eNd6Fqwy0L36uxavD9o+70/39tusgfoVdEHWQ2+\n17pPkP1jHv3ijSy6sAM5N6bxeGEeq4e/xh3yYzzR4xX2fJ3ZxrP4nwabQWLmzTH88x8xAWQP4NOg\nyq7zztJm6haHQGJGPBEREZgwY8KMGQtqbU3jxL7xrRz9v4d3C99lp30nNXqtxDIaTt3JTYduwq7Z\nUSWVKXFTMEqBgQNmycy1iX51z1RTaqvncQkXC0sWcsDRfLEXXegnZwEh8Jsm/Du+O48eODGi00D0\nlbSUQKXVErm10sms/h0x6BI48GveF8hYnj2E0sJtqRsIzGXV0G8uwqwH3kVVUDHtfRi0xL+/rhNd\nUkRUaTHUDhREDmZD4RrmfvAo998xCaGaQlKdlTDCjRFoTUIuJU3Q46eV0PstUF1gcIDihYhsALQE\njWML7Ry77GU8iT39ylZ1P6oE58jIajUpPISRXGS8SPirPRXnlzBl2GUhZQXALx3RLExuhMnNPZ57\nWBj7GV5nsLXs82hU5VcHbV9dtLr5dpuBQTLQv3sfNHfoFzumc2T978tm/si3D66iptgf515ysIx3\nL/2MA8v9UhZGxci45HEoUpNZmW4gquR8nr85hvJv07l9WhR7sj0oIR4Rh1uwYW+omq2hoagyYx4e\niclqqncZgr/s5dlPnN7mdv5XsKB4QUgJCFVS2Vi1EYA5XefQ19qXMDkMm2zDKls5NeJUHuvkz8C9\nLeU2jLQeSeYTPjZVbwravt++n5HbR2Jca8Sw1sConaPIdLR1YP/94zdN+OYIE13w0ANnre2uA6X4\nvftNCUtgxM3XXIjJZuCfG+/hzuFdSXs6Bl6wwuNh8H9hiBIbPrxorUQkxO/NwxC+L/QdlIDRr5Nw\nPJeLPpjH+C8/5ZzFn3DhR28SXVqMwdngX79/Qgwfrv8Uoy1QcTEqMZrSB7ux5dwKMkfZcYQ39Ef1\n6USWHfGTfWMoDVNljy0Db1Sn0DXoZAhPWoNUP/w1dBugoqSCdcvWh7zu6GiVuFQvQffX4oRrG6qO\nfTD+bUy1SXG6BF6j/0YpBpkuf+oUcKgQAr0qtV3iLBISiZZELh10CT0ndkE1BxK1wapy5v/5C6xo\nXo2Vs9bhcQS6ErxOH98+tKr+7zdOfYNkSzLhajiKpGBTbfSJ6cGRO2dz28VRqLX1arsmh9a/MRuh\nd1r7wl5Pu30ok14eTXR6BKpZIXlIIlf/6yI6nda6pfu/hgg1tGqojk6Y4o9oijZEs2XQFpb1W8ar\n3V5l7YC1rOi/AoviX2+4MPZCbki6AbWVeBIdnRRjYFH3l3JfImNbBhurN6KhIRD8UPkDp+w4hTJv\ncJTeHxG/6SgdWZUZfedAPM/tYCTVLCWG41jQCKXGKOHBglcy82rhU5is/gVQEzIcaLgNLnqiYKCA\n46SQVntkIGlaCGPU+5m4ll3PwWn72fn3NfisgWRiUo4x+ttFGBpJahpqvIz+7guEsQZntQtLuL+f\nI4Z1Y0/1erIP5lBeUs4KQxZ37/8YhJvcHn6LcdO1fqt44EcWuq+NoOjUFjRoAVaHrnRV3xdTIXLT\nMl6N8MM3P3L62cGa9fm+fEp+PB0+Ph9qwmDRuZCTBmdugFvfrt+v0liJqW8cS7uFsefMFDRVJqLE\nyXk7y0kbFihPMOurEg5V3A+RVwHXA3rD6BNCL0aRFCYmT+S14a9hUkxc8v5EvvjrUnYvPIisSBis\nBia9NIbOZ/jVSB1lzmYjYoozGxK2du2JJm3daiTz9wzrncutpw/nvLQJKE0Kk5ySYaJrsoG9OR68\njewCgyrx10nty8SVJIlhf+nPsL/0b9dx/4u4Kekmvi//vl4RtA5RShTDwxskuCVJYmTESEZGjAxq\nQ5IkXun2CtclXseIHSNC+vsBjJKRUVGj6v/ead/Jvdmh3YIuzcU7he9wR+odJ3ppvxv8pgkfYOxT\np+Fzamx8bRcTqaAMOwtovgBJeLhST/YAN94Ywz/+kd9oD4koulHKPnI5SmqTtiKJ9Ydpet0Yj0Gf\nmb3ImDeIT3a8ii+8wXLv/Hl3JCl4liApToqGF3Bd1l/4eGDgolN6907UxMLd33xc15UgbL/Syb5J\nTrRTnP4JTYgZhuyWSdmVz7GmAUz1nZBwJfREP7QcmdDVwtK6BFuYLuEixZkCsQJuec+/8d454FbB\nHHitmk/h8Zt7gCYhav0flYlWPhhn4cKjJUzpFI8QgvHLd7OsohIiTEB3EJ/jr4ezDxAgkuhs0ZmS\nfgZ3ZdxFojmx9hIaLsxjcZPxXjpnzBlGeWkFJclFpBgaImCssVZUY3DMO0BCrziyCzyMv7OAzGyf\n/36q4zi6GX5eJHHkI4lIW9PbJ7Hi2ST++lwxX290IAT07WzkrbvjSYr9zb9SJ4yzo8/mjpQ7eCb3\nGQySASR/0tU3fb9pV80EgEHhg/i016dcuu/SoPh/GZm3ur8V0Oa7he/Wx+03hVM42Wnf2f4L+h3i\nN/90yqrMhFfPYuzTp2EvcjDvAzsLHi4mFNNZLBI33xxY//Xmm+N48sliCgp8yOioCJz0J5zjVFNJ\nLjl0IBkFFSOm0Jo8JU76PX4NPz89p36budCKGsI4kX3gSKnhs6SlvKW9iQEDC3IW8H7W+5hkE0vy\n1ZZVECVwRwioMDS7j+JS6Za/itKkM3DYIkK4dQSulM5IzZA9QFpEOntXHqDnGd2QFZmfygq5Tb8e\nYRINCnT1FxvC/eU1IyQ9RMYwXLpuEx/K3Vl4WGdZQWWT64gHHghYh3dqKs8OOSWgDbuwU6VX8bT3\naV4uXIGeeSXYUyB6N1bTYrTwHP5i+AuvmF5hxU8udqX1I33PzxiEhgAyu6awfVQaRYOt3HZbJhyw\ngKjtiAREQHmN4G8vFfP+A4k0RWykwqLHOuDy6Hh9EG79TXtHfzU8lv4YNyXdxJrKNUQbohkTNQb1\nBOvVXhh3IbnDc5mWOY0fKn4AIEqN4qWuLwUJsTk1Z7OL/gYMDAo7KesAv/GwzObgcOh89FEZDz1U\nSFGRhtUq4fPBJZdEMn9+x6Bs0OPHvYwYsI9eJdlE4iGJY2SxjiwO1D9EcSTSmwF4m3GDmBUbH+x/\nFWeiP0IhaXlHzpx2DgZ7oE/XZ/WyZv5S8ib6VTQlXYLjIH4W/kKgypTQfvemkHQ4/4uQH6k1KuPP\nmYJlfypLrrgOr7lxPHbt9+1x0/W9acha8CJjD6UvyWGpIASqxcDGR7pwsIOgZuxNbVdorOgGa2f7\nw0WD+u7EYt2H094HCBaWC4XMyUPoHmGhRtTwV9df+cL3BV6PBb24H2x7CDQTfvPc5x9Vbdn
ISRu4\n3nMW7991Ok6nTr/STLras/lm6Bl4JBNE+iDaAxUyZFlAa1JwJhLCrRJV/wpMgFr9wga++r8V9YvF\nilnBFmvFUe5EkiR6ndONyc+eTUx6y6UPT6Lt8OgeqrVqotXokLOFFeUrmLx3cpA7qQ422cadqXfy\ncNrD7Z5t/Bbwh9DDbwsKC70cPOihWzcjHTo0X2zi2M+lzB6yuF5euZij7GAVVVQgEJzKaIwYcTVj\nFVuxcXzccb5a+L5/gw7jzptM3NZEVIf/vF6rl+Pjcljz3neBxKnh14lYhp/wkZB90Hm9hbQtJlzh\nOpljnJR2bTxlECRP3shxjhMADcKzI5k8aCrl/UvIG53P9qSb4PBQ/BM6gek4uIsE8Ue/ITLfX1O+\nrjuxUjzdRZ9GZwFXpMLni2tg8FttJ3yfCt8sBb0poesgl4FSCt4etLXBs+LDWXXOAM51nMtK32o8\n226HXvPhxzngahrCWOuvVzz/z955x1lRnf//fWbm9u2dZSu99yYoIkUQFYm9N9DYEqNGYzQaTTFq\nikk0lmiMiVGRRBRFQZqIFOm9LMsusH3Z3m6dmfP7Y+6WuwU0P833+zV+fCHcmblzzpyZ+5nnPOd5\nPg+4j8CT54JfgUEt0OKAFgUmN0Jfr3WBprC+sjgJqjs8I7EQGyuof98i/PyPj/Hmle/jrw9goBNC\nRyJxYG8LpzQwKHE52J87jH5zB1Lng9IqSZRLMLCPnXFDHFw6NYoku87eJYfwNwYYeG5f0ob83wvD\n/N8EKSXX5l3L0pqlPZK+W3HzQMYD/DT7p//h3n39+JbwvyRCfp1H3a9FRIrks5v9fIaJwQwuQEWl\nESsr1Y7D0iQH/PiJIgpUwfMlT+CscxNzLB5FF8TsiSdraR+kJjl6w0EKr8pDqt2MeQj4HKiZh+q3\nMfuJRGLKVWwBBVNITJtk+1VN5M/wWS8IVXDXgEcoz/kr76nPIrwKaOCocTJz/jz23b+dE98pQGom\nJg6Qguilv2bBpHgeH3I+iu7hu58fZdHeY8R8tgmtvgFb5hCUmDhSDlcxZPlhlNbJgEvwzsF/oCd+\nCRlmCRxcCPnX0b7QEI6ksh+A4CBoqz98etiAI1dnMNg7GH/+RXB4AUy/Htb8A0w7JO6C3musmU/1\nWCibCtIOWjPsjIPtCTC3Duo16Ouz3n0dm5ZAkwovp7TviIHvXx7DH+5KYvOzO1lx//o2q16G/wNr\nUb91Yf+kJ5p/DZtEnSs8szEFNNNejN4Ndg+ck7+VEVUFGEED05BEJbuZ9ZOpTP7uODS75TasOVZH\nXVEDvYal4Ipzcmj5UfJWFRCV4mH89SOJz2wPO/0WFumvqV/Dv6r/xeKqxdQZdV2OiVajqTujrmv4\n7f9xfEv4/wZemfURBavbLWaJpI5KdrKK0UzCjYtmWrBjw4atLRNXIhEIJBJDNVAN62EyNQMpYO/C\nbax/soNVL7Fi/53QtiSgA/shalMOg1ZPo996F7Zg5NTT0CTvPXmS2b9MYPkjffAnaqA8jRB1CGmQ\nerQ3U5+aTcWsEja8vArD3TU+3YULNx5SPv8LhyoSInd2eBaUoMH8H36ApyFA0Knwz5LnMe1fIpFF\nAlWQvutZynyDAQ1EC9iKLD2bwBi+bAWPFZfrXBGcT8PKP4M3Hc68DXb+BCbfB57wfROAoYEvFTY8\na+nhiFqoyIRowCZ7DkYOCliUBFU2EJLsIQqHXs1GNQx+lfx8l2zejoQPoCsKz54xB6/N3u6Wa830\na6DdmIgBVejc/tnbaLI9eki1q0SnecidksnJwzWWBpGmYAR0PCluqqqqORo8RBMNCLsg7sxYLv7u\n1dx86aX/tibSNxXRG6PbEsA6wiZsVE+q7jGE9P8q/isybb9qXPvOTGIyPLS6+BRFkOruzRtr/kGz\n2/qxRxGFHXuE7IIIZwEoKGiG1mbxqbqGFtIY+dIEhv8lfC9aF2R1YBlQQPv2RuizMZmBa9xdyB4s\n1/RFDybjqdOY/GoDKA+DOIkkhClMKnPK+PTulez//u5uyR7Ah49aWcsh99quOzskaJl2lfd+O4/y\nQcl8vnAypr9X+ARJltV6OjtBAElQr/8I1F+CYxM4DoPiBdlTYlwHdDJEojSFMbZhlktND1d/8pTD\nmXdbZN9xIVnVwV0GE38EUgXhtiKL7Kcge7CuyWZaFdAmNxF/ZT0uh8LJ/dXdZvN2tOwB8pJ6oStK\n5BpM6z87LuUEIbmpG/XWoEFdUSO73zpE7a4G7H4HarOGETKpKa2jNlhNMLyGJIOS+g3wVQegAAAg\nAElEQVS1vPO3X3Ldshk8N+F1ynb2rFP034aeRNYStASi1f89pUH/0/g/H6XzVcIZY+f+o5ez/51j\nHFtfQUJuNGNu7E90qpthU7I5uaoWAchuLVOJjhHOl4wkB9VUmfTLaRy8ZjeOOife9GbL2gTYB3iw\nrP0qiKqJ7qzi2waBQNOtttMONaD5JXqH9VjTZlAxvBhleA+sFgRMrOzgsqndXEK44Q6kv/rH061t\n++9Cq8hEhtIxXCE4/wLLR34qCPDiBfMTCDSAfQaIGDBPAMPDFx2JjEN1THs9j8SSFgJujZ1zMtk2\nLxOfIRm15AiDR93M7pQtUDEJnLVt7XSBIiHuKMQchbqhX2wyIYD0IMyuhwSDQ42CouYmlHi6jeOX\nWIE9ra6vJofLIvwufSHyRSOhV0NVt4VuFBRcuNpmjH6sUF8bNlJJJ4Ve5LGfRuqQQYXGtTFkP7WL\nrcOGU3t2Hd/bdwNRaR7eu3cFO9/cjzQlwy8exNyfTycuI+bfKjnpD5r8fWUTS9Z7SYpVuHN+LGcM\nbb93pjT5tOFTTgROMDZqLMM9w790G181ns59mtn7Z+Mz20Ol3Yqbp3Of/krKbv5fxX+dS+dIg59X\n86upDuhckBHLhZlxqF9gOly0p4QHLniFtJKuiVitECiUU4QDJ/FEKiFKJA19anHUuHip6GlrkfB9\nrL9jsRZtdcj8vA9nvzTnlFo+AIZq8M9nXyUUHYwkMxcwk+4JTgLvAWOAPau7WVDt7jshkCVk7vZQ\nPDq8sCgETHgI0j47tcVsAntBFAlkqONzdicoV4TP1fY/Uo41ctkvd2ALtpNryK6w/+x0Pr1uIG3T\niuRNEF0Cw587NZFLYMvPoOKcUxxE92GwshFFPo3KJiSShU/fQ9KRNKTefh1Bm8DUBHa/iSKhKDaR\nRSMmE9I62VEm1v1tXXOPhkHVBUzP/xx7J70hF6626mwBAujdFB4J4GcXllSBcJnkPl2APN6f9N/O\nQBlWg37ldoRLRy4ZjLaxL2p4QVl1CsYtHMHcJ6dj90QGMOSXhPjwcy9Ou+DiszwkxysU8TnVoWK+\n92gme/ak4fVLhACXXfDUrQnc9Z1YigPFTN87nYpgBYa0sltnxc/incHvYFN6DpL4T2BDwwYePP4g\n+1v2k+3M5mfZP+OixIv+R/v0deFbH343+EdBNb
dsOoFhSkLSchOMT/Lw8bn9sXVnmXXCg4+/SeCx\n/C7bVVQESls6eDPNqB20UVohkUghWfaPtykcnQetUiAumJ80n9zoXKYnzODt/h+dkvAlktqsKj56\n/J/WhtYFSDswDSvSsZuvqy0aqX9Mp2xgEcQ9CY1dM2mRDWAuAVaDPAkiheiqG2lKnWntb7OOQjDi\nd5C1CrRwaKdhi5B3AKAc3DvdSF3i0/0gfgVicrehpxc+s4fcXdVd3iG6TeHPz51F0GVFGYGArGUw\n+qmeCb/1sd7yBFSc1cNBPX1XgrkQKIRw0o+ryc0Vf7qZzBN90GUIHRtrLs/k+LBYLnilkPSCZqQU\n/HXsNE5GxXbw4UtrwbYx/NkGRIFmhli46R2cevt4CQSuDtXZvLR0G1tuYLCXbQTwI5Eo4afNRJLu\nTCPjhjrEvjREog91YB2238xECc8YwJK3uPBPMxl/i5Xd+9O/1vL02w1IKVEVgTuhkttfvgHTXUYo\nJAiZQQ6svoiVv/slSOvuOO1w86//zIv1v8bspF0lENzR6w6e6/fclxv3b/Fv41vCD+PozkJefeAN\n9m09SmO0m/x5kymfOLhtv0dTeH5SFtf3616bvCMCLQHuTXwKJRBERUMAjvD0W3T4QZmY1FJNFN37\nCvPnHOTDaxe3WXxJ7hH8c8DnnDncSXWgknMvvoORH6d3CiKxFoYN1cCw63z843epzwzLAihYi7/D\ngBy6tbq1Fo1xvzuLMX88g2Oj8lj+641wfDERjCkrwVxAwjEng1cOJ7oijoqhxRROmUJj+gXQxWIz\nwd4AZ90B7koonwIZ6yJ2a0UaPxM/4/rc6xm7/CUq41UrkqZ0OnRKYrvxhxuJO9k17DXgVFn86Dhq\nMjqmvJow93ywd12Y6whb/vXoBxb24IbrHg5xiIB+NxBZU0BTNKIDsSQoiZQ47yJgjqJ1/LSgiSxy\nYnweb91XKRCmTkzNQRqcQ0CxWy/kDoFJyf4SLtm8HBsaCio27NixdyB8L7IbIUATk51sJkSwi2Eg\nEPRhEBo24okHd4jiWYfYdN526jKqSTqeyrSXz6PvkQHcuOoyqhITmXZPOd4Oyp9X/f4K0ofsQtHa\n14GCPhdrn3uEfcutmZnN6YfL7ibUb7V1gG6DplTw1IDdh0BQe0Ztl9KF3+LrwX9FicOeYJom99yx\ng/de3c2Q0FqUsEhYdJOPEa+uwNHYwvFZ1ti06CZvFtaelvCllMybWcCxwCSmsoJaqunDgIiFu46k\nX0QBfRiIk65FKGQlYbIX4I/Ft+xFLvJXYNcEdy2IYd+kn+Go38KgLTvRnSEKJ+XRkF6Hs8FN0OOn\n8Mw8AtEdiNEEYoBMunexSIg9mkjaznRqRlSSs6c/uTv2cWzIBqjsYP2aD4PZSGNqC5sWrkEqEjWk\nYmh7Qb4O8jkQHXVwFAjGwdq/gWKAsyqC8IUUPOR5iAdzHyTPzKNy2rOWfv/J8VB5BuiRmgXVmdHE\nVPlRJARcKpsu6UveGalIRRB0dA6jU+DgLTDqmVPeNy17BYnHbqGiTcSx5zTmhX1TyfA4KPPu480C\njeZOCcS6qVMXV0NdWg0En4CyRVY/EOhRDTCiApHlZcCvluIpPo49aIWx5g29nbqkMV0ks6uiE9ih\nbifKcDCUUdhxoAN+zYFbD2JDa1ukbYWJSRON3bp6rKuTlHKCWOKwoVEy4QQrv7cU0269OEpGHGfR\nr1/mwoeu5A9nNWDGRTM6OovSuFROxPfCnVhD2qC9EWQPYHf5GHfZX2ioyKCuJBdfUwzDc2qoiYOS\n5Tdhrv2BtaAhVRizGDnnF7x18q0vVU7xW3z9+NoJXwgxB/gDljn3ipTyya+7zdlT17JlYw2j2dpG\n9q3QgiEGvLuBE+eMRmoWiYRMyaLCGmb3jiXeYQ1JU5PJJ5+0cORIkNpak82bfaz7XAVyOAs71VQw\ngKHdWlgKCn78lFNCLv0j9ksk3jENeOxxePdfiNxzOy2+VFrj9x5/th4mudl+3jnUDXiJokl7kIqJ\n7tBP7a8+E+jJZSqgZkQlH732L6Qicda5GPWniRy78TFYttyKV5d7gTxQQHe3M51hN7As3QCYPwX1\n5a4nl3YwJCTvAMOGXdhxKzb+4vkLF8ddDMDl/sut2YCQkLgfzK6d3TI/l5y9NQjd5J8Pj6W2lxvT\ndop46ePzYcDfwN0p4iXM6Zq0w6ZnqPMLkKVAMZAFjipwfQJ6MnjHMzYuh3emTSI7ysVJX4i/5I8g\naHYjF2FN6SzYK6DXrVD3CIx5BhG1H/fBaJx9HIj+SdgL2kNeM4+/T0PicCsfohU2L4r7KCPM8aQS\nj4qGgcE72RModScwu3Qv/RrLUTAx0NHtIYRuZZwe5WC4O90/EAH8aNgoSMpj3aMfIbXIWbzu1Fnx\n0BIuuecGoqv8jKtrYmTRIXRVY+Psvkize/dmYnYBFz1+J5otyPEdk8mdsAvd1DCzl/BuyRxK94cN\nzF2XghrkWPapC5l8i/88vlbCF0KowJ+AWUAJsE0I8b6U8uDX0Z7Xq/PnP+WzZWMNKn5ctHT7kzAN\nBdvOEMFEO+QarC1vYm2FpUY5MdHNHXXpLLinGD3GD07DCtM7HEerC6KJXBwc6rEfEkkQP166uhuE\nELzzyCJ+u9rk6W316J2iJ6UEaoFkScHUAtBOEwkThr3KQbB34JQaPMFYy9ce8gTZ+uB6EEEY/BIc\nvA30RacJ0jWBQpDVILqbDQk4/h3sNeP5+9RMLo6ZYAloAcVmMflmPm3hR85a6PMOHPuOVbUrjOrs\naLZcmIOnIUhDsuvUZN+KyomQtdy6Na0x7z6weWyMrbmNvU1pBEIPAlux3ohey78eZ1rHJr5ARVNv\nsqNKeCmvih9sLUIVLgw5LvydDtITCtZMqhWOIyhTbyDxvRSy77fUIEVI4BvcQsuQJjwHLZeey1vC\nwMNPcHjmBcjqodYLauyb3PPYMNwyuY24G+zRFMT0QldU3smdhGoaqNLEaTaTmruIzBWHu32mOkNi\n0kg9+36wndT8dBJLkmlIradk+PG258OX6OUff38BJaTSd0cfzmqOwZ9QyZDVuWgNapcAKikhaIAz\nyvqd9J20DqGAig72Bi751c28cNlmQn4PhNyw/Woyv/+v09+/b/Efxddt4U8AjkopCwGEEIuAi4Cv\nnPCrq/ycPX4VlRWWq8PAwSbmAWCjhTF8iikcbLZdj09Jh7/STo4X+GGODxTYstvHlp+XQ1LImqJW\nuWBYDTx9GN4YADuT2cgwRrGGaipJIjUiJt/EoAZLvM1DJ5lFIHtSBgnZcdQ2VnUhe8AiIR3QCkHp\nWpe22+OBi+f/kPLZ+/j0F++fPgRRgWB0mMjSl4NcDnu7Fq7oCgE9SSpLwFCwFYzmxgN2Xs+t5q1L\nUol2KBgYXa3RoS9A/GE4cg00DEQBfjEqi98V1lOTGd/FKu0RVeUWj9sBL7ib3DzV5yluirqJZ0/U\nsiV0H
+        [remaining base64 PNG data omitted: scatter-plot image of profile_small]",
+        "text/plain": [
+         "<matplotlib.figure.Figure at 0x...>"
+        ]
+       },
+       "metadata": {},
+       "output_type": "display_data"
+      }
+     ],
+     "source": [
+      "# Draw PCA for profile_small\n",
+      "plt.scatter(profile_small[\"x_pca\"], profile_small[\"y_pca\"], c=[cm.spectral(float(i) / mx_color) for i in color])\n",
+      "plt.show()"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "execution_count": 1,
+     "metadata": {
+      "collapsed": false
+     },
+     "outputs": [
+      {
+       "ename": "NameError",
+       "evalue": "name 'plt' is not defined",
+       "output_type": "error",
+       "traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprofile\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"color\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malign\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'left'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbins\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m400\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtitle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Number of contigs in each bin\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'plt' is not defined" + ] + } + ], + "source": [ + "# Draw histogram with number of contigs in each bin\n", + "plt.hist(profile[\"color\"], align='left', bins = 400)\n", + "plt.title(\"Number of contigs in each bin\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
+    {
+     "cell_type": "code",
+     "execution_count": 44,
+     "metadata": {
+      "collapsed": false
+     },
+     "outputs": [
+      {
+       "data": {
+        "text/html": [
+         "<div>\n",
+         "<table border=\"1\" class=\"dataframe\">\n",
+         "<thead><tr><th>color</th><th>1</th><th>2</th><th>3</th><th>4</th><th>5</th><th>6</th><th>7</th><th>8</th><th>9</th><th>10</th><th>11</th><th>12</th><th>13</th><th>14</th><th>15</th><th>16</th></tr></thead>\n",
+         "<tbody>\n",
+         "<tr><th>1</th><td>8.876186</td><td>18.261282</td><td>51.187960</td><td>6.134054</td><td>12.633700</td><td>5.477037</td><td>6.014585</td><td>5.304814</td><td>6.874191</td><td>12.439046</td><td>29.793732</td><td>22.665219</td><td>8.702589</td><td>11.272143</td><td>18.419851</td><td>3.956024</td></tr>\n",
+         "<tr><th>2</th><td>2.989724</td><td>1.488277</td><td>7.840371</td><td>21.469497</td><td>5.473986</td><td>6.484202</td><td>27.589086</td><td>8.524538</td><td>36.255537</td><td>13.439261</td><td>16.893876</td><td>16.096439</td><td>17.427745</td><td>16.574618</td><td>21.106419</td><td>25.184315</td></tr>\n",
+         "<tr><th>3</th><td>3.720807</td><td>12.429749</td><td>28.174160</td><td>5.869552</td><td>8.563257</td><td>5.155427</td><td>7.075434</td><td>5.348820</td><td>7.066567</td><td>11.246722</td><td>28.066567</td><td>20.510428</td><td>12.535407</td><td>14.223367</td><td>19.688710</td><td>5.910453</td></tr>\n",
+         "<tr><th>4</th><td>5.762003</td><td>1.769082</td><td>14.549234</td><td>30.423671</td><td>8.715794</td><td>9.015703</td><td>42.401146</td><td>11.751899</td><td>45.332861</td><td>20.204209</td><td>22.506758</td><td>23.953855</td><td>23.170228</td><td>23.510362</td><td>27.957910</td><td>33.567834</td></tr>\n",
+         "<tr><th>5</th><td>2.803744</td><td>2.260464</td><td>5.384254</td><td>9.709994</td><td>6.948961</td><td>2.923761</td><td>7.159026</td><td>2.579015</td><td>17.632118</td><td>6.025057</td><td>10.539294</td><td>11.215262</td><td>12.417924</td><td>20.254556</td><td>11.322110</td><td>21.699245</td></tr>\n",
+         "<tr><th>6</th><td>0.597182</td><td>11.346204</td><td>19.132968</td><td>8.455170</td><td>5.968345</td><td>7.536916</td><td>9.468041</td><td>6.318810</td><td>9.369510</td><td>12.423428</td><td>41.204192</td><td>28.746239</td><td>18.627272</td><td>21.547352</td><td>27.557788</td><td>10.249935</td></tr>\n",
+         "<tr><th>7</th><td>2.218029</td><td>10.157114</td><td>23.773194</td><td>5.336961</td><td>3.908277</td><td>5.085754</td><td>7.689371</td><td>4.638199</td><td>6.644870</td><td>9.314052</td><td>27.603528</td><td>21.129202</td><td>12.519881</td><td>14.149741</td><td>16.151058</td><td>8.002107</td></tr>\n",
+         "<tr><th>8</th><td>1.575741</td><td>10.346771</td><td>22.638090</td><td>5.676162</td><td>3.974290</td><td>5.669085</td><td>8.581475</td><td>4.796650</td><td>7.207829</td><td>9.250739</td><td>28.592135</td><td>22.005285</td><td>13.801397</td><td>15.090119</td><td>17.547433</td><td>8.376870</td></tr>\n",
+         "<tr><th>9</th><td>5.283090</td><td>17.775891</td><td>46.993908</td><td>6.290751</td><td>6.759553</td><td>4.634946</td><td>4.491508</td><td>3.912036</td><td>6.604947</td><td>5.856747</td><td>15.012368</td><td>18.485785</td><td>6.398652</td><td>8.899114</td><td>15.555289</td><td>4.008030</td></tr>\n",
+         "<tr><th>10</th><td>3.452875</td><td>0.668882</td><td>5.485553</td><td>15.041078</td><td>5.360934</td><td>7.015324</td><td>15.554694</td><td>9.098680</td><td>6.356780</td><td>10.844457</td><td>4.134404</td><td>9.089634</td><td>4.111142</td><td>7.369242</td><td>10.711437</td><td>6.492015</td></tr>\n",
+         "<tr><th>12</th><td>8.246084</td><td>15.033800</td><td>32.710120</td><td>5.181884</td><td>12.197754</td><td>3.953421</td><td>3.386129</td><td>8.057811</td><td>4.936727</td><td>5.820899</td><td>18.617374</td><td>10.184151</td><td>5.140045</td><td>15.910655</td><td>12.137778</td><td>5.062036</td></tr>\n",
+         "<tr><th>13</th><td>0.584997</td><td>0.062744</td><td>2.609888</td><td>7.971209</td><td>10.116311</td><td>5.365451</td><td>11.266231</td><td>3.404336</td><td>10.007112</td><td>4.673205</td><td>6.870383</td><td>5.352833</td><td>6.724363</td><td>4.729984</td><td>6.320257</td><td>5.385639</td></tr>\n",
+         "<tr><th>14</th><td>10.285632</td><td>3.370204</td><td>39.660790</td><td>31.097938</td><td>63.584399</td><td>27.368937</td><td>47.739140</td><td>17.804931</td><td>22.058417</td><td>14.028805</td><td>31.784768</td><td>24.160963</td><td>15.563083</td><td>13.804010</td><td>13.530706</td><td>54.312478</td></tr>\n",
+         "<tr><th>15</th><td>5.791718</td><td>22.044100</td><td>41.488028</td><td>5.091253</td><td>5.368434</td><td>3.731248</td><td>4.200220</td><td>3.388468</td><td>6.180186</td><td>5.273027</td><td>14.283289</td><td>17.374908</td><td>5.786343</td><td>7.882604</td><td>14.754825</td><td>2.979721</td></tr>\n",
+         "<tr><th>16</th><td>9.841274</td><td>4.174403</td><td>39.461849</td><td>21.626332</td><td>50.364483</td><td>46.409553</td><td>44.900184</td><td>35.059767</td><td>52.550276</td><td>23.415432</td><td>33.419473</td><td>87.867238</td><td>54.242866</td><td>77.074342</td><td>25.267238</td><td>38.063686</td></tr>\n",
+         "<tr><th>17</th><td>8.076575</td><td>20.318730</td><td>104.826513</td><td>77.196962</td><td>49.132814</td><td>44.755586</td><td>94.593648</td><td>44.417524</td><td>24.696334</td><td>65.645870</td><td>55.305800</td><td>102.823123</td><td>82.419784</td><td>109.871579</td><td>66.819106</td><td>110.729977</td></tr>\n",
+         "<tr><th>18</th><td>3.117764</td><td>2.163362</td><td>24.956252</td><td>67.610626</td><td>87.484668</td><td>49.998282</td><td>67.801216</td><td>55.489691</td><td>59.346815</td><td>26.625033</td><td>27.430611</td><td>32.191250</td><td>10.565028</td><td>22.097674</td><td>32.502776</td><td>19.317209</td></tr>\n",
+         "<tr><th>19</th><td>2.393682</td><td>0.055353</td><td>2.096665</td><td>6.177805</td><td>5.615364</td><td>4.633590</td><td>7.547185</td><td>4.919401</td><td>8.712569</td><td>5.193061</td><td>5.590117</td><td>6.207371</td><td>5.638585</td><td>3.272310</td><td>6.886324</td><td>4.478871</td></tr>\n",
+         "<tr><th>20</th><td>3.525765</td><td>3.095343</td><td>10.604712</td><td>28.575365</td><td>37.072472</td><td>9.322816</td><td>33.526178</td><td>16.519565</td><td>36.144530</td><td>21.231744</td><td>26.399146</td><td>23.185726</td><td>12.706944</td><td>32.464040</td><td>22.168641</td><td>25.041196</td></tr>\n",
+         "<tr><th>21</th><td>5.743260</td><td>4.832874</td><td>5.839154</td><td>2.948683</td><td>9.521599</td><td>1.167279</td><td>3.499847</td><td>8.151808</td><td>14.434436</td><td>18.306985</td><td>25.753370</td><td>24.048100</td><td>17.901501</td><td>90.245558</td><td>31.771293</td><td>5.635110</td></tr>\n",
+         "<tr><th>22</th><td>0.001719</td><td>0.000313</td><td>0.001563</td><td>3.793438</td><td>4.291719</td><td>4.475625</td><td>2.629375</td><td>5.270313</td><td>2.415313</td><td>2.826406</td><td>4.040938</td><td>8.905938</td><td>0.847812</td><td>4.703906</td><td>4.397969</td><td>4.439375</td></tr>\n",
+         "<tr><th>23</th><td>3.381265</td><td>0.067379</td><td>5.667543</td><td>2.866886</td><td>5.008546</td><td>11.233689</td><td>5.872638</td><td>8.505012</td><td>6.408381</td><td>10.862284</td><td>6.733607</td><td>14.372227</td><td>3.506984</td><td>4.217420</td><td>6.728348</td><td>4.541988</td></tr>\n",
+         "<tr><th>24</th><td>3.950365</td><td>2.161189</td><td>1.699535</td><td>42.691567</td><td>6.211487</td><td>10.543161</td><td>35.040007</td><td>1.148406</td><td>8.081673</td><td>9.018260</td><td>10.304449</td><td>4.331341</td><td>101.074037</td><td>144.088313</td><td>24.463977</td><td>12.969290</td></tr>\n",
+         "<tr><th>25</th><td>7.016312</td><td>5.696904</td><td>7.138981</td><td>50.494840</td><td>11.181092</td><td>9.333056</td><td>28.123502</td><td>10.745506</td><td>63.328063</td><td>20.450566</td><td>15.501831</td><td>8.434920</td><td>49.988182</td><td>53.852530</td><td>34.231858</td><td>61.643975</td></tr>\n",
+         "<tr><th>27</th><td>3.941512</td><td>10.158678</td><td>26.509085</td><td>4.395224</td><td>11.370998</td><td>2.870912</td><td>3.107285</td><td>3.234124</td><td>4.751860</td><td>8.971102</td><td>18.993424</td><td>10.510988</td><td>8.261983</td><td>10.290881</td><td>16.632808</td><td>1.611005</td></tr>\n",
+         "<tr><th>28</th><td>2.080410</td><td>6.656999</td><td>18.259986</td><td>2.532824</td><td>6.433831</td><td>1.617749</td><td>1.951893</td><td>1.697985</td><td>2.944772</td><td>4.763807</td><td>12.032998</td><td>4.697291</td><td>4.873394</td><td>7.738277</td><td>10.461619</td><td>1.374957</td></tr>\n",
+         "<tr><th>29</th><td>12.270294</td><td>10.462736</td><td>17.751749</td><td>1.906053</td><td>2.322078</td><td>1.808782</td><td>2.543212</td><td>2.295836</td><td>1.231805</td><td>1.188768</td><td>10.370364</td><td>7.140133</td><td>2.849020</td><td>4.938418</td><td>3.887684</td><td>2.684220</td></tr>\n",
+         "<tr><th>30</th><td>2.137901</td><td>2.511828</td><td>11.139828</td><td>2.696163</td><td>4.295602</td><td>4.462590</td><td>16.943052</td><td>8.039776</td><td>17.726827</td><td>20.556685</td><td>29.304363</td><td>27.276327</td><td>34.198703</td><td>40.283161</td><td>49.789732</td><td>20.057123</td></tr>\n",
+         "<tr><th>32</th><td>1.916786</td><td>6.804878</td><td>8.044118</td><td>4.256456</td><td>0.710904</td><td>1.270803</td><td>3.299857</td><td>1.842898</td><td>2.596664</td><td>4.058465</td><td>13.060079</td><td>3.050574</td><td>4.385760</td><td>4.974354</td><td>8.554699</td><td>1.846664</td></tr>\n",
+         "<tr><th>34</th><td>59.526995</td><td>18.241201</td><td>50.925557</td><td>43.806523</td><td>51.430440</td><td>69.373503</td><td>43.467846</td><td>126.274185</td><td>34.428966</td><td>67.276212</td><td>63.054358</td><td>163.466372</td><td>23.613230</td><td>44.510779</td><td>65.316750</td><td>15.552792</td></tr>\n",
+         "<tr><th>36</th><td>4.169038</td><td>0.842416</td><td>9.648808</td><td>14.825420</td><td>17.146499</td><td>20.670608</td><td>23.284316</td><td>32.354702</td><td>22.204138</td><td>7.848328</td><td>16.880103</td><td>16.831147</td><td>6.981711</td><td>14.136708</td><td>11.754849</td><td>6.418252</td></tr>\n",
+         "<tr><th>39</th><td>0.466091</td><td>0.278227</td><td>1.852339</td><td>13.839564</td><td>10.562277</td><td>6.755025</td><td>28.641743</td><td>17.244035</td><td>71.874319</td><td>46.774939</td><td>38.575991</td><td>22.364832</td><td>121.582754</td><td>258.355439</td><td>40.232012</td><td>164.324629</td></tr>\n",
      403.4666920.0804313.3993190.9911051.82721449.0350112.2931496.9524984.0454203.31037112.9647995.5124910.8660110.8012877.3626040.928274
      470.3804021.2160922.6148182.1635131.37124113.11790516.18263312.89504111.4811797.60406342.33698531.82055420.55028925.56781526.99143621.536746
      48121.099206382.128336104.400896188.9802405.04277971.577918135.162762200.053779148.8928508.43634176.961703133.352007132.365248147.41617435.5489920.536769
      492.5427876.4370424.5360641.4439692.3512631.2469442.3834561.3628773.1336593.69722911.4266502.8610434.2626325.8995528.5629583.708028
      510.12114319.62406214.38198512.3250630.0327364.36613820.6286495.11655536.3071318.1603429.8801087.23936613.68869913.48477919.00271121.630734
      568.0966330.38828212.9940973.9416273.31854016.4331000.65085356.7103192.85964144.5446005.20179321.5944470.3491470.69064312.6556620.188675
      570.5858090.0000000.0077610.0022174.2782717.7966740.0022174.3246120.0033263.9609761.9472284.6385810.0011094.4778270.0104210.217073
      599.00223328.17883550.6463500.8845723.3136860.8037516.3369060.8823406.0955573.69837017.8412594.25742414.32217032.0660863.7117665.126367
      623.8525059.9241653.7339982.7782932.6516701.1006493.8501863.1883127.1212897.04846912.54336711.08441618.49791378.72402617.0658634.556586
      710.0265649.2197122.9277562.0052140.7795430.5335154.8373886.06107232.9910636.73609731.25397214.20134122.1417581.37065514.24056626.971698
      740.0058960.0000000.0000003.6813640.0017941.8697770.9523200.01307414.3304281.0420411.4824401.10766514.00102517.9779547.4216872.789797
      772.1736706.89707418.4829792.4101065.3880321.5914891.7821811.8164892.5231383.28563810.9316493.7765963.9944158.6792559.3380321.817819
      851.1083290.0081600.0137870.0016880.0028140.0301070.1052340.0908842.7833430.1541920.1930220.1435007.6435006.82639322.13168341.960889
      1220.0145810.0024302.0627780.0008100.4434990.2004861.3313080.0129611.9623330.3835560.0218710.23288810.9756997.1360876.4981775.916565
      \n", + "
      " + ], + "text/plain": [ + " 1 2 3 4 5 6 \\\n", + "color \n", + "1 8.876186 18.261282 51.187960 6.134054 12.633700 5.477037 \n", + "2 2.989724 1.488277 7.840371 21.469497 5.473986 6.484202 \n", + "3 3.720807 12.429749 28.174160 5.869552 8.563257 5.155427 \n", + "4 5.762003 1.769082 14.549234 30.423671 8.715794 9.015703 \n", + "5 2.803744 2.260464 5.384254 9.709994 6.948961 2.923761 \n", + "6 0.597182 11.346204 19.132968 8.455170 5.968345 7.536916 \n", + "7 2.218029 10.157114 23.773194 5.336961 3.908277 5.085754 \n", + "8 1.575741 10.346771 22.638090 5.676162 3.974290 5.669085 \n", + "9 5.283090 17.775891 46.993908 6.290751 6.759553 4.634946 \n", + "10 3.452875 0.668882 5.485553 15.041078 5.360934 7.015324 \n", + "12 8.246084 15.033800 32.710120 5.181884 12.197754 3.953421 \n", + "13 0.584997 0.062744 2.609888 7.971209 10.116311 5.365451 \n", + "14 10.285632 3.370204 39.660790 31.097938 63.584399 27.368937 \n", + "15 5.791718 22.044100 41.488028 5.091253 5.368434 3.731248 \n", + "16 9.841274 4.174403 39.461849 21.626332 50.364483 46.409553 \n", + "17 8.076575 20.318730 104.826513 77.196962 49.132814 44.755586 \n", + "18 3.117764 2.163362 24.956252 67.610626 87.484668 49.998282 \n", + "19 2.393682 0.055353 2.096665 6.177805 5.615364 4.633590 \n", + "20 3.525765 3.095343 10.604712 28.575365 37.072472 9.322816 \n", + "21 5.743260 4.832874 5.839154 2.948683 9.521599 1.167279 \n", + "22 0.001719 0.000313 0.001563 3.793438 4.291719 4.475625 \n", + "23 3.381265 0.067379 5.667543 2.866886 5.008546 11.233689 \n", + "24 3.950365 2.161189 1.699535 42.691567 6.211487 10.543161 \n", + "25 7.016312 5.696904 7.138981 50.494840 11.181092 9.333056 \n", + "27 3.941512 10.158678 26.509085 4.395224 11.370998 2.870912 \n", + "28 2.080410 6.656999 18.259986 2.532824 6.433831 1.617749 \n", + "29 12.270294 10.462736 17.751749 1.906053 2.322078 1.808782 \n", + "30 2.137901 2.511828 11.139828 2.696163 4.295602 4.462590 \n", + "32 1.916786 6.804878 8.044118 4.256456 0.710904 1.270803 \n", + "34 59.526995 18.241201 50.925557 43.806523 51.430440 69.373503 \n", + "36 4.169038 0.842416 9.648808 14.825420 17.146499 20.670608 \n", + "39 0.466091 0.278227 1.852339 13.839564 10.562277 6.755025 \n", + "40 3.466692 0.080431 3.399319 0.991105 1.827214 49.035011 \n", + "47 0.380402 1.216092 2.614818 2.163513 1.371241 13.117905 \n", + "48 121.099206 382.128336 104.400896 188.980240 5.042779 71.577918 \n", + "49 2.542787 6.437042 4.536064 1.443969 2.351263 1.246944 \n", + "51 0.121143 19.624062 14.381985 12.325063 0.032736 4.366138 \n", + "56 8.096633 0.388282 12.994097 3.941627 3.318540 16.433100 \n", + "57 0.585809 0.000000 0.007761 0.002217 4.278271 7.796674 \n", + "59 9.002233 28.178835 50.646350 0.884572 3.313686 0.803751 \n", + "62 3.852505 9.924165 3.733998 2.778293 2.651670 1.100649 \n", + "71 0.026564 9.219712 2.927756 2.005214 0.779543 0.533515 \n", + "74 0.005896 0.000000 0.000000 3.681364 0.001794 1.869777 \n", + "77 2.173670 6.897074 18.482979 2.410106 5.388032 1.591489 \n", + "85 1.108329 0.008160 0.013787 0.001688 0.002814 0.030107 \n", + "122 0.014581 0.002430 2.062778 0.000810 0.443499 0.200486 \n", + "\n", + " 7 8 9 10 11 12 \\\n", + "color \n", + "1 6.014585 5.304814 6.874191 12.439046 29.793732 22.665219 \n", + "2 27.589086 8.524538 36.255537 13.439261 16.893876 16.096439 \n", + "3 7.075434 5.348820 7.066567 11.246722 28.066567 20.510428 \n", + "4 42.401146 11.751899 45.332861 20.204209 22.506758 23.953855 \n", + "5 7.159026 2.579015 17.632118 6.025057 10.539294 11.215262 \n", + "6 9.468041 6.318810 
9.369510 12.423428 41.204192 28.746239 \n", + "7 7.689371 4.638199 6.644870 9.314052 27.603528 21.129202 \n", + "8 8.581475 4.796650 7.207829 9.250739 28.592135 22.005285 \n", + "9 4.491508 3.912036 6.604947 5.856747 15.012368 18.485785 \n", + "10 15.554694 9.098680 6.356780 10.844457 4.134404 9.089634 \n", + "12 3.386129 8.057811 4.936727 5.820899 18.617374 10.184151 \n", + "13 11.266231 3.404336 10.007112 4.673205 6.870383 5.352833 \n", + "14 47.739140 17.804931 22.058417 14.028805 31.784768 24.160963 \n", + "15 4.200220 3.388468 6.180186 5.273027 14.283289 17.374908 \n", + "16 44.900184 35.059767 52.550276 23.415432 33.419473 87.867238 \n", + "17 94.593648 44.417524 24.696334 65.645870 55.305800 102.823123 \n", + "18 67.801216 55.489691 59.346815 26.625033 27.430611 32.191250 \n", + "19 7.547185 4.919401 8.712569 5.193061 5.590117 6.207371 \n", + "20 33.526178 16.519565 36.144530 21.231744 26.399146 23.185726 \n", + "21 3.499847 8.151808 14.434436 18.306985 25.753370 24.048100 \n", + "22 2.629375 5.270313 2.415313 2.826406 4.040938 8.905938 \n", + "23 5.872638 8.505012 6.408381 10.862284 6.733607 14.372227 \n", + "24 35.040007 1.148406 8.081673 9.018260 10.304449 4.331341 \n", + "25 28.123502 10.745506 63.328063 20.450566 15.501831 8.434920 \n", + "27 3.107285 3.234124 4.751860 8.971102 18.993424 10.510988 \n", + "28 1.951893 1.697985 2.944772 4.763807 12.032998 4.697291 \n", + "29 2.543212 2.295836 1.231805 1.188768 10.370364 7.140133 \n", + "30 16.943052 8.039776 17.726827 20.556685 29.304363 27.276327 \n", + "32 3.299857 1.842898 2.596664 4.058465 13.060079 3.050574 \n", + "34 43.467846 126.274185 34.428966 67.276212 63.054358 163.466372 \n", + "36 23.284316 32.354702 22.204138 7.848328 16.880103 16.831147 \n", + "39 28.641743 17.244035 71.874319 46.774939 38.575991 22.364832 \n", + "40 2.293149 6.952498 4.045420 3.310371 12.964799 5.512491 \n", + "47 16.182633 12.895041 11.481179 7.604063 42.336985 31.820554 \n", + "48 135.162762 200.053779 148.892850 8.436341 76.961703 133.352007 \n", + "49 2.383456 1.362877 3.133659 3.697229 11.426650 2.861043 \n", + "51 20.628649 5.116555 36.307131 8.160342 9.880108 7.239366 \n", + "56 0.650853 56.710319 2.859641 44.544600 5.201793 21.594447 \n", + "57 0.002217 4.324612 0.003326 3.960976 1.947228 4.638581 \n", + "59 6.336906 0.882340 6.095557 3.698370 17.841259 4.257424 \n", + "62 3.850186 3.188312 7.121289 7.048469 12.543367 11.084416 \n", + "71 4.837388 6.061072 32.991063 6.736097 31.253972 14.201341 \n", + "74 0.952320 0.013074 14.330428 1.042041 1.482440 1.107665 \n", + "77 1.782181 1.816489 2.523138 3.285638 10.931649 3.776596 \n", + "85 0.105234 0.090884 2.783343 0.154192 0.193022 0.143500 \n", + "122 1.331308 0.012961 1.962333 0.383556 0.021871 0.232888 \n", + "\n", + " 13 14 15 16 \n", + "color \n", + "1 8.702589 11.272143 18.419851 3.956024 \n", + "2 17.427745 16.574618 21.106419 25.184315 \n", + "3 12.535407 14.223367 19.688710 5.910453 \n", + "4 23.170228 23.510362 27.957910 33.567834 \n", + "5 12.417924 20.254556 11.322110 21.699245 \n", + "6 18.627272 21.547352 27.557788 10.249935 \n", + "7 12.519881 14.149741 16.151058 8.002107 \n", + "8 13.801397 15.090119 17.547433 8.376870 \n", + "9 6.398652 8.899114 15.555289 4.008030 \n", + "10 4.111142 7.369242 10.711437 6.492015 \n", + "12 5.140045 15.910655 12.137778 5.062036 \n", + "13 6.724363 4.729984 6.320257 5.385639 \n", + "14 15.563083 13.804010 13.530706 54.312478 \n", + "15 5.786343 7.882604 14.754825 2.979721 \n", + "16 54.242866 77.074342 25.267238 38.063686 \n", + "17 82.419784 
109.871579 66.819106 110.729977 \n", + "18 10.565028 22.097674 32.502776 19.317209 \n", + "19 5.638585 3.272310 6.886324 4.478871 \n", + "20 12.706944 32.464040 22.168641 25.041196 \n", + "21 17.901501 90.245558 31.771293 5.635110 \n", + "22 0.847812 4.703906 4.397969 4.439375 \n", + "23 3.506984 4.217420 6.728348 4.541988 \n", + "24 101.074037 144.088313 24.463977 12.969290 \n", + "25 49.988182 53.852530 34.231858 61.643975 \n", + "27 8.261983 10.290881 16.632808 1.611005 \n", + "28 4.873394 7.738277 10.461619 1.374957 \n", + "29 2.849020 4.938418 3.887684 2.684220 \n", + "30 34.198703 40.283161 49.789732 20.057123 \n", + "32 4.385760 4.974354 8.554699 1.846664 \n", + "34 23.613230 44.510779 65.316750 15.552792 \n", + "36 6.981711 14.136708 11.754849 6.418252 \n", + "39 121.582754 258.355439 40.232012 164.324629 \n", + "40 0.866011 0.801287 7.362604 0.928274 \n", + "47 20.550289 25.567815 26.991436 21.536746 \n", + "48 132.365248 147.416174 35.548992 0.536769 \n", + "49 4.262632 5.899552 8.562958 3.708028 \n", + "51 13.688699 13.484779 19.002711 21.630734 \n", + "56 0.349147 0.690643 12.655662 0.188675 \n", + "57 0.001109 4.477827 0.010421 0.217073 \n", + "59 14.322170 32.066086 3.711766 5.126367 \n", + "62 18.497913 78.724026 17.065863 4.556586 \n", + "71 22.141758 1.370655 14.240566 26.971698 \n", + "74 14.001025 17.977954 7.421687 2.789797 \n", + "77 3.994415 8.679255 9.338032 1.817819 \n", + "85 7.643500 6.826393 22.131683 41.960889 \n", + "122 10.975699 7.136087 6.498177 5.916565 " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Look at mean frequencies in each sample for each bin\n", + "profile.groupby('color').mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "214731" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Number of unique profiles in profile\n", + "len(-profile.groupby(profile.columns.tolist()[2:],as_index=False).size())" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Make new dataframe - only unique profiles and run bhtsne on it\n", + "new_profile = profile.drop_duplicates(profile.columns.tolist()[2:])" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
      \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
      color12345678910111213141516
      count214731.000000214731.000000214731.000000214731.000000214731.000000214731.000000214731.000000214731.000000214731.000000214731.000000214731.000000214731.000000214731.000000214731.000000214731.000000214731.000000214731.000000
      mean22.1298007.89397414.11113420.94918318.04824613.38751712.81982619.86394616.55127119.72295113.62941121.11511624.00221220.52228129.78130819.06157017.828744
      std20.74971320.23891256.42913231.03987033.74995822.09605221.87661730.79712537.64633229.44624720.32913022.52313840.98666034.66952553.13951020.30620731.855980
      min1.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
      25%5.0000000.0000000.0000003.0000003.0000003.0000002.0000003.0000003.0000004.0000004.0000007.0000006.0000004.0000005.0000007.0000004.000000
      50%17.0000003.0000003.0000009.0000007.0000006.0000006.0000008.0000006.00000010.0000008.00000015.00000012.00000010.00000013.00000012.0000007.000000
      75%32.0000007.00000010.00000028.00000017.00000012.00000011.00000025.00000013.00000023.00000018.00000030.00000027.00000019.00000030.00000027.00000022.000000
      max122.000000922.0000002890.000000965.0000001499.0000001290.0000001281.0000001273.0000001922.0000001199.0000001492.000000925.0000002252.0000001641.0000003422.000000897.0000002124.000000
      \n", + "
      " + ], + "text/plain": [ + " color 1 2 3 \\\n", + "count 214731.000000 214731.000000 214731.000000 214731.000000 \n", + "mean 22.129800 7.893974 14.111134 20.949183 \n", + "std 20.749713 20.238912 56.429132 31.039870 \n", + "min 1.000000 0.000000 0.000000 0.000000 \n", + "25% 5.000000 0.000000 0.000000 3.000000 \n", + "50% 17.000000 3.000000 3.000000 9.000000 \n", + "75% 32.000000 7.000000 10.000000 28.000000 \n", + "max 122.000000 922.000000 2890.000000 965.000000 \n", + "\n", + " 4 5 6 7 \\\n", + "count 214731.000000 214731.000000 214731.000000 214731.000000 \n", + "mean 18.048246 13.387517 12.819826 19.863946 \n", + "std 33.749958 22.096052 21.876617 30.797125 \n", + "min 0.000000 0.000000 0.000000 0.000000 \n", + "25% 3.000000 3.000000 2.000000 3.000000 \n", + "50% 7.000000 6.000000 6.000000 8.000000 \n", + "75% 17.000000 12.000000 11.000000 25.000000 \n", + "max 1499.000000 1290.000000 1281.000000 1273.000000 \n", + "\n", + " 8 9 10 11 \\\n", + "count 214731.000000 214731.000000 214731.000000 214731.000000 \n", + "mean 16.551271 19.722951 13.629411 21.115116 \n", + "std 37.646332 29.446247 20.329130 22.523138 \n", + "min 0.000000 0.000000 0.000000 0.000000 \n", + "25% 3.000000 4.000000 4.000000 7.000000 \n", + "50% 6.000000 10.000000 8.000000 15.000000 \n", + "75% 13.000000 23.000000 18.000000 30.000000 \n", + "max 1922.000000 1199.000000 1492.000000 925.000000 \n", + "\n", + " 12 13 14 15 \\\n", + "count 214731.000000 214731.000000 214731.000000 214731.000000 \n", + "mean 24.002212 20.522281 29.781308 19.061570 \n", + "std 40.986660 34.669525 53.139510 20.306207 \n", + "min 0.000000 0.000000 0.000000 0.000000 \n", + "25% 6.000000 4.000000 5.000000 7.000000 \n", + "50% 12.000000 10.000000 13.000000 12.000000 \n", + "75% 27.000000 19.000000 30.000000 27.000000 \n", + "max 2252.000000 1641.000000 3422.000000 897.000000 \n", + "\n", + " 16 \n", + "count 214731.000000 \n", + "mean 17.828744 \n", + "std 31.855980 \n", + "min 0.000000 \n", + "25% 4.000000 \n", + "50% 7.000000 \n", + "75% 22.000000 \n", + "max 2124.000000 " + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_profile.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "new_profile = new_profile.sample(frac=0.1)\n", + "data = new_profile.as_matrix(columns = new_profile.columns[2:])\n", + "v = (1.0/2000)\n", + "data = data + v\n", + "along_Y = np.apply_along_axis(sum, 0, data)\n", + "data = data/along_Y[None, :]\n", + "along_X = np.apply_along_axis(sum, 1, data)\n", + "data = data/along_X[:, None]\n", + "data = np.log(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Length of values does not match length of index", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mar2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m 
\u001b[0mnew_profile\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"x\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mar2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0mnew_profile\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"y\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mar2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 2427\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2428\u001b[0m \u001b[0;31m# set column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2429\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_set_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2430\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2431\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_setitem_slice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_set_item\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 2493\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2494\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ensure_valid_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2495\u001b[0;31m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sanitize_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2496\u001b[0m \u001b[0mNDFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_set_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2497\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_sanitize_column\u001b[0;34m(self, key, value, broadcast)\u001b[0m\n\u001b[1;32m 2664\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2665\u001b[0m \u001b[0;31m# turn me into an ndarray\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2666\u001b[0;31m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_sanitize_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2667\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m 
\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mIndex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2668\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/lib/python2.7/site-packages/pandas/core/series.pyc\u001b[0m in \u001b[0;36m_sanitize_index\u001b[0;34m(data, index, copy)\u001b[0m\n\u001b[1;32m 2877\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2878\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2879\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Length of values does not match length of '\u001b[0m \u001b[0;34m'index'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2880\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2881\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPeriodIndex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: Length of values does not match length of index" + ] + } + ], + "source": [ + "np.savetxt(\"data2.in\", data, delimiter=\"\\t\")\n", + "\n", + "path_bhtsne = '/Users/tanunia/PycharmProjects/biolab_t-sne/'\n", + "import sys, os\n", + "os.system(path_bhtsne + 'bhtsne.py -p 50 -m 3000 -i data2.in -o data_canopy2.out')\n", + "\n", + "ar2 = np.loadtxt(\"data2.out\", delimiter=\"\\t\")\n", + "len(ar2[:, 0])\n", + "\n", + "new_profile[\"x\"] = ar2[:, 0]\n", + "new_profile[\"y\"] = ar2[:, 1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from matplotlib import pyplot as plt\n", + "\n", + "plt.scatter(new_profile[\"x\"], new_profile[\"y\"], c=new_profile[\"color\"])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "new_profile[\"color\"].value_counts()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/filter_bin.py spades-3.11.1+dfsg/src/projects/mts/scripts/filter_bin.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/filter_bin.py 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/filter_bin.py 2017-09-28 
09:05:14.000000000 +0000 @@ -0,0 +1,17 @@ +#!/usr/bin/env python +from __future__ import print_function + +import sys +from Bio import SeqIO +import common + +def print_usage(): + print("Usage: filter_bin.py <contigs> <binning> <bin>") + +contigs = sys.argv[1] +binning = common.load_annotation(sys.argv[2], False) +bin_name = sys.argv[3] + +for seq in SeqIO.parse(contigs, "fasta"): + if bin_name in binning.get(seq.id, set()): + SeqIO.write(seq, sys.stdout, "fasta") diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/filter_nucmer.py spades-3.11.1+dfsg/src/projects/mts/scripts/filter_nucmer.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/filter_nucmer.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/filter_nucmer.py 2017-09-28 09:05:14.000000000 +0000 @@ -7,48 +7,48 @@ def print_usage(): print("For a sample assembly aligned to a reference, outputs only contigs which were aligned more than <threshold> percent of their length total, and that percent.") - print("Usage: filter_nucmer.py <nucmer_output> <output> <min_length> <threshold>") + print("Usage: filter_nucmer.py <nucmer_output> <min_length> <threshold>") print("Parameters:") print("<min_length> is minimal contig length (default: INF)") print("<threshold> is the minimal total alignment of a contig (0-100%)") -if len(sys.argv) != 5: +if len(sys.argv) != 4: print_usage() sys.exit(1) nucmer_output_fn = sys.argv[1] -output_fn = sys.argv[2] -min_length = int(sys.argv[3]) -threshold = float(sys.argv[4]) +min_length = int(sys.argv[2]) +threshold = float(sys.argv[3]) if not path.exists(nucmer_output_fn): print("File {} doesn't exist".format(nucmer_output_fn)) sys.exit(2) with open(nucmer_output_fn, "r") as nucmer_output: - with open(output_fn, "w") as output: - align_data = re.compile("\d+ \d+ \| \d+ \d+ \| \d+ (\d+) \| [\d.]+ \| [^ ]+ NODE_(\d+)_length_(\d+)") - contig = None - contig_len = 0 - align_len = 0 - def process_contig(): - per = 100.0 * align_len / contig_len - if per > threshold and contig_len >= min_length: - print("{}\t{}\t{}".format(contig, contig_len, per), file=output) - return align_len - return 0 - for line in nucmer_output: - res = align_data.search(line) - if res is None: - continue - new_contig = res.group(2) - if contig != new_contig: - if contig is not None: - process_contig() - contig = new_contig - contig_len = int(res.group(3)) - align_len = 0 - #Assuming that all alignments of the same contig are consequent - align_len += int(res.group(1)) - #Print the last contig separately - process_contig() + align_data = re.compile("\d+ \d+ \| \d+ \d+ \| \d+ (\d+) \| [\d.]+ \| [^ ]+ (NODE_(\d+)_length_(\d+)_.*$)") + contig = None + name = "" + contig_len = 0 + align_len = 0 + def process_contig(): + per = 100.0 * align_len / contig_len + if per > threshold and contig_len >= min_length: + print("{}\t{}\t{}".format(name, contig_len, per)) + return align_len + return 0 + for line in nucmer_output: + res = align_data.search(line) + if res is None: + continue + new_contig = res.group(3) + if contig != new_contig: + if contig is not None: + process_contig() + contig = new_contig + name = res.group(2) + contig_len = int(res.group(4)) + align_len = 0 + #Assuming that all alignments of the same contig are consequent + align_len += int(res.group(1)) + #Print the last contig separately + process_contig() diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/gather_stats.py spades-3.11.1+dfsg/src/projects/mts/scripts/gather_stats.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/gather_stats.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/gather_stats.py 2017-09-28 09:05:14.000000000 +0000 @@ -1,28 +1,102
@@ #!/usr/bin/env python +from __future__ import (print_function) +import numpy as np import pandas from pandas import DataFrame -from math import isnan +import argparse import os.path +from operator import add import sys -quast_dir = sys.argv[1] +parser = argparse.ArgumentParser(description="MTS - Metagenomic Time Series") +parser.add_argument("dir", type=str, help="QUAST output directory") +parser.add_argument("name", type=str, help="Output base name") +parser.add_argument("--problematic", action="store_true", help="Problematic references report") +parser.add_argument("--heatmap", action="store_true", help="Best reference summary table") +args = parser.parse_args() + +# Write summary table with correspondence between bins and their best references res_table = DataFrame(columns=["bin", "ref", "GF", "purity", "NGA50", "misassemblies"]) -gf_table = pandas.read_table(os.path.join(quast_dir, "summary", "TSV", "Genome_fraction_(%).tsv"), dtype=str).set_index("Assemblies") +gf_table = pandas.read_table(os.path.join(args.dir, "summary", "TSV", "Genome_fraction_(%).tsv"), dtype=str).set_index("Assemblies") gfs = gf_table.apply(pandas.to_numeric, errors="coerce") +#Drop zeroes +gfs.fillna(0, inplace=True) +gfs = gfs.loc[gfs.apply(lambda row: row.sum() > 0, axis=1), gfs.apply(lambda col: col.sum() > 0)] + best_ref = gfs.apply(lambda col: col.idxmax()) +with open(args.name + "_best.tsv", "w") as out_file: + best_ref.to_csv(out_file, sep="\t") + for bin, ref in best_ref.iteritems(): if type(ref) is float: row = {"bin": bin, "GF": "-", "ref": "unknown", "purity": "-", "NGA50": "-", "misassemblies": "-"} else: - all_stats = pandas.read_table(os.path.join(quast_dir, "runs_per_reference", ref, "report.tsv"), index_col=0) + all_stats = pandas.read_table(os.path.join(args.dir, "runs_per_reference", ref, "report.tsv"), index_col=0) col = all_stats.get(bin) purity = 100 - float(col["Unaligned length"]) / float(col["Total length"]) * 100 row = {"bin": bin, "GF": col["Genome fraction (%)"], "ref": ref, "purity": "{0:.2f}".format(purity), "NGA50": col["NGA50"], "misassemblies": col["# misassemblies"]} res_table = res_table.append(row, ignore_index=True) -res_table.to_csv(sys.stdout, index=False, sep="\t") +with open(args.name + "_summary.tsv", "w") as out_file: + res_table.to_csv(out_file, index=False, sep="\t") + +# (Optional) Draw GF heatmap +if args.heatmap: + try: + import matplotlib + # Force matplotlib to not use any Xwindows backend. 
+ matplotlib.use('Agg') + import matplotlib.pyplot as plt + import seaborn as sns + from sklearn.cluster.bicluster import SpectralCoclustering + model = SpectralCoclustering(n_clusters=gfs.shape[1], random_state=0) + model.fit(gfs.as_matrix()) + fit_data = gfs.iloc[np.argsort(model.row_labels_), np.argsort(model.column_labels_)] + + plot = sns.heatmap(fit_data, square=True) + fig = plot.get_figure() + fig.savefig(args.name + "_gf.png", bbox_inches="tight") + plt.gcf().clear() + except: + print("Can't import matplotlib and/or seaborn; heatmap drawing will be disabled") + args.heatmap = False + +# (Optional) Write summary for problematic references +if args.problematic: + BAD_THRESHOLD = 90 + ZERO_THRESHOLD = 5 + total_gf_ref = gfs.sum(1) + max_gf_ref = gfs.max(1) + nonzeroes = gfs.applymap(lambda x: x > ZERO_THRESHOLD) + nonzeroes_cnt_ref = nonzeroes.sum(1) + good_refs = list() + with open(args.name + "_problems.txt", "w") as out_file: + for ref, gf in total_gf_ref.iteritems(): + if max_gf_ref[ref] < BAD_THRESHOLD: + if gf < BAD_THRESHOLD: + print(ref, "is underassembled: at least", 100 - gf, "% GF was lost", file=out_file) + else: + print(ref, "is fractured: best bin is only", max_gf_ref[ref], "% GF", file=out_file) + continue + if nonzeroes_cnt_ref[ref] > 1: + print(ref, "is presented in", nonzeroes_cnt_ref[ref], "bins", file=out_file) + continue + good_refs.append(ref) + nonzeroes_cnt_bin = nonzeroes.sum(0) + good_bins = list() + for bin, cnt in nonzeroes_cnt_bin.iteritems(): + if cnt > 1: + print(bin, "is a mixture of", cnt, "references", file=out_file) #TODO: which ones? + else: + good_bins.append(bin) + if args.heatmap: + bad_table = gfs.drop(good_refs, axis=0).drop(good_bins, axis=1) + if bad_table.size: + plot = sns.heatmap(bad_table, square=True) + fig = plot.get_figure() + fig.savefig(args.name + "_bad.png", bbox_inches="tight") diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/gen_samples.py spades-3.11.1+dfsg/src/projects/mts/scripts/gen_samples.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/gen_samples.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/gen_samples.py 2017-09-28 09:05:14.000000000 +0000 @@ -13,12 +13,9 @@ def gen_profile(args): if args.distribution == "uni": - #def rand(): - # return random.randint(0, args.scale) - pass + rand = lambda: random.randint(0, args.scale) elif args.distribution == "exp": - def rand(): - return int(expon.rvs(scale=args.scale)) + rand = lambda: int(expon.rvs(scale=args.scale)) refs = dict(gather_refs(args.references)) if args.dump_desc: @@ -40,39 +37,58 @@ read_len = args.read_length adj_qual = "2" * read_len + "\n" + table = None + with open(args.profile) as input: first_line = True for line in input: params = line.split() - ref_name = params[0] - ref_path = refs.get(ref_name) - if not ref_path: - print("Warning: no reference provided for", ref_name) + ref = params[0] + if not refs.get(ref): + print("Warning: no reference provided for", ref) continue - for i, abundance in enumerate(map(int, params[1:]), start=1): - ref_len = os.stat(ref_path).st_size - reads = ref_len * abundance // read_len - print("Generating", reads, "reads for subsample", i, "of", ref_name) - sample_dir = os.path.join(args.out_dir, "sample" + str(i)) - if first_line: - shutil.rmtree(sample_dir, ignore_errors=True) - subprocess.check_call(["mkdir", "-p", sample_dir]) - - temp_1 = sample_dir + ".tmp.r1.fastq" - temp_2 = sample_dir + ".tmp.r2.fastq" - subprocess.check_call(["wgsim", "-N", str(reads), "-r", "0", "-1", 
str(read_len), "-2", str(read_len), "-d", "300", "-s", "10", "-e", "{:.2f}".format(args.error_rate), "-S", str(i), ref_path, temp_1, temp_2], stdout=subprocess.DEVNULL) - - print("Merging temporary files") - for temp, out in [(temp_1, os.path.join(sample_dir, "r1.fastq")), (temp_2, os.path.join(sample_dir, "r2.fastq"))]: - with open(temp) as input, open(out, "a") as output: - for line in input: - if line.startswith("IIIII"): #TODO: remove this hack - output.write(adj_qual) - else: - output.write(line) - os.remove(temp) + if first_line: + table = [None] * (len(params) - 1) + first_line = False + for i, abundance in enumerate(map(int, params[1:])): + if not table[i]: + table[i] = dict() + table[i][ref] = abundance + + for i, abundancies in enumerate(table, start=1): + print("Generating reads for", i, "sample") + sample_dir = os.path.join(args.out_dir, "sample" + str(i)) + shutil.rmtree(sample_dir, ignore_errors=True) + subprocess.check_call(["mkdir", "-p", sample_dir]) + outs = [None] * 2 + for dir, name in enumerate([os.path.join(sample_dir, "r1.fastq"), os.path.join(sample_dir, "r2.fastq")]): + if args.gzip: + name = name + ".gz" + output = open(name, "wb") + if args.gzip: + output = subprocess.Popen("gzip", stdin=subprocess.PIPE, stdout=output).stdin + outs[dir] = output + + for ref, abundance in abundancies.items(): + ref_path = refs.get(ref) + ref_len = os.stat(ref_path).st_size + reads = ref_len * abundance // read_len + print("Generating", reads, "reads for subsample", i, "of", ref) + + temp_1 = sample_dir + ".tmp.r1.fastq" + temp_2 = sample_dir + ".tmp.r2.fastq" + subprocess.check_call(["wgsim", "-N", str(reads), "-r", "0", "-1", str(read_len), "-2", str(read_len), "-d", "300", "-s", "10", "-e", "{:.2f}".format(args.error_rate), "-S", str(i), ref_path, temp_1, temp_2], stdout=subprocess.DEVNULL) + + print("Merging temporary files") + for temp, output in zip([temp_1, temp_2], outs): + with open(temp) as input: + for line in input: + if line.startswith("IIIII"): #TODO: remove this hack + output.write(adj_qual.encode()) + else: + output.write(line.encode()) + os.remove(temp) print() - first_line = False parser = argparse.ArgumentParser(description="Metagenomic Time Series Simulator") parser.add_argument("--references", "-r", type=str, help="Comma-separated list of references, or a directory with them, or a desc file with reference paths prepended with @", required=True) @@ -89,6 +105,7 @@ gen_samples_args.add_argument("--out-dir", "-o", type=str, help="Output directory. 
Will be totally overwritten!") gen_samples_args.add_argument("--read-length", "-l", type=int, help="Read length", default=100) gen_samples_args.add_argument("--error-rate", "-e", type=float, help="Base error rate", default=0) +gen_samples_args.add_argument("--gzip", "-z", action="store_true", help="Compress resulted files") gen_samples_args.add_argument("profile", type=str, help="File with reference profiles") gen_samples_args.set_defaults(func=gen_samples) diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/make_input.py spades-3.11.1+dfsg/src/projects/mts/scripts/make_input.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/make_input.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/make_input.py 2017-09-28 09:05:14.000000000 +0000 @@ -5,15 +5,20 @@ except ImportError: pass +import re import argparse import os +import os.path import sys +from common import contig_length + parser = argparse.ArgumentParser(description="Binner input formatter") -parser.add_argument("--type", "-t", type=str, help="Binner type (canopy or concoct)", default="canopy") + +parser.add_argument("--type", "-t", choices=["canopy", "concoct", "gattaca", "binsanity"], help="Binner type", default="canopy") +parser.add_argument("--count", "-n", type=int, help="Number of data samples") parser.add_argument("--output", "-o", type=str, help="Output file") -parser.add_argument("--dir", "-d", type=str, help="Directory with profiles (pairs of .id .mpl files)") -parser.add_argument("samples", type=str, nargs="+", help="Sample names") +parser.add_argument("profiles", type=str, help="Groups profiles in .tsv format") args = parser.parse_args() @@ -25,7 +30,7 @@ pass def profile(self, file, contig, profile): - print(contig, profile, file=out) + print(contig, " ".join(profile), file=out) class ConcoctFormatter: def __init__(self): @@ -35,19 +40,35 @@ print("\t".join(["contig"] + ["cov_mean_" + sample for sample in samples]), file=out) def profile(self, file, contig, profile): - print(contig.replace(",", "~"), profile.replace(" ", "\t"), sep="\t", file=out) + print(contig, *profile, sep="\t", file=out) + +class BinSanityFormatter: + def __init__(self): + pass + + def header(self, file, samples): + pass + + def profile(self, file, contig, profile): + print(contig, *profile, sep="\t", file=out) + +class GattacaFormatter: + def __init__(self): + pass + + def header(self, file, samples): + print("\t".join(["contig", "length"] + ["cov_mean_" + sample for sample in samples]), file=out) + + def profile(self, file, contig, profile): + l = contig_length(contig) + print(contig, l, *profile, sep="\t", file=out) -formatters = {"canopy": CanopyFormatter(), "concoct": ConcoctFormatter()} +formatters = {"canopy": CanopyFormatter(), "concoct": ConcoctFormatter(), "gattaca": GattacaFormatter(), "binsanity": BinSanityFormatter()} formatter = formatters[args.type] with open(args.output, "w") as out: - formatter.header(out, args.samples) - for sample in args.samples: - id_file = "{}/{}.id".format(args.dir, sample) - mpl_file = "{}/{}.mpl".format(args.dir, sample) - - print("Processing abundances from %s" % id_file) - - with open(id_file, "r") as ctg_id, open(mpl_file, "r") as ctg_mpl: - for cid, cmpl in zip(ctg_id, ctg_mpl): - formatter.profile(out, sample + "-" + cid.strip(), cmpl.strip()) + formatter.header(out, ["sample" + str(i) for i in range(1, args.count + 1)]) + with open(args.profiles, "r") as input: + for line in input: + params = line.strip().split("\t") + formatter.profile(out, params[0], params[1:]) 
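
[Usage sketch for the reworked make_input.py above: it now reads one combined profiles table, each line holding a contig name followed by its per-sample abundances, instead of the old per-sample .id/.mpl pairs, and the sample count is passed explicitly with --count. A hypothetical invocation for a 16-sample CONCOCT input; the file names here are illustrative, not from the package:

    python make_input.py --type concoct --count 16 --output concoct_input.tsv profiles.tsv

The same profiles file can be reformatted for canopy, gattaca, or binsanity by switching --type; the formatters differ only in the header line and column layout they emit.]
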
diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/make_points_matrix.py spades-3.11.1+dfsg/src/projects/mts/scripts/make_points_matrix.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/make_points_matrix.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/make_points_matrix.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 - -import random - -ctg = open("canopy/contigs.in", "r") -ctr = open("canopy/clusters.out", "r") - -out = open("canopy/points_matrix.csv", "w") - -ctg_to_ctr = dict() - -while True: - s = ctr.readline().strip() - if (s == ""): - break - a = s.split() - ctr_id = a[0][3:] - - if (random.randint(1, 25) == 1): - ctg_to_ctr[a[1]] = ctr_id - -while True: - s = ctg.readline().strip() - if s == "": - break - - a = s.split() - if (a[0] in ctg_to_ctr): - out.write(ctg_to_ctr[a[0]]) - for x in a[1:]: - out.write("," + x) - - out.write("\n") - -out.close() \ No newline at end of file diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/parse_output.py spades-3.11.1+dfsg/src/projects/mts/scripts/parse_output.py --- spades-3.10.1+dfsg/src/projects/mts/scripts/parse_output.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/parse_output.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,58 +0,0 @@ -#!/usr/bin/env python -from __future__ import print_function - -import argparse -import os.path - -argparser = argparse.ArgumentParser(description="Binner output formatter") -argparser.add_argument("--type", "-t", type=str, help="Binner type (canopy or concoct)", default="canopy") -argparser.add_argument("--output", "-o", type=str, help="Output directory with annotations") -argparser.add_argument("input", type=str, help="File with binning info") - -class Parser: - def __init__(self): - self.samples_annotation = dict() - - def add(self, line): - sample_contig, bin_id = self.parse(line) - sample_contig = sample_contig.split('-', 1) - sample = sample_contig[0] - contig = sample_contig[1] - if sample not in self.samples_annotation: - self.samples_annotation[sample] = dict() - - annotation = self.samples_annotation[sample] - if contig not in annotation: - annotation[contig] = list() - - annotation[contig].append(bin_id) - -class CanopyParser(Parser): - def parse(self, line): - annotation_str = line.split() - bin_id = annotation_str[0].strip() - sample_contig = annotation_str[1].strip() - return (sample_contig, bin_id) - -class ConcoctParser(Parser): - def parse(self, line): - annotation_str = line.split(",", 1) - bin_id = annotation_str[1].strip() - sample_contig = annotation_str[0].replace("~", ",") - return (sample_contig, bin_id) - -parsers = {"canopy": CanopyParser(), "concoct": ConcoctParser()} - -args = argparser.parse_args() -parser = parsers[args.type] - -with open(args.input, "r") as input_file: - for line in input_file: - parser.add(line) - -for sample, annotation in parser.samples_annotation.items(): - with open(os.path.join(args.output, sample + ".ann"), "w") as sample_out: - annotation = parser.samples_annotation[sample] - - for contig in annotation: - print(contig, ":", " ".join(annotation[contig]), file=sample_out) diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/pca.R spades-3.11.1+dfsg/src/projects/mts/scripts/pca.R --- spades-3.10.1+dfsg/src/projects/mts/scripts/pca.R 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/pca.R 2017-09-28 09:05:14.000000000 +0000 @@ -5,22 +5,40 @@ unique(table) } -load_binning <- function(canopy_in, canopy_out) { 
- data <- read.table(canopy_in) +# my_normalize<-function(X) { +# X_norm<-X +# #column normalisation +# X_norm<-t(t(X_norm) / ifelse(colSums(X_norm) == 0, 1, colSums(X_norm))) +# #row normalisation +# X_norm<-X_norm / rowSums(X_norm) +# #mean/variance normalisation +# #X_norm<-exprs(standardise(ExpressionSet(X_norm))) +# #my variant of mean/var normalisation +# #X_norm<-t(as.matrix(scale(t(X_norm)))) +# return(X_norm) +# } + +normalize <- function(X) { + return (X / rowSums(X)) +} + +load_binning <- function(profiles_in, binning_out) { + data <- read.table(profiles_in) + data[,-1] <- normalize(data[,-1]) names(data) <- c('contig', sapply(seq(1, dim(data)[2]-1, 1), function(x) {paste('mlt', x, sep='')})) data <- format_ids(data) - binned <- read.table(canopy_out) - names(binned) <- c('clust', 'contig') + binned <- read.table(binning_out) + names(binned) <- c('contig', 'bin') binned <- format_ids(binned) merge(x=data, y=binned, by='contig') } -load_clusters <- function(canopy_in, canopy_out, int_contigs) { - data <- load_binning(canopy_in, canopy_out) +load_clusters <- function(profiles_in, binning_out, int_contigs) { + data <- load_binning(profiles_in, binning_out) if (missing(int_contigs)) { - pieces <- split(data, data$clust)[1:10] - lims <- lapply(pieces, function(x) head(x, 50)) + pieces <- split(data, data$bin)[1:10] + lims <- lapply(pieces, function(x) head(x, 500)) do.call(rbind, c(lims, list(make.row.names=FALSE))) } else { interesting <- read.table(int_contigs) @@ -33,14 +51,14 @@ prcomp(~ ., data = clusters[, grep('mlt', colnames(clusters))]) } -print_clusters <- function(pr, clust, image) { +print_clusters <- function(pr, bin, image) { if (!missing(image)) png(filename=image, width=1024, height=768) - lev <- levels(factor(clust)) + lev <- levels(factor(bin)) cols <- 1:length(lev) #layout(rbind(1,2), heights=c(7,1)) - plot(pr$x, col = as.numeric(clust), xlim=c(-100, 200), ylim=c(-50,50)) - a <- split(as.data.frame(pr$x), clust) + plot(pr$x, col = as.numeric(bin))#, xlim=c(-100, 200), ylim=c(-50,50)) + a <- split(as.data.frame(pr$x), bin) for (l in lev) { x <- a[[l]] text(median(x$PC1), median(x$PC2), l) @@ -56,7 +74,7 @@ "/Volumes/Chihua-Sid/mts/out/70p_3.log") prc_data <- do_prc(clusters) - print_clusters(prc_data, clusters$clust) + print_clusters(prc_data, clusters$bin) prc_data } @@ -72,6 +90,5 @@ clusters <- load_clusters(in_fn, out_fn, cont_fn) } -print(clusters[1:10,]) prc_data <- do_prc(clusters) -print_clusters(prc_data, clusters$clust, image_out) +print_clusters(prc_data, clusters$bin, image_out) diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/ref_stats.sh spades-3.11.1+dfsg/src/projects/mts/scripts/ref_stats.sh --- spades-3.10.1+dfsg/src/projects/mts/scripts/ref_stats.sh 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/scripts/ref_stats.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,63 +0,0 @@ -#/bin/bash - -if [ "$#" -lt 3 ]; then - echo "Usage: identify.sh " - exit 1 -fi - -CTG_LENGTH_THR=5000 -process_cnt=4 -thread_cnt=8 -assemblies_folder=$1 -refs_folder=$2 -#canopy_out=$3 -out_dir=$3 - -folder=$out_dir/metaquast - -export LC_ALL=C -mkdir -p $out_dir - -~/git/quast/metaquast.py --debug -R $refs_folder -o $out_dir/metaquast $assemblies_folder/*.fasta - -#awk ' {print $2,$1} ' $canopy_out | sort > $folder/clusters.txt - -rm -rf $out_dir/ref_summary.txt - -for ref in $refs_folder/*.fasta ; do - echo "Processing reference $ref" - ref_name=$(basename "$ref") - ref_name="${ref_name%.*}" - - rm -rf $out_dir/${ref_name}.ctgs - - #for sample in 
$assemblies_out_dir/sample9.fasta ; do
-    for sample in $assemblies_folder/*.fasta ; do
-        sample_name=$(basename "$sample")
-        sample_name="${sample_name%.*}"
-        aligned=$out_dir/metaquast/quast_corrected_input/${sample_name}_to_${ref_name}.fasta
-        ~/git/ngs_scripts/contig_length_filter.py $CTG_LENGTH_THR $aligned $out_dir/long.fasta.tmp
-        ~/git/ngs_scripts/contig_info.py $out_dir/long.fasta.tmp $out_dir/ctg.info.tmp
-        sed_command="s/ID_/${sample_name}-/g"
-        grep -Eo "ID_.*$" $out_dir/ctg.info.tmp | sed -e $sed_command >> $out_dir/${ref_name}.ctgs
-        rm $out_dir/long.fasta.tmp
-        rm $out_dir/ctg.info.tmp
-    done
-
-    sed 's/$/ '"${ref_name}"'/g' $out_dir/${ref_name}.ctgs >> $out_dir/ref_summary.txt
-
-    #sort $out_dir/${ref_name}.ctgs.tmp > $out_dir/${ref_name}.ctgs
-
-    #join $out_dir/${ref_name}.ctgs $out_dir/clusters.txt | awk ' { print $2 } ' | sort | uniq -c | sort -nr | head -10
-
-    #join $out_dir/${ref_name}.ctgs $out_dir/clusters.txt > $out_dir/join.txt
-    #awk ' { print $2 } ' $out_dir/join.txt | sort | uniq -c | sort -nr | head -10
-
-    report=$out_dir/metaquast/runs_per_reference/$ref_name/report.txt
-
-    grep "Assembly" $report
-    grep "Genome fraction" $report
-done
-
-#rm -rf $out_dir
-echo "Finished"
diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/run_tsne.py spades-3.11.1+dfsg/src/projects/mts/scripts/run_tsne.py
--- spades-3.10.1+dfsg/src/projects/mts/scripts/run_tsne.py 1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/mts/scripts/run_tsne.py 2017-09-28 09:05:14.000000000 +0000
@@ -0,0 +1,239 @@
+__author__ = 'tanunia'
+
+
+import copy
+import argparse
+import numpy as np
+import pandas as pd
+import matplotlib
+matplotlib.use('Agg')
+from matplotlib import pyplot
+from matplotlib.backends.backend_pdf import PdfPages
+import bhtsne
+import matplotlib.patches as mpatches
+import matplotlib.cm as cm
+
+def draw_points(points, names, fig):
+    ax = fig.add_subplot(111)
+    for i in xrange(len(points)):
+        ax.annotate(names[i], xy=points[i], textcoords='data')
+
+def points_mean(x, y):
+    x_mean = sum(x)*1.0/len(x)
+    y_mean = sum(y)*1.0/len(y)
+
+    return x_mean, y_mean
+
+def find_cluster_centers(mp):
+    points = []
+    names = []
+    for c in mp.keys():
+        names.append("Cl-" + str(c))
+        x, y = points_mean(mp[c]['x'], mp[c]['y'])
+        points.append([x, y])
+    return points, names
+
+def divide_by_cluster(names, clusters):
+    res = {}
+    for i in xrange(len(clusters)):
+        c = clusters[i]
+        if not c in res.keys():
+            res[c] = []
+        res[c].append(names[i])
+
+    return res
+
+def take_first_per(clusters, per = 0.1):
+    res = []
+    for c in clusters.keys():
+        for i in xrange(max(int(len(clusters[c])*per), min(10, len(clusters[c])))):
+            res.append(clusters[c][i])
+
+    return res
+
+
+def divide_by_color(x, y, color):
+    res = {}
+
+    for i in xrange(len(color)):
+        c = color[i]
+        if not c in res.keys():
+            res[c] = {}
+            res[c]["x"] = []
+            res[c]["y"] = []
+        res[c]["x"].append(x[i])
+        res[c]["y"].append(y[i])
+
+    return res
+
+def form_points(df):
+    x = df["x"].tolist()
+    y = df["y"].tolist()
+    names = [z[8:18] for z in df[0].tolist()]
+    points = zip(x, y)
+
+    return points, names
+
+import re
+extract_num = re.compile("\d+")
+
+def run_tsne(features_file, colors_file, output_prefix
+             , filter_sample=[]
+             , filter_cluster=[]
+             , lst=[]
+             , draw_per = 1.0
+             , iter = 1000
+             , perplexity = 50):
+    # read data
+    data_df = pd.read_table(features_file, header=None)
+    cluster_colors = pd.read_table(colors_file, header=None)
+    print(data_df.head())
+
+    # make dataframe pretty
+    cluster_colors = cluster_colors.rename(columns={1:'color'})
+    cluster_colors["color"] = [int(extract_num.findall(str(x))[0]) for x in cluster_colors["color"].tolist()]
+    print(cluster_colors.head())
+    #cluster_colors = cluster_colors.rename(columns={0:0})
+
+    # filter by samples
+    if len(filter_sample) > 0:
+        filter1 = []
+        for x in cluster_colors[0].tolist():
+            for it in filter_sample:
+                st = "sample" + it + "-"
+                if x.startswith(st):
+                    filter1.append(x)
+        cluster_colors = cluster_colors[cluster_colors[0].isin(filter1)]
+
+    # filter by percent
+    if draw_per < 1:
+        clusters = divide_by_cluster(cluster_colors[0].tolist(), cluster_colors["color"].tolist())
+        filter2 = take_first_per(clusters, draw_per)
+        s = set(filter2)
+        lst_new = []
+        for n in lst:
+            for x in cluster_colors[0].tolist():
+                if x.startswith(n):
+                    print(x)
+                    lst_new.append(x)
+                    if x not in s:
+                        filter2.append(x)
+        lst = lst_new
+        cluster_colors = cluster_colors[cluster_colors[0].isin(filter2)]
+
+    # merge data
+    mapped = pd.merge(cluster_colors, data_df, on=0)
+
+    # filter by length
+    mapped["length"] = [int(x.split("_")[3]) for x in mapped[0].tolist()]
+    mapped = mapped[mapped["length"] > 2000]
+    print(mapped)
+
+    # normalize like in CONCOCT
+    data = mapped.as_matrix(columns=mapped.columns[2:-1])
+
+    v = (1.0/mapped["length"]).as_matrix()[:, np.newaxis]
+    data = data + v
+    along_Y = np.apply_along_axis(sum, 0, data)
+    data = data/along_Y[None, :]
+    along_X = np.apply_along_axis(sum, 1, data)
+    data = data/along_X[:, None]
+    data = np.log(data)
+    #print(data)
+
+    embedding_array = bhtsne.run_bh_tsne(data, initial_dims=data.shape[1], perplexity=perplexity, max_iter=iter)
+    mapped["x"] = embedding_array[:, 0]
+    mapped["y"] = embedding_array[:, 1]
+
+    # draw result of TSNE on scatter plot
+
+    pp = PdfPages(output_prefix)
+
+    # filter clusters to show
+    fc = filter_cluster
+    if len(fc) > 0:
+        filtered = mapped[mapped["color"].isin(fc)]
+        #mapped = filtered
+    else:
+        filtered = mapped
+
+    fig = pyplot.figure()
+
+    # draw scatter plot
+    color = mapped["color"].tolist()
+    mx_color = max(color)
+    pyplot.scatter(mapped["x"].tolist(), mapped["y"].tolist(), c=[cm.spectral(float(i)/mx_color) for i in color])
+
+    # make a legend for specific clusters
+    # find cluster centers
+    x = filtered["x"].tolist()
+    y = filtered["y"].tolist()
+    mp = divide_by_color(x, y, filtered["color"].tolist())
+    points, names = find_cluster_centers(mp)
+    patches = []
+    dcolors = list(set(color))
+    for c in dcolors:
+        if c in fc and len(fc) < 5:
+            patches.append(mpatches.Patch(color=cm.spectral(float(c)/mx_color), label='C-' + str(c)))
+            pyplot.legend(handles=patches)
+    draw_points(points, names, fig)
+
+    # mark specific points
+    filtered = mapped[mapped[0].isin(lst)]
+    pyplot.scatter(filtered["x"].tolist(), filtered["y"].tolist(), marker="p", edgecolors='black', c=[cm.spectral(float(i)/mx_color) for i in filtered["color"].tolist()])
+
+    pyplot.title('Perp = ' + str(perplexity) + ' Iter = ' + str(iter))
+    pp.savefig()
+
+    pp.close()
+
+def get_points(file):
+    if file == "":
+        return []
+    else:
+        points = []
+        fin = open(file, "r")
+        for l in fin.readlines():
+            points.append(l.strip())
+        fin.close()
+        return points
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("profile", help="profile information (depth)")
+    parser.add_argument("binning", help="file with binning results")
+    parser.add_argument("output", help="path to pdf-file to save graph")
+    parser.add_argument("-p", "--percent", help="sets size of random subsample from profile to run TSNE",
+                        type=float,
+                        default=1.0)
+    parser.add_argument("-i", "--iteration", help="number of TSNE iterations",
+                        type=int,
+                        default=1000)
+    parser.add_argument("-e", "--perplexity", help="TSNE perplexity",
+                        type=float,
+                        default=50)
+    parser.add_argument("-s", "--samples", help="run TSNE only on samples from the list",
+                        nargs='+',
+                        default=[])
+    parser.add_argument("-c", "--clusters", help="draw only clusters from the list",
+                        nargs='+',
+                        default=[])
+    parser.add_argument("-f", "--pointsfile", help="highlight specific points on the graph",
+                        default="")
+
+    args = parser.parse_args()
+    points = get_points(args.pointsfile)
+    run_tsne(args.profile, args.binning, args.output
+             , args.samples
+             , args.clusters
+             , points
+             , args.percent
+             , args.iteration
+             , args.perplexity)
+
+if __name__ == "__main__":
+    main()
diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/split_bins.py spades-3.11.1+dfsg/src/projects/mts/scripts/split_bins.py
--- spades-3.10.1+dfsg/src/projects/mts/scripts/split_bins.py 2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/mts/scripts/split_bins.py 2017-09-28 09:05:14.000000000 +0000
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 
 from __future__ import print_function
 import os
@@ -9,11 +9,14 @@
 import subprocess
 
 def print_usage():
-    print("Usage: split_bins.py <contigs> <annotation> <out_dir>")
+    print("Usage: split_bins.py <contigs> <annotation> <out_dir> [-p]")
 
 contigs = sys.argv[1]
 sample, _ = path.splitext(path.basename(contigs))
 out_dir = sys.argv[3]
+prepend_name = False
+if len(sys.argv) > 4 and sys.argv[4] == "-p":
+    prepend_name = True
 
 binning = common.load_annotation(sys.argv[2], False)
 
@@ -22,9 +25,12 @@
 cags = set()
 for seq in SeqIO.parse(contigs, "fasta"):
     seq_id = seq.id
-    seq.id = sample + "-" + seq_id
-    #seq.id = common.get_id(seq.id, sample)
-    seq.description = ""
+    if prepend_name:
+        seq.id = sample + "-" + seq_id
+        seq.description = ""
     for cag in binning.get(seq_id, []):
-        with open(path.join(out_dir, "{}-{}.fasta".format(sample, cag)), "a") as output:
+        filename = cag + ".fasta"
+        if prepend_name:
+            filename = sample + "-" + filename
+        with open(path.join(out_dir, filename), "a") as output:
            SeqIO.write(seq, output, "fasta")
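The split_bins.py change above makes the sample-name prefix opt-in via the new -p switch. A minimal Python sketch of the resulting naming rule (the helper name is hypothetical; the logic mirrors the patch):

    from os import path

    def bin_output_path(out_dir, sample, cag, prepend_name=False):
        """Mirror of split_bins.py naming: 'BIN.fasta', or 'sample-BIN.fasta' with -p."""
        filename = cag + ".fasta"
        if prepend_name:
            filename = sample + "-" + filename
        return path.join(out_dir, filename)

    # e.g. bin_output_path("binning/bins", "sample1", "BIN3", True)
    #   -> "binning/bins/sample1-BIN3.fasta"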
diff -Nru spades-3.10.1+dfsg/src/projects/mts/scripts/validate.pl spades-3.11.1+dfsg/src/projects/mts/scripts/validate.pl
--- spades-3.10.1+dfsg/src/projects/mts/scripts/validate.pl 1970-01-01 00:00:00.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/mts/scripts/validate.pl 2017-09-28 09:05:14.000000000 +0000
@@ -0,0 +1,404 @@
+#!/usr/bin/perl
+
+use strict;
+use Getopt::Long;
+
+my $tFile = '';
+my $zFile = '';
+my $fFile = '';
+my $help = '';
+my $quiet = '';
+my $outFile = "Conf.csv";
+
+my $USAGE = <<"USAGE";
+Usage: ./Validate.pl --cfile=clustering.csv --sfile=species.csv --ffile=Contigs.fasta
+
+Regular options:
+--ffile Contigs.fasta weight statistics by contig lengths
+--ofile filename -- outputfile for confusion matrix default Conf.csv
+--quiet -- suppress variable names
+--help
+
+USAGE
+
+GetOptions("ffile=s" => \$fFile, "cfile=s" => \$tFile,"sfile=s" => \$zFile, "ofile=s" => \$outFile, 'quiet' => \$quiet, 'help' => \$help) or die("Error in command line arguments\n");
+
+if ($help ne '') {print $USAGE;}
+
+die $USAGE unless ($tFile ne '' && $zFile ne '');
+
+my @t = ();
+my $maxt = 0;
+my $N = 0;
+my $S = 0;
+my %hashCluster = {};
+my @ctotals = ();
+
+
+my @Seq = ();
+my @id = ();
+my %hashLengths = {};
+my $count = 0;
+if($fFile ne ''){
+    open(FILE, $fFile) or die "Can't open $fFile\n";
+
+    my $seq = "";
+
+    while(my $line = <FILE>){
+        chomp($line);
+
+        if($line =~ />(.*)/){
+
+            $id[$count] = $1;
+
+            if($seq ne ""){
+                $Seq[$count - 1] = $seq;
+
+                $seq = "";
+            }
+
+            $count++;
+        }
+        else{
+            $seq .= $line;
+        }
+    }
+    close(FILE);
+
+    $Seq[$count - 1] = $seq;
+    my $stotal = $count;
+
+
+    for(my $i = 0; $i < $stotal; $i++){
+        my $iid = $id[$i];
+        my $slength = length($Seq[$i]);
+        $hashLengths{$iid} = $slength;
+    }
+}
+
+open(FILE, $tFile) or die "Can't open $tFile";
+
+while(my $line = <FILE>){
+    $N++;
+    chomp($line);
+
+    my @tokens = split(/\t/,$line);
+
+    my $name = $tokens[0];
+    my $cluster = $tokens[1];
+    $cluster =~ s/\D//g;
+    $ctotals[$cluster]++;
+    $hashCluster{$name} = $cluster;
+    #print "$name $cluster\n";
+    if($cluster > $maxt){
+        $maxt = $cluster;
+    }
+}
+
+close(FILE);
+
+open(FILE, $zFile) or die "Can't open $zFile";
+
+my $tweight = 0.0;
+my %hashC = {};
+my $count = 0;
+while(my $line = <FILE>){
+    chomp($line);
+    my @tokens = split(/\t/,$line);
+
+    my $name = $tokens[0];
+    if($hashCluster{$name} ne undef){
+        my $tcluster = $hashCluster{$name};
+
+        my $genus = $tokens[3];
+
+        my $l = 0.0;
+        if($hashLengths{$name} ne undef){
+            $l = $hashLengths{$name};
+        }
+        else{
+            $l = 1.0;
+        }
+
+        if($hashC{$genus} eq undef){
+            my @temp = ();
+
+            for(my $i = 0; $i < $maxt + 1; $i++){
+                $temp[$i] = 0;
+            }
+
+            $temp[$tcluster]+=$l;
+
+            $hashC{$genus} = \@temp;
+        }
+        else{
+            @{$hashC{$genus}}[$tcluster]+=$l;
+        }
+        $count++;
+        $tweight += $l;
+        $S++;
+    }
+}
+
+close(FILE);
+
+my $classcount = 0;
+my @cluster = ();
+my $j = 0;
+
+open(OUTFILE,">$outFile") or die "Can't open $outFile\n";
+
+printf OUTFILE "Taxa,";
+
+my @names = ();
+
+for(my $i = 0; $i < $maxt + 1; $i++){
+    if($ctotals[$i] > 0){
+        push(@names,"D$i");
+    }
+}
+
+my $nameString = join(",",@names);
+
+printf OUTFILE "$nameString\n";
+
+my $nJ = 0;
+my $nI = 0;
+
+foreach my $key(sort keys %hashC){
+    if($hashC{$key} ne undef){
+        my @temp = @{$hashC{$key}};
+        my $ptotal = 0;
+        $nI = 0;
+        for(my $i = 0; $i < $maxt + 1; $i++){
+            $ptotal += $temp[$i];
+        }
+
+        if($ptotal > 0){
+            my @vals = ();
+            for(my $i = 0; $i < $maxt + 1; $i++){
+                if($ctotals[$i] > 0){
+                    $cluster[$nI][$nJ] = $temp[$i];
+                    push(@vals,$temp[$i]);
+                    $nI++;
+                }
+            }
+            $nJ++;
+
+            my $cTemp = join(",",@vals);
+
+            print OUTFILE "$key,$cTemp\n";
+        }
+    }
+}
+
+close(OUTFILE);
+
+if($quiet eq ''){
+    printf("N\tM\tTL\tS\tK\tRec.\tPrec.\tNMI\tRand\tAdjRand\n");
+}
+
+my $NK = scalar(@cluster);
+my $NS = scalar(@{$cluster[0]});
+
+printf("%d\t%d\t%.4e\t%d\t%d\t%f\t%f\t%f\t%f\t%f\n",$N,$S,$tweight,$NS,$NK,recall(@cluster),precision(@cluster),nmi(@cluster),randindex(@cluster),adjrandindex(@cluster));
+
+sub precision(){
+    my @cluster = @_;
+    my $nN = 0;
+    my $nC = scalar(@cluster);
+    my $nK = scalar(@{$cluster[0]});
+    my $precision = 0;
+
+    for(my $i = 0; $i < $nC; $i++){
+        my $maxS = 0;
+
+        for(my $j = 0; $j < $nK; $j++){
+            if($cluster[$i][$j] > $maxS){
+                $maxS = $cluster[$i][$j];
+            }
+
+            $nN += $cluster[$i][$j];
+        }
+        $precision += $maxS;
+    }
+
+    return $precision/$nN;
+}
+
+sub recall(){
+    my @cluster = @_;
+    my $nN = 0;
+    my $nC = scalar(@cluster);
+    my $nK = scalar(@{$cluster[0]});
+    my $recall = 0;
+
+    for(my $i = 0; $i < $nK; $i++){
+        my $maxS = 0;
+
+        for(my $j = 0; $j < $nC; $j++){
+            if($cluster[$j][$i] > $maxS){
+                $maxS = $cluster[$j][$i];
+            }
+
+            $nN += $cluster[$j][$i];
+        }
+
+        $recall += $maxS;
+    }
+
+    return $recall/$nN;
+}
+
+sub choose2{
+    my $N = shift;
+    my $ret = $N*($N - 1);
+
+    return int($ret/2);
+}
+
+sub randindex{
+    my @cluster = @_;
+    my @ktotals = ();
+    my @ctotals = ();
+    my $nN = 0;
+    my $nC = scalar(@cluster);
+    my $nK = scalar(@{$cluster[0]});
+    my $cComb = 0;
+    my $kComb = 0;
+    my $kcComb = 0;
+
+    for(my $i = 0; $i < $nK; $i++){
+        $ktotals[$i] = 0;
+        for(my $j = 0; $j < $nC; $j++){
+            $ktotals[$i]+=$cluster[$j][$i];
+        }
+        $nN += $ktotals[$i];
+        $kComb += choose2($ktotals[$i]);
+    }
+
+
+    for(my $i = 0; $i < $nC; $i++){
+        $ctotals[$i] = 0;
+        for(my $j = 0; $j < $nK; $j++){
+            $ctotals[$i]+=$cluster[$i][$j];
+        }
+        $cComb += choose2($ctotals[$i]);
+    }
+
+    for(my $i = 0; $i < $nC; $i++){
+        for(my $j = 0; $j < $nK; $j++){
+            $kcComb += choose2($cluster[$i][$j]);
+        }
+    }
+
+    my $nComb = choose2($nN);
+
+    return ($nComb - $cComb - $kComb + 2*$kcComb)/$nComb;
+
+}
+
+sub adjrandindex{
+    my @cluster = @_;
+    my @ktotals = ();
+    my @ctotals = ();
+    my $nN = 0;
+    my $nC = scalar(@cluster);
+    my $nK = scalar(@{$cluster[0]});
+    my $cComb = 0;
+    my $kComb = 0;
+    my $kcComb = 0;
+
+    for(my $i = 0; $i < $nK; $i++){
+        $ktotals[$i] = 0;
+        for(my $j = 0; $j < $nC; $j++){
+            $ktotals[$i]+=$cluster[$j][$i];
+        }
+        $nN += $ktotals[$i];
+        $kComb += choose2($ktotals[$i]);
+    }
+
+
+    for(my $i = 0; $i < $nC; $i++){
+        $ctotals[$i] = 0;
+        for(my $j = 0; $j < $nK; $j++){
+            $ctotals[$i]+=$cluster[$i][$j];
+        }
+        $cComb += choose2($ctotals[$i]);
+    }
+
+    for(my $i = 0; $i < $nC; $i++){
+        for(my $j = 0; $j < $nK; $j++){
+            $kcComb += choose2($cluster[$i][$j]);
+        }
+    }
+
+    my $nComb = choose2($nN);
+
+    my $temp = ($kComb*$cComb)/$nComb;
+
+    my $ret = $kcComb - $temp;
+
+    return $ret/(0.5*($cComb + $kComb) - $temp);
+
+}
+
+
+
+sub nmi{
+    my @cluster = @_;
+    my @ktotals = ();
+    my @ctotals = ();
+    my $nN = 0;
+    my $nC = scalar(@cluster);
+    my $nK = scalar(@{$cluster[0]});
+    my $HC = 0.0;
+    my $HK = 0.0;
+
+    for(my $i = 0; $i < $nK; $i++){
+        $ktotals[$i] = 0;
+        for(my $j = 0; $j < $nC; $j++){
+            $ktotals[$i]+=$cluster[$j][$i];
+        }
+        $nN += $ktotals[$i];
+    }
+
+
+    for(my $i = 0; $i < $nC; $i++){
+        $ctotals[$i] = 0;
+        for(my $j = 0; $j < $nK; $j++){
+            $ctotals[$i]+=$cluster[$i][$j];
+        }
+        my $dFC = $ctotals[$i]/$nN;
+        if($dFC > 0.0){
+            $HC += -$dFC*log($dFC);
+        }
+    }
+
+    for(my $i = 0; $i < $nK; $i++){
+        my $dFK = $ktotals[$i]/$nN;
+        if($dFK > 0.0){
+            $HK += -$dFK*log($dFK);
+        }
+    }
+
+
+    my $NMI = 0.0;
+
+    for(my $i = 0; $i < $nK; $i++){
+        my $NMII = 0.0;
+
+        for(my $j = 0; $j < $nC; $j++){
+            if($ctotals[$j] >0 && $ktotals[$i] > 0){
+                my $dF = ($nN*$cluster[$j][$i])/($ctotals[$j]*$ktotals[$i]);
+                if($dF > 0.0){
+                    $NMII += $cluster[$j][$i]*log($dF);
+                }
+            }
+        }
+        $NMII /= $nN;
+        $NMI += $NMII;
+    }
+
+    return (2.0*$NMI)/($HC + $HK);
+}
diff -Nru spades-3.10.1+dfsg/src/projects/mts/Snakefile spades-3.11.1+dfsg/src/projects/mts/Snakefile
--- spades-3.10.1+dfsg/src/projects/mts/Snakefile 2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/mts/Snakefile 2017-09-28 09:05:13.000000000 +0000
@@ -1,175 +1,330 @@
 include: "Common.snake"
 
-import os
-import os.path
-
-from scripts.common import dump_dict
+configfile: "config.yaml"
 
-#Path to saves of necessary assembly stage
-SAVES = "K{0}/saves/01_before_repeat_resolution/graph_pack".format(K)
+import os.path
 
 onstart:
-    try:
-        os.mkdir("tmp")
-    except:
-        pass
-    print("Detected", SAMPLE_COUNT, "samples in", IN)
-    print("They form: ", GROUPS)
+    shell("mkdir -p tmp")
 
-# ---- Main pipeline -----------------------------------------------------------
+def final_stage(w):
+    if config["reassembly"]["enabled"]:
+        return ["propagation.done", "binning/bins_total.prof"] #Stop after the bin choosing
+    if config["propagation"]["enabled"]:
+        return "propagation.done" #Stop on the propagation
+    return "binning/{}/binning.done".format(BINNER) #Stop on the preliminary binning
 
 rule all:
-    input: dynamic("reassembly/{cag}.fasta")
+    input: final_stage
     message: "Dataset of {SAMPLE_COUNT} samples from {IN} has been processed."
 
-rule assemble:
+# ---- Assembly ----------------------------------------------------------------
+
+# Assemble with MegaHIT
+rule megahit:
     input: left=left_reads, right=right_reads
-    output: "assembly/{sample}.fasta"
-    #TODO: remove this boilerplate
+    output: "assembly/megahit/{group}.fasta"
+    params: left=lambda w: ",".join(left_reads(w)),
+            right=lambda w: ",".join(right_reads(w)),
+            dir="assembly/megahit/{group}"
+    threads: THREADS
+    log: "assembly/megahit.log"
+    message: "Assembling {wildcards.group} with MegaHIT"
+    shell: "rm -rf {params.dir} &&"
+           " {SOFT}/megahit/megahit -1 {params.left} -2 {params.right}"
+           " -t {threads} -o {params.dir} >{log} 2>&1 &&"
+           " cp {params.dir}/final.contigs.fa {output}"
+
+# Assemble with SPAdes
+rule spades:
+    input: left=left_reads, right=right_reads
+    output: "assembly/spades/{group}.fasta"
     params: left=lambda w: " ".join(expand("-1 {r}", r=left_reads(w))),
             right=lambda w: " ".join(expand("-2 {r}", r=right_reads(w))),
-            dir="assembly/{sample}"
-    log: "assembly/{sample}.log"
+            dir="assembly/spades/{group}", bh=lambda w: "" if is_fastq(w) else "--only-assembler"
    threads: THREADS
-    message: "Assembling {wildcards.sample} with SPAdes"
-    shell: "{SPADES}/spades.py --meta -m 400 -t {threads} {params.left} {params.right}"
-           " -o {params.dir} >{log} 2>&1 && "
+    log: "assembly/{group}.log"
+    message: "Assembling {wildcards.group} with metaSPAdes"
+    shell: "{ASSEMBLER_DIR}/spades.py {params.bh} --meta -m 400 -t {threads}"
+           " {params.left} {params.right}"
+           " --save-gp -o {params.dir} >{log} 2>&1 && "
           "cp {params.dir}/scaffolds.fasta {output}"
 
-rule assemble_all:
-    input: expand("assembly/{sample}.fasta", sample=GROUPS)
-    message: "Assembled all samples"
-
-rule descriptions:
-    output: expand("profile/{sample}.desc", sample=SAMPLES)
-    message: "Generating sample descriptions"
-    run:
-        for sample in SAMPLES:
-            with open("profile/{}.desc".format(sample), "w") as out:
-                wildcards.sample = sample
-                print(left_reads(wildcards), file=out)
-                print(right_reads(wildcards), file=out)
+rule copy_contigs:
+    input: "assembly/{}/{{group}}.fasta".format(ASSEMBLER)
+    output: "assembly/full/{group,(sample|group\d+)}.fasta"
+    shell: "cp {input} {output}"
+
+rule split_contigs:
+    input: "assembly/{}/{{group}}.fasta".format(ASSEMBLER)
+    output: "assembly/splits/{group,(sample|group)\d+}.fasta"
+    message: "Cutting {wildcards.group} into {SPLIT_LENGTH} bp splits"
+    shell: "{SCRIPTS}/cut_fasta.py -c {SPLIT_LENGTH} -o 0 -m {input} > {output}"
+
+#---- Generating profiles/depths -----------------------------------------------
+
+# MetaBAT way
+
+rule bowtie_index:
+    input: "assembly/{frags}/all.fasta"
+    output: "profile/jgi/{frags}/index.done"
+    log: "profile/jgi/{frags}/bowtie-build.log"
+    message: "Building bowtie index"
+    shell: "bowtie2-build {input} profile/jgi/index_{wildcards.frags} >{log} 2>&1 && touch {output}"
+
+rule align:
+    input: left=left_sample_reads, right=right_sample_reads,
+           index="profile/jgi/{frags}/index.done"
+    output: "profile/jgi/{frags}/{sample}.bam"
+    threads: THREADS
+    log: "profile/jgi/{frags}/bowtie-{sample}.log"
+    message: "Aligning {wildcards.sample} with bowtie"
+    shell: "bowtie2 -x profile/jgi/index_{wildcards.frags} -p {threads}"
+           " -1 {input.left} -2 {input.right} 2>{log} | samtools view -bS - > {output}"
+
+rule depth:
+    input: expand("profile/jgi/{{frags}}/{sample}.bam", sample=SAMPLES)
+    output: "profile/jgi/{frags}/depth_metabat.txt"
+    log: "profile/jgi/{frags}/depths.log"
+    message: "Calculating contig depths"
+    shell: "{SOFT}/metabat/jgi_summarize_bam_contig_depths --outputDepth {output} {input} >{log} 2>&1"
+
+rule concoct_depth:
+    input: "profile/jgi/splits/depth_metabat.txt"
+    output: "binning/concoct/profiles_jgi.in"
+    message: "Converting depth file into CONCOCT format"
+    shell: "awk 'NR > 1 {{for(x=1;x<=NF;x++) if(x == 1 || (x >= 4 && x % 2 == 0)) printf \"%s\", $x (x == NF || x == (NF-1) ? \"\\n\":\"\\t\")}}' {input} > {output}"
 
+# Our way
 rule kmc:
-    input: "profile/{sample}.desc"
     output: temp("tmp/{sample}.kmc_pre"), temp("tmp/{sample}.kmc_suf")
-    params: min_mult=2, tmp="tmp/{sample}_kmc", out="tmp/{sample}"
+    params: min_mult=2, tmp="tmp/{sample}_kmc", out="tmp/{sample}",
+            desc="profile/{sample}.desc",
+            left=left_sample_reads, right=right_sample_reads,
+            format=lambda w: "-fq" if is_fastq(w) else "-fa"
     log: "profile/kmc_{sample}.log"
     threads: THREADS
     message: "Running kmc for {wildcards.sample}"
-    shell: "mkdir {params.tmp} && "
-           "{SOFT}/kmc -k{SMALL_K} -t{threads} -ci{params.min_mult} -cs65535"
-           " @{input} {params.out} {params.tmp} >{log} 2>&1 && "
+    shell: "mkdir -p {params.tmp}\n"
+           "echo '{params.left}\n{params.right}' > {params.desc}\n"
+           "{SOFT}/kmc {params.format} -k{PROFILE_K} -t{threads} -ci{params.min_mult}"
+           " -cs65535 @{params.desc} {params.out} {params.tmp} >{log} 2>&1 && "
           "rm -rf {params.tmp}"
 
 rule multiplicities:
     input: expand("tmp/{sample}.kmc_pre", sample=SAMPLES), expand("tmp/{sample}.kmc_suf", sample=SAMPLES)
-    output: "profile/kmers.kmm"
-    params: kmc_files=" ".join(expand("tmp/{sample}", sample=SAMPLES)), out="profile/kmers"
-    log: "profile/kmers.log"
-    message: "Gathering {SMALL_K}-mer multiplicities from all samples"
-    shell: "{BIN}/kmer_multiplicity_counter -n {SAMPLE_COUNT} -k {SMALL_K} -s 3"
+    output: "profile/mts/kmers.kmm"
+    params: kmc_files=" ".join(expand("tmp/{sample}", sample=SAMPLES)), out="profile/mts/kmers"
+    log: "profile/mts/kmers.log"
+    message: "Gathering {PROFILE_K}-mer multiplicities from all samples"
+    shell: "{BIN}/kmer_multiplicity_counter -n {SAMPLE_COUNT} -k {PROFILE_K} -s 2"
           " -f tmp -t {threads} -o {params.out} >{log} 2>&1 && "
           "rm tmp/*.sorted"
 
-rule profile:
-    input: contigs="assembly/{sample,\w+\d+}.fasta", mpl="profile/kmers.kmm"
-    output: id="profile/{sample}.id", mpl="profile/{sample}.mpl", splits= "assembly/{sample}_splits.fasta"
-    log: "profile/{sample}.log"
-    message: "Counting contig abundancies for {wildcards.sample}"
-    shell: "{BIN}/contig_abundance_counter -k {SMALL_K} -w tmp -c {input.contigs}"
-           " -n {SAMPLE_COUNT} -m profile/kmers -o profile/{wildcards.sample}"
-           " -f {output.splits} -l {MIN_CONTIG_LENGTH} >{log} 2>&1"
+rule abundancies:
+    input: contigs="assembly/splits/{group}.fasta", mpl="profile/mts/kmers.kmm"
+    output: "profile/mts/{group}.tsv"
+    log: "profile/mts/{group}.log"
+    message: "Counting contig abundancies for {wildcards.group}"
+    shell: "{BIN}/contig_abundance_counter -k {PROFILE_K} -w tmp -c {input.contigs}"
+           " -n {SAMPLE_COUNT} -m profile/mts/kmers -o {output}"
+           " -l {MIN_CONTIG_LENGTH} >{log} 2>&1"
+
+rule combine_profiles:
+    input: expand("profile/mts/{group}.tsv", group=GROUPS)
+    output: "profile/mts/all.tsv"
+    message: "Combining all profiles"
+    run:
+        shell("rm -f {output}")
+        for sample_ann in input:
+            sample, _ = os.path.splitext(os.path.basename(sample_ann))
+            shell("sed -e 's/^/{sample}-/' {sample_ann} >> {output}")
 
 rule binning_pre:
-    input: expand("profile/{sample}.id", sample=GROUPS)
-    output: "binning/{binner}/profiles.in"
+    input: "profile/mts/all.tsv"
+    output: "binning/{binner}/profiles_mts.in"
     params: " ".join(list(GROUPS.keys()))
+    log: "binning/input.log"
     message: "Preparing input for {wildcards.binner}"
-    shell: "{SCRIPTS}/make_input.py -t {wildcards.binner} -d profile -o {output} {params}"
+    shell: "{SCRIPTS}/make_input.py -n {SAMPLE_COUNT} -t {wildcards.binner}"
+           " -o {output} {input} >{log}"
+
+rule filter_contigs:
+    input: contigs="assembly/splits/all.fasta", profile="binning/{}/profiles_mts.in".format(BINNER)
+    output: contigs="assembly/splits/all_filtered.fasta"
+    message: "Keeping only contigs that have profile information"
+    shell: "cut -f1 < {input.profile} > tmp/names_tmp.txt && sed -i '1d' tmp/names_tmp.txt && "
+           "{SCRIPTS}/contig_name_filter.py {input.contigs} tmp/names_tmp.txt {output.contigs}"
+
+#---- Binning ------------------------------------------------------------------
 
+# Binning with Canopy
 rule canopy:
-    input: "binning/canopy/profiles.in"
-    output: out="binning/canopy/binning.out", prof="binning/canopy/bins.prof"
+    input: "binning/canopy/profiles_{}.in".format(PROFILER)
+    output: out="binning/canopy/binning.out", prof="binning/canopy/bins.prof",
+            flag=touch("binning/canopy/binning.done")
    threads: THREADS
+    log: "binning/canopy.log"
     message: "Running canopy clustering"
-    shell: "{SOFT}/cc.bin -n {threads} -i {input} -o {output.out} -c {output.prof} >binning/canopy/canopy.log 2>&1"
+    shell: "{SOFT}/cc.bin --filter_max_dominant_obs 1 -n {threads}"
+           " -i {input} -o {output.out} -c binning/canopy/canopy_bins.prof >{log} 2>&1 && "
+           "sed 's/CAG/BIN/g' binning/canopy/canopy_bins.prof >{output.prof}"
 
-rule combine_splits:
-    input: expand("assembly/{sample}_splits.fasta", sample=GROUPS)
-    output: "assembly/samples_splits.fasta"
-    message: "Combine splitted contigs"
-    shell: "{SCRIPTS}/combine_contigs.py -r {input} > {output}"
-
-#FIXME what does gt1000 mean?
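The awk one-liner in rule concoct_depth keeps column 1 (the contig name) and every even-numbered column from 4 upwards — the per-sample mean depths — while dropping contigLen, totalAvgDepth and the per-sample variance columns. A rough Python equivalent, assuming the standard jgi_summarize_bam_contig_depths layout (contigName, contigLen, totalAvgDepth, then mean/variance column pairs per sample):

    import csv

    def metabat_depth_to_concoct(depth_tsv, out_tsv):
        """Reduce a MetaBAT depth table to the coverage matrix CONCOCT expects,
        mirroring the awk command in rule concoct_depth (header row skipped)."""
        with open(depth_tsv) as src, open(out_tsv, "w") as dst:
            reader = csv.reader(src, delimiter="\t")
            writer = csv.writer(dst, delimiter="\t")
            next(reader)  # skip header, like awk's NR > 1
            for row in reader:
                kept = [row[0]] + [v for i, v in enumerate(row, start=1)
                                   if i >= 4 and i % 2 == 0]
                writer.writerow(kept)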
+# Binning with CONCOCT
 rule concoct:
-    input: contigs=rules.combine_splits.output[0], profiles="binning/concoct/profiles.in"
-    output: out="binning/concoct/clustering_gt1000.csv"
-    params: "binning/concoct"
+    input: contigs="assembly/splits/all.fasta", profiles="binning/concoct/profiles_{}.in".format(PROFILER)
+    output: "binning/concoct/binning.out"
+    params: max_clusters=40, out="binning/concoct"
+    threads: THREADS
+    log: "binning/concoct.log"
     message: "Running CONCOCT clustering"
-    shell: "mkdir -p {params} && "
-           "set +u; source activate concoct_env; set -u && "
-           "concoct --composition_file {input.contigs} --coverage_file {input.profiles} -b {params}"
+    shell: "set +u; source activate py27; set -u\n"
+           "concoct -c {params.max_clusters} --composition_file {input.contigs}"
+           " --coverage_file {input.profiles} --length_threshold {MIN_CONTIG_LENGTH}"
+           " -b {params.out} >{log} 2>&1 && "
+           "cp binning/concoct/clustering_gt{MIN_CONTIG_LENGTH}.csv {output}"
+
+rule extract_bins:
+    input: "assembly/splits/all.fasta", "binning/annotation/all.ann"
+    output: touch("binning/concoct/binning.done")
+    message: "Extracting CONCOCT bins"
+    shell: "mkdir -p binning/bins && {SCRIPTS}/split_bins.py {input} binning/bins"
+
+# Binning with MetaBAT
+rule metabat:
+    input: contigs="assembly/full/all.fasta", profiles="profile/jgi/full/depth_metabat.txt"
+    output: flag=touch("binning/metabat/binning.done"),
+            out="binning/metabat/binning.out"
+    threads: THREADS
+    params: "binning/metabat/cluster"
+    log: "binning/metabat.log"
+    message: "Running MetaBAT clustering"
+    shell: "{SOFT}/metabat/metabat -t {threads} -m {MIN_CONTIG_LENGTH} "
+           " --minContigByCorr {MIN_CONTIG_LENGTH} --saveCls"
+           " -i {input.contigs} -a {input.profiles}"
+           " -o {params} > {log} && "
+           "sed 's/\t/,/g' {params} > {output.out} && mkdir -p binning/bins && "
+           "for file in binning/metabat/*.fa ; do bin=${{file##*/}}; mv $file binning/bins/${{bin%.*}}.fasta; done"
+
+# Binning with MAXBIN2
+rule maxbin:
+    input: contigs="assembly/splits/all.fasta", profiles="binning/maxbin/profiles_{}.in".format(PROFILER)
+    output: "binning/maxbin/binning.out"
+    threads: THREADS
+    params: out="binning/maxbin/cluster"
+    log: "binning/maxbin.log"
+    message: "Running MaxBin2 clustering"
+    shell: "perl {SOFT}/MaxBin/run_MaxBin.pl -thread {threads} -min_contig_length {MIN_CONTIG_LENGTH} "
+           " -contig {input.contigs} -abund {input.profiles}"
+           " -out {params.out} > {log}"
+           "&& {SCRIPTS}/make_maxbincsv.py -o {output} {params.out}"
+
+# Binning with GATTACA
+# conda create -n py27 python=2.7.9 numpy scipy scikit-learn anaconda
+# conda install -c bioconda pysam=0.11.2.2
+rule gattaca:
+    input: contigs="assembly/splits/all_filtered.fasta", profiles="binning/gattaca/profiles_{}.in".format(PROFILER)
+    output: "binning/gattaca/binning.out"
+    threads: THREADS
+    log: "binning/gattaca.log"
+    message: "Running GATTACA clustering"
+    shell: "set +u; source activate py27; set -u\n"
+           "python {SOFT}/gattaca/src/python/gattaca.py cluster --contigs {input.contigs}"
+           " --coverage {input.profiles} --algorithm dirichlet --clusters {output} >{log} 2>&1"
+
+# Binning with BinSanity
+rule binsanity:
+    input: contigs="assembly/splits/all_filtered.fasta", profiles="binning/binsanity/profiles_{}.in".format(PROFILER)
+    output: "binning/binsanity/binning.out"
+    threads: THREADS
+    log: "binning/binsanity.log"
+    message: "Running BinSanity clustering"
+    shell: "python2 /home/tdvorkina/binsanity/src/BinSanity/test-scripts/Binsanity-lc "
+           " -f ./ -l {input.contigs} -c {input.profiles} -o binning/binsanity/BINSANITY-RESULTS > {log} && "
+           "{SCRIPTS}/clusters2csv.py binning/binsanity/BINSANITY-RESULTS/KMEAN-BINS {output} && mv Binsanity-log.txt binning/ "
+
+# Postprocessing
+rule bin_profiles:
+    input: "profile/{}/all.tsv".format(PROFILER), "binning/{}/unified_binning.tsv".format(BINNER)
+    output: "binning/{}/bins.prof".format(BINNER)
+    message: "Deriving bin profiles"
+    shell: "{SCRIPTS}/bin_profiles.py {input} > {output}"
+
+ruleorder: canopy > bin_profiles
+
+rule binning_format:
+    input: "binning/{}/binning.out".format(BINNER)
+    output: "binning/{}/unified_binning.tsv".format(BINNER)
+    message: "Making unified binning results"
+    shell: "{SCRIPTS}/convert_output.py -t {BINNER} -o {output} {input} &&"
+           "cp {output} binning/binning.tsv" #Additional table for stats
+
+rule annotate:
+    input: "binning/{}/unified_binning.tsv".format(BINNER)
+    output: expand("binning/annotation/{sample}.ann", sample=GROUPS)
+    params: "binning/annotation/"
     message: "Preparing raw annotations"
-    shell: "{SCRIPTS}/parse_output.py -t {BINNER} -o annotation {input}"
+    run:
+        samples_annotation = dict()
+        #Load the whole annotation: {sample: [bins]}
+        with open(input[0]) as input_file:
+            for line in input_file:
+                annotation_str = line.split("\t", 1)
+                bin_id = annotation_str[1].strip()
+                sample_contig = annotation_str[0].split('-', 1)
+                if len(sample_contig) > 1:
+                    sample = sample_contig[0]
+                    contig = sample_contig[1]
+                else: #Backward compatibility with old alternative pipeline runs
+                    sample = "group1"
+                    contig = sample_contig[0]
+                annotation = samples_annotation.setdefault(sample, dict())
+                if contig not in annotation:
+                    annotation[contig] = [bin_id]
+                else:
+                    annotation[contig].append(bin_id)
+
+        #Serialize it in the propagator format
+        for sample, annotation in samples_annotation.items():
+            with open(os.path.join(params[0], sample + ".ann"), "w") as sample_out:
+                for contig in samples_annotation[sample]:
+                    print(contig, "\t", " ".join(annotation[contig]), sep="", file=sample_out)
 
-#Post-clustering pipeline
-rule read_binning:
-    input: contigs="assembly/{sample}.fasta", ann="annotation/{sample}.ann",
-           left=left_reads, right=right_reads
-    output: "propagation/{sample}_edges.ann"
-    params: saves=os.path.join("assembly/{sample}/", SAVES),
-            splits="assembly/{sample}_splits.fasta",
-            out="propagation/{sample}_edges",
-            group=lambda wildcards: GROUPS[wildcards.sample]
-            #left=" ".join(input.left), right=" ".join(input.right)
-    log: "binning/{sample}.log"
-    message: "Propagating annotation & binning reads for {wildcards.sample}"
-    shell:
-        "{BIN}/prop_binning -k {K} -s {params.saves} -c {input.contigs}"
-        " -n {params.group} -l {input.left} -r {input.right}"
-        " -a {input.ann} -f {params.splits} -o binning -d {params.out} >{log} 2>&1"
-#TODO: bin profiles for CONCOCT
+#---- Post-clustering pipeline -------------------------------------------------
+
+# Propagation stage
+#Path to saves of necessary assembly stage
+SAVES = "K{0}/saves/01_before_repeat_resolution/graph_pack".format(ASSEMBLY_K)
+
+rule prop_binning:
+    input: contigs="assembly/spades/{group}.fasta", splits="assembly/splits/{group}.fasta",
+           ann="binning/annotation/{group}.ann", left=left_reads, right=right_reads,
+           bins="binning/{}/filtered_bins.tsv".format(BINNER)
+    output: ann="propagation/annotation/{group}.ann", edges="propagation/edges/{group}.fasta"
+    params: saves=os.path.join("assembly/spades/{group}/", SAVES),
+            samples=lambda wildcards: " ".join(GROUPS[wildcards.group])
+    log: "binning/{group}.log"
+    message: "Propagating annotation & binning reads for {wildcards.group}"
+    shell: "{BIN}/prop_binning -k {ASSEMBLY_K} -s {params.saves} -c {input.contigs} -b {input.bins}"
+           " -n {params.samples} -l {input.left} -r {input.right} -t {MIN_CONTIG_LENGTH}"
+           " -a {input.ann} -f {input.splits} -o binning -p {output.ann} -e {output.edges} >{log} 2>&1"
+
+rule prop_all:
+    input: expand("propagation/annotation/{group}.ann", group=GROUPS)
+    output: touch("propagation.done")
+    message: "Finished propagation of all annotations."
+
+rule choose_bins:
+    input: "binning/{}/unified_binning.tsv".format(BINNER)
+    output: "binning/{}/filtered_bins.tsv".format(BINNER)
+    message: "Filtering out small bins"
+    shell: "{SCRIPTS}/choose_bins.py {input} >{output} 2>&1"
+
 rule choose_samples:
-    input: binned=expand("propagation/{sample}_edges.ann", sample=GROUPS),
-           prof=rules.canopy.output.prof
-    output: dynamic("binning/{cag}/left.fastq"),
-            dynamic("binning/{cag}/right.fastq")
+    input: "binning/{}/bins.prof".format(BINNER), "binning/{}/filtered_bins.tsv".format(BINNER)
+    output: "binning/bins_total.prof"
     log: "binning/choose_samples.log"
-    message: "Choosing samples for all CAGs"
-    shell: "{SCRIPTS}/choose_samples.py {input.prof} binning/ >{log} 2>&1"
-
-rule reassembly_config:
-    input: "binning/{cag}/left.fastq"
-    output: "reassembly/{cag}.yaml"
-    message: "Generated config file for reassembly of {wildcards.cag}"
-    run:
-        with open(output[0], "w") as outfile:
-            conf = {"k": SMALL_K, "sample_cnt": SAMPLE_COUNT,
-                    "kmer_mult": str(rules.multiplicities.params.out),
-                    "bin": wildcards.cag, "bin_prof": str(rules.canopy.output.prof),
-                    "edges_sqn": "profile/{}_edges.fasta".format(wildcards.cag),
-                    "edges_mpl": "profile/{}_edges.mpl".format(wildcards.cag),
-                    "edge_fragments_mpl": "profile/{}_edges_frag.mpl".format(wildcards.cag),
-                    "frag_size": 10000, "min_len": 100}
-            dump_dict(conf, outfile)
-
-rule reassemble:
-    input: left="binning/{cag}/left.fastq", right="binning/{cag}/right.fastq",
-           config="reassembly/{cag}.yaml"
-    output: "reassembly/{cag}.fasta"
-    params: "reassembly/reassembly_{cag}"
-    log: "reassembly/reassembly_{cag}.log"
-    threads: THREADS
-    message: "Reassembling reads for {wildcards.cag}"
-    shell: "{SPADES_REASSEMBLY}/spades.py --meta -t {threads}"
-           " --pe1-1 {input.left} --pe1-2 {input.right} --pe1-ff"
-           " -o {params} --series-analysis {input.config} >{log} 2>&1 && "
-           "cp {params}/scaffolds.fasta {output}"
+    message: "Choosing bins for reassembly and samples for them"
+    shell: "rm -f binning/*.info && rm -rf binning/excluded && "
+           "{SCRIPTS}/choose_samples.py {input} {output} binning >{log} 2>&1"
diff -Nru spades-3.10.1+dfsg/src/projects/mts/stats.cpp spades-3.11.1+dfsg/src/projects/mts/stats.cpp
--- spades-3.10.1+dfsg/src/projects/mts/stats.cpp 2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/mts/stats.cpp 2017-09-28 09:05:13.000000000 +0000
@@ -7,8 +7,8 @@
 #include "pipeline/graphio.hpp"
 #include "pipeline/graph_pack.hpp"
-#include "utils/simple_tools.hpp"
-#include "utils/path_helper.hpp"
+#include "utils/stl_utils.hpp"
+#include "utils/filesystem/path_helper.hpp"
 #include "utils/logger/log_writers.hpp"
 #include "math/xmath.h"
 #include 
@@ -34,7 +34,7 @@
 }
 
 io::SingleRead ReadGenome(const string& genome_path) {
-    path::CheckFileExistenceFATAL(genome_path);
+    fs::CheckFileExistenceFATAL(genome_path);
     auto genome_stream_ptr = std::make_shared(genome_path);
     return ReadSequence(*genome_stream_ptr);
 }
@@ -122,7 +122,7 @@
           << "Propagated edges\tPropagated length" << endl;
 
     for (const auto genome_path : genomes_path) {
-        auto ref_name = path::basename(genome_path);
+        auto ref_name = fs::basename(genome_path);
         io::SingleRead genome = ReadGenome(genome_path);
 
         visualization::position_filler::FillPos(gp, genome_path, "", true);
diff -Nru spades-3.10.1+dfsg/src/projects/mts/Stats.snake spades-3.11.1+dfsg/src/projects/mts/Stats.snake
--- spades-3.10.1+dfsg/src/projects/mts/Stats.snake 2017-02-28 14:55:38.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/mts/Stats.snake 2017-09-28 09:05:13.000000000 +0000
@@ -6,7 +6,7 @@
 import pandas
 from pandas import DataFrame
 
-from scripts.common import gather_refs, dump_dict
+from scripts.common import gather_refs
 
 #Additional config parameters
 try:
@@ -18,215 +18,189 @@
     METAQUAST = "metaquast"
 
 #Autodetect bins
-CAGS, = glob_wildcards("binning/{cag,CAG\d+}/left.fastq")
-CAGS.sort()
+if config["reassembly"]["enabled"]:
+    BINS, = glob_wildcards("binning/{{bin,{}}}.info".format(NAME_TEMPLATE))
+else:
+    BINS, = glob_wildcards("binning/bins/{{bin,{}}}.fasta".format(NAME_TEMPLATE))
+if not BINS:
+    raise WorkflowError("No bins detected!")
+BINS.sort()
 
-CAG_EDGES = [c + "_edges" for c in CAGS]
+#Additional config parameters
+try:
+    QUAST_DIR = config["stats"]["quast"]
+    QUAST = os.path.join(QUAST_DIR, "quast.py")
+    METAQUAST = os.path.join(QUAST_DIR, "metaquast.py")
+except KeyError:
+    QUAST = "quast"
+    METAQUAST = "metaquast"
 
 #Detect references
-REFS = dict(gather_refs(config.get("REFS", [])))
-ALL_REFS = ",".join(path for path in REFS.values())
-
-FRAGMENT_NAMES_BY_TYPE = {"reassembly": CAG_EDGES,
-                          "initial_assembly": list(GROUPS.keys())}
+REFS = dict(gather_refs(config["stats"].get("refs", [])))
+REFS_STR = ",".join(path for path in sorted(REFS.values()))
 
 def ref_path(wildcards):
     return REFS[wildcards.ref]
 
 onstart:
-    try:
-        os.mkdir("tmp")
-    except:
-        pass
     print("Detected", SAMPLE_COUNT, "samples in", IN)
-    if CAGS:
-        print("Detected good (abundant) CAGs:", " ".join(CAGS))
+    if BINS:
+        print("Detected good (abundant) bins:", " ".join(BINS))
     if REFS:
         print("Detected references:", " ".join(REFS))
 
+enabled_stats = []
+stats_targets = {"checkm": "checkm_{}.tsv",
+                 "pca": "pca_{}.png",
+                 "tsne": "tsne_{}.png",
+                 "f1": "f1_{}.txt"}
+if config["stats"].get("refs"):
+    stats_targets["gf"] = "gf_{}.tsv"
+    stats_targets["summary"] = "{}_summary.tsv"
+for stage in ["binning", "propagation", "reassembly"]:
+    stage_stats = config["stats"].get(stage, [])
+    if stage_stats == "all":
+        stats = stats_targets.keys()
+    else:
+        stats = stage_stats + config["stats"].get("all", [])
+    enabled_stats.extend(os.path.join("stats/summary", stats_targets[st].format(stage)) for st in stats)
+
+misc_targets = {"ref_profiles": "ref_profiles.tsv"}
+enabled_stats.extend(os.path.join("stats/summary", misc_targets[st].format(stage)) for st in config["stats"].get("misc", []))
+
+rule all_stats:
+    input: enabled_stats
+    message: "Gathered stats: {input}"
+
 #===============================================================================
 #---- Statistics section -------------------------------------------------------
 #===============================================================================
 
-#---- Single alignments for samples per reference -------------------------------
-#TODO: use alignments from meta version instead
-rule quast_all_samples:
-    input: ref_fn=ref_path, contigs=expand("assembly/{sample}.fasta", sample=GROUPS)
-    output: summary_tsv="stats/summary/q_{ref}.tsv", report="stats/initial_assembly/{ref}/report.txt"
-    params: "stats/initial_assembly/{ref}"
-    log: "stats/initial_assembly/{ref}/quast.log"
-    threads: THREADS
-    message: "Aligning all samples on {wildcards.ref}"
-    shell: "{QUAST} -t {threads} -R {input.ref_fn} {input.contigs} -o {params} >/dev/null 2>&1 && "
-           "cp {params}/report.tsv {output.summary_tsv}"
-
-rule quast_all_reassemblies:
-    input: ref=ref_path, fragments=expand("profile/{cag_edges}.fasta", cag_edges=CAG_EDGES)
-    output: "stats/reassembly/{ref}/report.txt"
-    params: "stats/reassembly/{ref}"
-    log: "stats/reassembly/{ref}/quast.log"
-    threads: THREADS
-    message: "Aligning all samples on {wildcards.ref}"
-    shell: "{QUAST} -t {threads} -R {input.ref} {input.fragments} -o {params} >/dev/null 2>&1 && "
-           "cp {params}/report.tsv {output}"
-
-#---- Contigs of interest ------------------------------------------------------
+#---- F1 stats ----------------------------------------------------------------
 rule filter_ref_alignments:
-    input: "{path}/report.txt"
-    output: "{path}/{fragments}.info"
-    params: "{path}/contigs_reports/nucmer_output/{fragments}.coords.filtered"
-    shell: "if [ -f {params} ] ; then {SCRIPTS}/filter_nucmer.py {params} {output} {MIN_CONTIG_LENGTH} 70 ; else touch {output} ; fi"
-
-#---- GF of combined sample ----------------------------------------------------
-#rule combine_filtered:
-#    input: contigs=expand("assembly/{sample}.fasta", sample=GROUPS),
-#           filters=expand("stats/{{ref}}/{sample}.cont", sample=GROUPS)
-#    output: "stats/{ref}.fasta"
-#    message: "Gathering all interesting contigs for {wildcards.ref} into a single assembly"
-#    shell: "{SCRIPTS}/filter_contigs.py {SAMPLE_COUNT} {output} {input.contigs} {input.filters}"
-
-rule quast_combined:
-    input: ref=ref_path, contigs="stats/{ref}.fasta"
-    output: "stats/q_{ref}_all/report.tsv"
-    params: "stats/q_{ref}_all"
-    log: "stats/q_{ref}_all.log"
-    threads: THREADS
-    message: "Aligning combined sample on {wildcards.ref}"
-    shell: "{QUAST} -t {threads} -R {input.ref} {input.contigs} -o {params} >{log} 2>&1"
-
-# Run this
-rule quast_combined_all:
-    input: expand("stats/q_{ref}_all/report.tsv", ref=REFS)
-    message: "Calculated QUAST metrics on all combined samples"
-
-#---- Bins of interest ---------------------------------------------------------
-rule int_bins:
-    input: "annotation/{sample}.ann", "stats/{ref}/{sample}.info"
-    output: "stats/{ref}/{sample}.bin"
-    message: "Filtering interesting bins for {wildcards.sample} aligned to {wildcards.ref}"
-    shell: "{SCRIPTS}/filter_bins.py {input} > {output}"
-
-rule int_bins_all_samples:
-    input: expand("stats/{{ref}}/{sample}.bin", sample=GROUPS)
-    output: "stats/{ref}/total.bin"
-    message: "Gathering interesting bins for {wildcards.ref} from all samples"
-    run:
-        bins = set()
-        for in_fn in input:
-            with open(in_fn) as infile:
-                for line in infile:
-                    bins.add(line)
-        with open(output[0], "w") as outfile:
-            for bin in bins:
-                print(bin, file=outfile)
-
-# Run this
-rule int_bins_all:
-    input: expand("stats/{ref}/total.bin", ref=REFS)
-    message: "Gathered all interesting bins"
+    input: "stats/summary/gf_{stage}.tsv"
+    output: "stats/q_{stage}/{ref}.tsv"
+    params: " ".join(BINS)
+    message: "Filtering alignments from {wildcards.stage} onto {wildcards.ref}"
+    shell: "rm -f {output}\n"
+           "for bin in {params}\ndo\n"
+           "  file=stats/q_{wildcards.stage}/runs_per_reference/{wildcards.ref}/contigs_reports/nucmer_output/$bin.coords.filtered\n"
+           "  if [ -f $file ] ; then\n"
+           "    {SCRIPTS}/filter_nucmer.py $file {MIN_CONTIG_LENGTH} 70 | awk -v b=$bin '{{print b \"-\" $0}}' >> {output}\n"
+           "  fi\n"
+           "done\n"
+           "if [ -n {output} ] ; then touch {output} ; fi\n"
 
-#---- GF per bin per reference -------------------------------------------------
-#Helper formatters for determining input files from different stages
-PROP = {"prelim": ("assembly/{}_splits.fasta", "annotation/{}.ann"),
-        "prop": ("propagation/{}_edges.fasta", "propagation/{}_edges.ann")}
-
-#TODO: split into different directories per sample
-rule split_bins:
-    input: lambda w: PROP[w.prop][0].format(w.sample),
-           lambda w: PROP[w.prop][1].format(w.sample)
-    output: touch("binning/{prop}/{sample}.log")
-    log: "binning/{prop}/split_{sample}.log"
-    params: "binning/{prop}"
-    message: "Splitting assembly of {wildcards.sample} between {wildcards.prop} bins"
-    shell: "{SCRIPTS}/split_bins.py {input} {params} >{log}"
-
-rule cat_binned_contigs:
-    input: expand("binning/{{prop}}/{sample}.log", sample=SAMPLES)
-    output: "binning/{prop}/{cag,CAG\d+}.fasta"
-    params: "`ls binning/{prop}/*-{cag}.fasta`"
-    message: "Combine binned contigs ({wildcards.prop}) for {wildcards.cag}"
-    shell: "cat {params} > {output}"
-
-#Two helpers for determining dependencies of QUAST targets.
-#For split contigs and reassemblies, we need only corresponding FASTA.
-#For combined contigs, we need to glue their split pieces first.
-def stats_input(wildcards):
-    if wildcards.stage == "reassembly":
-        return expand("reassembly/{cag}.fasta", cag=CAGS)
-    w_bin, w_prop = wildcards.stage.split("_", 2)
-    if w_bin == "split":
-        return expand("binning/{prop}/{sample}.log", prop=w_prop, sample=GROUPS)
-    elif w_bin == "bin":
-        return expand("binning/{prop}/{cag}.fasta", prop=w_prop, cag=CAGS)
-
-def stats_data(wildcards):
-    if wildcards.stage == "reassembly":
-        return "`ls reassembly/CAG*.fasta`"
-    w_bin, w_prop = wildcards.stage.split("_", 2)
-    masks = {"bin": "CAG*", "split": "*-CAG*"}
-    return "`ls binning/{}/{}.fasta`".format(w_prop, masks[w_bin])
+rule combine_refs_info:
+    input: expand("stats/q_{{stage}}/{ref}.tsv", ref=REFS)
+    output: "stats/q_{stage}/total.tsv"
+    params: " ".join(REFS)
+    message: "Combining good contigs of {wildcards.stage} for all bins"
+    shell: "rm -f {output}\n"
+           "for ref in {params}\ndo\n"
+           "  file=stats/q_{wildcards.stage}/$ref.tsv\n"
+           "  if [ -f $file ] ; then\n"
+           "    awk -v r=$ref '{{print $0 \"\\t\" r}}' $file >> {output}\n"
+           "  fi\n"
+           "done"
+
+ruleorder: combine_refs_info > filter_ref_alignments
+
+rule calc_f1:
+    input: binning="{stage}/binning.tsv", contigs="{stage}/all.fasta", etalons="stats/q_{stage}/total.tsv"
+    output: "stats/summary/f1_{stage}.txt"
+    log: "stats/f1_{stage}.log"
+    message: "Calculating F1 metrics for {wildcards.stage}"
+    shell: "{SCRIPTS}/validate.pl --ffile={input.contigs} --cfile={input.binning} --sfile={input.etalons} --ofile={log} > {output}"
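rule calc_f1 hands the binning, the contigs and the per-reference alignments to validate.pl (added earlier in this patch), which reduces the comparison to a weight matrix M[cluster][taxon] of aligned contig lengths and reports Rec./Prec. alongside NMI and the Rand indices. A compact Python restatement of the two headline metrics (function name hypothetical; the logic mirrors the Perl precision/recall subs):

    def precision_recall(matrix):
        """matrix[i][j]: total contig length of cluster i assigned to taxon j,
        as built by validate.pl. Precision sums each cluster's dominant taxon,
        recall each taxon's dominant cluster, both over the total weight."""
        total = float(sum(sum(row) for row in matrix))
        precision = sum(max(row) for row in matrix) / total
        recall = sum(max(col) for col in zip(*matrix)) / total
        return precision, recall

    # precision_recall([[90, 10], [5, 95]]) -> (0.925, 0.925)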
"{BIN}/contig_abundance_counter -k {PROFILE_K} -w tmp -c {input}" + " -n {SAMPLE_COUNT} -m profile/mts/kmers -o {output} >{log} 2>&1" + +#---- Genome fraction ---------------------------------------------------------- +rule genome_fraction: + input: expand("{{stage}}/bins/{bin}.fasta", bin=BINS) #stats_input output: "stats/summary/gf_{stage}.tsv" - params: data=stats_data, out="stats/q_{stage}" - log: "stats/q_{stage}.log" + params: out="stats/q_{stage}" + log: "stats/q_{stage}/.log" threads: THREADS - message: "Aligning {wildcards.stage} assemblies on all references" - shell: "{METAQUAST} -t {threads} -R {ALL_REFS} {params.data} -o {params.out} >{log} 2>&1 && " + message: "Aligning all of {wildcards.stage} on all references" + shell: "{METAQUAST} -t {threads} -R {REFS_STR} {input} -o {params.out} >/dev/null 2>&1 && " "cp '{params.out}/summary/TSV/Genome_fraction_(%).tsv' {output}" -# Run this AFTER 'all' -rule stats_all: - input: expand("stats/summary/gf_{bin}_{prop}.tsv", bin=["bin"], prop=["prelim", "prop"]), - "stats/initial_assembly/total.cont" - message: "Gathered some numbers, deal with them." - -#---- Reassembly statistics ---------------------------------------------------- - -# Run this AFTER 'reassembly_all' -rule stats_reassembly: - input: "stats/summary/gf_reassembly.tsv", - "stats/reassembly/total.cont" - output: "stats/summary/reassembly.tsv" - params: "stats/q_reassembly" - message: "Gathered bins stats" - shell: "{SCRIPTS}/gather_stats.py {params} > {output}" +#---- GF per bin per reference ------------------------------------------------- +CONTIGS_INPUT_DIR = {"binning": "assembly/splits", "propagation": "propagation/edges"} + +rule combine_splits: + input: expand("propagation/edges/{group}.fasta", assembler=ASSEMBLER, group=GROUPS) + output: "propagation/edges/all.fasta" + message: "Combine all propagated edges" + shell: "{SCRIPTS}/combine_contigs.py {input} > {output}" + +#Redistribute sample pieces to corresponding bins +rule filter_bin: + input: contigs=lambda w: "{}/all.fasta".format(CONTIGS_INPUT_DIR[w.stage]), + ann="{stage}/annotation/all.ann" + output: "{stage,(binning|propagation)}/bins/{bin,\w*\d+}.fasta" + message: "Filtering contigs from {wildcards.bin} for all of {wildcards.stage}" + shell: "{SCRIPTS}/filter_bin.py {input.contigs} {input.ann} {wildcards.bin} >{output}" #---- Propagator statistics ---------------------------------------------------- -rule prop_stats: - input: prelim="annotation/{sample}.ann", prop="annotation/{sample}_edges.ann", - contigs="assembly/{sample}.fasta", edges="assembly/{sample}_edges.fasta", - ref=REFS.values() #, bins="{sample}/{ref}.bin" - output: "stats/prop_{cag}/{sample}.tsv" - log: "stats/prop_{cag}/{sample}.log" - message: "Calculating propagation statistics for {wildcards.sample}" - shell: "{BIN}/stats -k {K} -s {wildcards.sample}/assembly/{SAVES} -r {input.ref}" - " -c {input.contigs} -a {input.prelim} -e {input.edges} -p {input.prop}" - " -b {wildcards.cag} -o {output} >{log}" +# rule prop_stats: +# input: prelim="binning/annotation/{sample}.ann", prop="propagation/annotation/{sample}.ann", +# contigs="assembly/{sample}.fasta", edges="assembly/edges/{sample}.fasta", +# ref=REFS.values() #, bins="{sample}/{ref}.bin" +# output: "stats/prop_{bin}/{sample}.tsv" +# log: "stats/prop_{bin}/{sample}.log" +# message: "Calculating propagation statistics for {wildcards.sample}" +# shell: "{BIN}/stats -k {K} -s {wildcards.sample}/assembly/{SAVES} -r {input.ref}" +# " -c {input.contigs} -a {input.prelim} -e {input.edges} -p 
{input.prop}" +# " -b {wildcards.bin} -o {output} >{log}" # Run this -rule prop_stats_all: - input: expand("stats/prop_{cag}/{sample}.tsv", sample=GROUPS, cag=CAGS) - message: "Calculated propagation statistics" +# rule prop_stats_all: +# input: expand("stats/prop_{cag}/{sample}.tsv", sample=GROUPS, cag=BINS) +# message: "Calculated propagation statistics" + +#=============================================================================== +#---- Reference-free stats and metrics ----------------------------------------- +#=============================================================================== #---- CheckM stats ------------------------------------------------------------- rule checkm: - input: expand("reassembly/{cag}.fasta", cag=CAGS) - output: qa="stats/checkm/qa.tsv", tree_qa="stats/checkm/tree_qa.tsv" - params: dir="stats/checkm" + output: qa="stats/checkm_{stage}/qa.tsv", tree_qa="stats/checkm_{stage}/tree_qa.tsv" + params: dir="{stage}/bins", out="stats/checkm_{stage}", threads: THREADS - shell: "set +u; source activate concoct_env; set -u \n" - "checkm tree -x fasta reassembly {params.dir} \n" - "checkm tree_qa -o 2 --tab_table -f {output.tree_qa} {params.dir}\n" - "checkm lineage_set {params.dir} {params.dir}/lineage.ms\n" - "checkm analyze -x fasta {params.dir}/lineage.ms reassembly {params.dir}\n" - "checkm qa -o 2 --tab_table -f {output.qa} {params.dir}/lineage.ms {params.dir}" + log: "stats/checkm_{stage}.log" + message: "Running CheckM for results of {wildcards.stage}" + run: + # for file in os.listdir(params.dir): + # ext = os.path.splitext(file) + # if ext in FASTA_EXTS: + # break + ext = ".fasta" + shell("set +u; source activate py27; set -u \n" + "checkm tree -t {THREADS} --pplacer_threads {THREADS} -x {ext} {params.dir} {params.out} >{log} 2>&1\n" + "checkm tree_qa -o 2 --tab_table -f {output.tree_qa} {params.out} >>{log} 2>&1\n" + "checkm lineage_set {params.out} {params.out}/lineage.ms >>{log} 2>&1\n" + "checkm analyze -t {THREADS} -x fasta {params.out}/lineage.ms {params.dir} {params.out} >>{log} 2>&1\n" + "checkm qa -t {THREADS} -o 2 --tab_table -f {output.qa} {params.out}/lineage.ms {params.out} >>{log} 2>&1") rule parse_checkm: input: qa=rules.checkm.output.qa, tree_qa=rules.checkm.output.tree_qa - output: "stats/summary/checkm.tsv" - #shell: "{SCRIPTS}/parse_checkm.py {input.qa} {input.tree_qa} > {output}" + output: "stats/summary/checkm_{stage}.tsv" + message: "Parse CheckM results for {wildcards.stage}" run: table = pandas.read_table(input.qa, dtype="str") tree_table = pandas.read_table(input.tree_qa, dtype="str", na_filter=False) @@ -238,33 +212,50 @@ res_table[column] = res_table[column].apply(extract_taxon) res_table.to_csv(output[0], index=False, sep="\t") +rule combine_bins: + input: expand("{{stage}}/bins/{bin}.fasta", bin=BINS) + output: "{stage,(binning|propagation|reassembly)}/all.fasta" + message: "Combine all contigs from {wildcards.stage}" + shell: "{SCRIPTS}/combine_contigs.py -r {input} > {output}" + +rule reassembly_mock_binning: + output: "reassembly/binning.tsv" + message: "Preparing reassembly mock binning" + run: + shell("rm -f {output}") + for bin in BINS: + shell("grep '>' reassembly/bins/{bin}.fasta | cut -c 2- | awk '{{print \"{bin}-\" $0 \"\\t{bin}\"}}' >> {output}") + +#---- Summary table ------------------------------------------------------------ +rule gather_stats: + input: "stats/summary/gf_{stage}.tsv" + output: "stats/summary/{stage}_summary.tsv" + params: "--problematic", "--heatmap", "stats/q_{stage}", 
"stats/summary/{stage}" + message: "Gathering {wildcards.stage} stats" + shell: "{SCRIPTS}/gather_stats.py {params}" + #---- PCA ---------------------------------------------------------------------- -rule pca: - input: "profile/canopy.in", "profile/canopy.out", "{sample}.cont" - output: "stats/{sample}.png" - message: "Doing some visualization" - shell: - "Rscript {SCRIPTS}/pca.R {input} {output}" - -def fragments_info_by_assembly_type(wildcards): - frags=FRAGMENT_NAMES_BY_TYPE[wildcards.assembly_type] - return expand("stats/{assembly_type}/{ref}/{fragments}.info", assembly_type=wildcards.assembly_type, ref=wildcards.ref, fragments=frags) - -rule combine_fragments_info: - input: fragments_info_by_assembly_type - output: "stats/{assembly_type}/{ref}/ref.cont" - shell: "rm -rf {output}; for f in {input}; do name=$(basename $f .info); cat $f | sed 's/^/'$name'-/g' >> {output} ; done" +# FRAGMENT_NAMES_BY_STAGE = {"reassembly": CAG_EDGES, +# "assembly": list(GROUPS.keys())} -rule combine_refs_info: - input: expand("stats/{{assembly_type}}/{ref}/ref.cont", ref=list(REFS.keys())) - output: "stats/{assembly_type}/total.cont" - run: - shell("rm -rf {output}") - for ref in REFS.keys(): - shell("awk '{{print $0 \"\t{ref}\"}}' stats/{wildcards.assembly_type}/{ref}/ref.cont >> {output}") +def fragments_info_by_stage(wildcards): + fragments=FRAGMENT_NAMES_BY_STAGE[wildcards.stage] + return expand("stats/q_{stage}/runs_per_reference/{ref}/{fs}.info", stage=wildcards.stage, ref=wildcards.ref, fs=fragments) -# Run this -rule pca_total: - input: "binning/canopy/profiles.in", "binning/canopy/binning.out", "stats/total.cont" - output: "stats/summary/pca.png" +rule pca: + input: "profile/{}/all.tsv".format(PROFILER), + "binning/{}/unified_binning.tsv".format(BINNER), + #"stats/q_{stage}/total.info" + output: "stats/summary/pca_{stage}.png" + message: "Drawing PCA visualisation for {wildcards.stage}" shell: "Rscript {SCRIPTS}/pca.R {input} {output}" + +#---- TSNE ---------------------------------------------------------------------- + +rule tsne: + input: "profile/{}/all.tsv".format(PROFILER), + "binning/{}/unified_binning.tsv".format(BINNER), + output: "stats/summary/tsne_{stage}.png" + message: "Drawing BH-TSNE visualisation for {wildcards.stage}" + shell: "python2 {SCRIPTS}/run_tsne.py {input} {output}" + " --percent 1.0 --iteration 2000 --perplexity 50" diff -Nru spades-3.10.1+dfsg/src/projects/mts/test.py spades-3.11.1+dfsg/src/projects/mts/test.py --- spades-3.10.1+dfsg/src/projects/mts/test.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/test.py 2017-09-28 09:05:13.000000000 +0000 @@ -4,12 +4,12 @@ import argparse import os import os.path +import pwd import re import shutil import sys import subprocess from traceback import print_exc -import yaml from scripts.common import Table @@ -18,8 +18,10 @@ text = "" def log(self, s): - self.text += s + "\n" - print(s) + msg = s + "\n" + self.text += msg + sys.stdout.write(msg) + sys.stdout.flush() def warn(self, s): msg = "WARNING: " + s @@ -82,41 +84,32 @@ def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("--config", "-c", help="Config template") + parser.add_argument("--data", "-c", help="Directory with config and saves") parser.add_argument("dir", help="Output directory") - parser.add_argument("--saves", "-s", type=str) - parser.add_argument("--no-clean", action="store_true") - parser.add_argument("--etalons", "-e", type=str, help="Directory of GF etalons") + parser.add_argument("--saves", "-s", 
action="store_true", help="Reuse assemblies and/or profiles from data directory") + parser.add_argument("--multirun", "-m", action="store_true", help="Enable multiruns") + parser.add_argument("--no-clean", action="store_true", help="Do not clean the prvieous run") + parser.add_argument("--etalons", "-e", action="store_true", help="Compare with etalons") + args = parser.parse_args() return args -def prepare_config(args, workdir): - with open(os.path.join(args.config)) as template: - params = yaml.load(template) - params["BIN"] = os.path.join(workdir, "build/release/bin") - params["SCRIPTS"] = os.path.join(workdir, "src/projects/mts/scripts") - with open(os.path.join(args.dir, "config.yaml"), "w") as config: - config.write(yaml.dump(params)) - def run_mts(args, workdir): + mts_args = ["./multirun.py"] if args.multirun else ["./mts.py"] if not args.no_clean: shutil.rmtree(args.dir, True) + mts_args.extend(["--config", os.path.join(args.data, "config.yaml")]) if not os.path.exists(args.dir): os.mkdir(args.dir) - prepare_config(args, workdir) - mts_args = ["./mts.py", "--stats", args.dir] if args.saves: - log.log("Copying saves from" + args.saves) - for saves_dir in ["assembly", "reassembly"]: - full_dir = os.path.join(args.saves, saves_dir) - if os.path.isdir(full_dir): - #shutil.copytree(os.path.join(args.saves, saves_dir), os.path.join(args.dir, saves_dir)) - os.symlink(full_dir, os.path.join(args.dir, saves_dir)) - else: - log.warn("No " + saves_dir + " dir provided; skipping") - #Don't touch symlinked assemblies because it may corrupt other runs with the same dependencies - #mts_args.append("--reuse-assemblies") + for dir_name, saves_arg in [("assembly", "--reuse-assemblies"), ("profile", "--reuse-profiles")]: + saves_dir = os.path.join(args.data, dir_name) + if os.path.exists(saves_dir): + log.log("Reusing {} saves from {}".format(dir_name, saves_dir)) + mts_args.extend([saves_arg, saves_dir]) + mts_args.append(args.dir) os.chdir(os.path.join(workdir, "src/projects/mts")) + log.log("Calling " + " ".join(mts_args)) return subprocess.call(mts_args) def check_etalons(args, workdir): @@ -151,8 +144,9 @@ log.err("GF of {} in {} = {}% is higher than expected {:.2f}%".format(cag, ref, est_val, upper)) mut.res = 7 - for file in os.listdir(args.etalons): - etalon = os.path.join(args.etalons, file) + etalons_dir = os.path.join(args.data, "etalons") + for file in os.listdir(etalons_dir): + etalon = os.path.join(etalons_dir, file) estimated = os.path.join(args.dir, "stats", "summary", file) log.log("Trying to compare " + etalon + " and " + estimated) if not os.path.isfile(estimated): @@ -185,11 +179,12 @@ ecode = compile_mts(workdir) if ecode != 0: log.err("MTS compilation finished abnormally with exit code " + str(ecode)) - sys.exit(3) + sys.exit(2) ecode = run_mts(args, workdir) if ecode != 0: log.err("Error while running MTS: " + str(ecode)) + sys.exit(3) if args.etalons: ecode = check_etalons(args, workdir) diff -Nru spades-3.10.1+dfsg/src/projects/mts/visualization.hpp spades-3.11.1+dfsg/src/projects/mts/visualization.hpp --- spades-3.10.1+dfsg/src/projects/mts/visualization.hpp 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/mts/visualization.hpp 2017-09-28 09:05:13.000000000 +0000 @@ -34,7 +34,7 @@ auto ann = annotation_.Annotation(edge); std::ostringstream ss; std::transform(ann.begin(), ann.end(), ostream_iterator(ss, ":"), [&](bin_id b){ - return get(color_map_, b); + return utils::get(color_map_, b); }); return ss.str(); } @@ -63,4 +63,4 @@ 
visualization::visualization_utils::WriteComponentsAlongPath(gp.g, path, output_prefix, colorer_ptr, labeler); } -} \ No newline at end of file +} diff -Nru spades-3.10.1+dfsg/src/projects/online_vis/debruijn_environment.hpp spades-3.11.1+dfsg/src/projects/online_vis/debruijn_environment.hpp --- spades-3.10.1+dfsg/src/projects/online_vis/debruijn_environment.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/online_vis/debruijn_environment.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -42,7 +42,7 @@ max_vertices_(40), edge_length_bound_(1000), gp_(K, "./tmp", cfg::get().ds.reads.lib_count(), - "", + std::vector(0), cfg::get().flanking_range, cfg::get().pos.max_mapping_gap, cfg::get().pos.max_gap_diff), diff -Nru spades-3.10.1+dfsg/src/projects/online_vis/drawing_commands/draw_missasemblies.hpp spades-3.11.1+dfsg/src/projects/online_vis/drawing_commands/draw_missasemblies.hpp --- spades-3.10.1+dfsg/src/projects/online_vis/drawing_commands/draw_missasemblies.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/online_vis/drawing_commands/draw_missasemblies.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -78,7 +78,7 @@ vector genome_edges = curr_env.path_finder().FindReadPath(genome_path); vector rc_genome_edges = curr_env.path_finder().FindReadPath(reverse_genome_path); vector rc_and_usual_genome_edges(genome_edges); - push_back_all(rc_and_usual_genome_edges, rc_genome_edges); + utils::push_back_all(rc_and_usual_genome_edges, rc_genome_edges); vector edges = path.simple_path(); auto filtered_edges = FilterNonUnique(curr_env.graph(), edges, rc_and_usual_genome_edges); if(filtered_edges.size() < 2) @@ -132,7 +132,7 @@ } - DrawPicturesAlongPath(curr_env, path_to_draw, name + "_" + ToString(curr_env.graph().int_id(filtered_edges[i]))); + DrawPicturesAlongPath(curr_env, path_to_draw, name + "_" + std::to_string(curr_env.graph().int_id(filtered_edges[i]))); real_difference = (int)genome_path[index_genome].second.initial_range.start_pos - (int)path[index_contig].second.initial_range.start_pos; INFO("Diff is set to " << real_difference); continue; @@ -143,7 +143,7 @@ real_difference = (int)genome_path[index_genome].second.initial_range.start_pos - (int)path[index_contig].second.initial_range.start_pos; vector path_to_draw; path_to_draw.push_back(genome_path[index_genome].first); - DrawPicturesAlongPath(curr_env, path_to_draw, name + "_" + ToString(curr_env.graph().int_id(filtered_edges[i]))); + DrawPicturesAlongPath(curr_env, path_to_draw, name + "_" + std::to_string(curr_env.graph().int_id(filtered_edges[i]))); INFO("Diff is set to " << real_difference); } ++i; diff -Nru spades-3.10.1+dfsg/src/projects/online_vis/drawing_commands/draw_part_of_genome_command.hpp spades-3.11.1+dfsg/src/projects/online_vis/drawing_commands/draw_part_of_genome_command.hpp --- spades-3.10.1+dfsg/src/projects/online_vis/drawing_commands/draw_part_of_genome_command.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/online_vis/drawing_commands/draw_part_of_genome_command.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -18,7 +18,7 @@ namespace online_visualization { class DrawPartOfGenomeCommand : public DrawingCommand { private: - void CheckPathIntegrity(const omnigraph::de::GraphDistanceFinder& dist_finder, EdgeId first_edge, EdgeId second_edge) const { + void CheckPathIntegrity(const omnigraph::de::GraphDistanceFinder& dist_finder, EdgeId first_edge, EdgeId second_edge) const { vector distances = dist_finder.GetGraphDistancesLengths(first_edge, second_edge); if 
(distances[0] == 0) { INFO("Edges " << first_edge << " and " << second_edge << " are neighbouring"); diff -Nru spades-3.10.1+dfsg/src/projects/online_vis/drawing_commands/draw_polymorphic_regions.hpp spades-3.11.1+dfsg/src/projects/online_vis/drawing_commands/draw_polymorphic_regions.hpp --- spades-3.10.1+dfsg/src/projects/online_vis/drawing_commands/draw_polymorphic_regions.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/online_vis/drawing_commands/draw_polymorphic_regions.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -82,11 +82,11 @@ using namespace visualization::visualization_utils; WriteComponentSinksSources(polymorphicRegion, curr_env.folder() + "/" + - ToString(curr_env.graph().int_id(*polymorphicRegion.vertices().begin())) + ".dot", + std::to_string(curr_env.graph().int_id(*polymorphicRegion.vertices().begin())) + ".dot", visualization::graph_colorer::DefaultColorer(curr_env.graph()), curr_env.labeler()); - INFO("Component is written to " + curr_env.folder() + ToString(curr_env.graph().int_id(*polymorphicRegion.vertices().begin())) + ".dot"); + INFO("Component is written to " + curr_env.folder() + std::to_string(curr_env.graph().int_id(*polymorphicRegion.vertices().begin())) + ".dot"); } i += curr_env.graph().length(positionSecond.first) - positionSecond.second; diff -Nru spades-3.10.1+dfsg/src/projects/online_vis/drawing_commands/draw_poorly_assembled.hpp spades-3.11.1+dfsg/src/projects/online_vis/drawing_commands/draw_poorly_assembled.hpp --- spades-3.10.1+dfsg/src/projects/online_vis/drawing_commands/draw_poorly_assembled.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/online_vis/drawing_commands/draw_poorly_assembled.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -130,8 +130,8 @@ make_dir(curr_env.folder()); string pics_folder = curr_env.folder() + "/" + curr_env.GetFormattedPictureCounter() + "_" + repeat_info.seq_name + "/"; make_dir(pics_folder); - string pic_name = ToString(repeat_info.local_cnt) + "_" + ToString(repeat_info.genomic_gap) + - "_" + ToString(curr_env.graph().int_id(repeat_info.e1)) + "_" + ToString(curr_env.graph().int_id(repeat_info.e2)) + "_"; + string pic_name = std::to_string(repeat_info.local_cnt) + "_" + std::to_string(repeat_info.genomic_gap) + + "_" + std::to_string(curr_env.graph().int_id(repeat_info.e1)) + "_" + std::to_string(curr_env.graph().int_id(repeat_info.e2)) + "_"; DrawGap(curr_env, repeat_info.ref_path, pics_folder + pic_name); } @@ -153,7 +153,7 @@ const Graph& g = curr_env.graph(); vector edges; edges.push_back(repeat_info.e1); - push_back_all(edges, repeat_info.ref_path); + utils::push_back_all(edges, repeat_info.ref_path); edges.push_back(repeat_info.e2); for (EdgeId e : edges) { if (!CheckInfo(clustered_pi_idx, repeat_info.e1, e)) { diff -Nru spades-3.10.1+dfsg/src/projects/online_vis/errors.hpp spades-3.11.1+dfsg/src/projects/online_vis/errors.hpp --- spades-3.10.1+dfsg/src/projects/online_vis/errors.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/online_vis/errors.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -52,7 +52,7 @@ } bool CheckFileExists(const string& file) { - if (!path::is_regular_file(file)) { + if (!fs::is_regular_file(file)) { FireFileDoesNotExist(file); return false; } diff -Nru spades-3.10.1+dfsg/src/projects/online_vis/main.cpp spades-3.11.1+dfsg/src/projects/online_vis/main.cpp --- spades-3.10.1+dfsg/src/projects/online_vis/main.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/online_vis/main.cpp 2017-09-28 
09:05:11.000000000 +0000 @@ -13,11 +13,11 @@ #include "utils/stacktrace.hpp" #include "pipeline/config_struct.hpp" #include "io/reads/io_helper.hpp" -#include "utils/simple_tools.hpp" +#include "utils/stl_utils.hpp" #include #include #include -#include "utils/memory_limit.hpp" +#include "utils/perf/memory_limit.hpp" #include "io/dataset_support/read_converter.hpp" #include "debruijn_online_visualizer.hpp" @@ -27,10 +27,10 @@ string log_props_file = cfg::get().log_filename; - if (!path::FileExists(log_props_file)) - log_props_file = path::append_path(path::parent_path(cfg_filename), cfg::get().log_filename); + if (!fs::FileExists(log_props_file)) + log_props_file = fs::append_path(fs::parent_path(cfg_filename), cfg::get().log_filename); - logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : ""); + logger *lg = create_logger(fs::FileExists(log_props_file) ? log_props_file : ""); lg->add_writer(std::make_shared()); attach_logger(lg); @@ -43,7 +43,7 @@ VERIFY(argc > 1) using namespace online_visualization; string cfg_filename = argv[1]; - path::CheckFileExistenceFATAL(cfg_filename); + fs::CheckFileExistenceFATAL(cfg_filename); cfg::create_instance(cfg_filename); @@ -53,7 +53,7 @@ create_console_logger(cfg_filename); cout << "\nGAF (Graph Analysis Framework) started" << endl; cout << "Print help to see readme file" << endl; - limit_memory(cfg::get().max_memory * GB); + utils::limit_memory(cfg::get().max_memory * GB); DebruijnOnlineVisualizer online_vis; online_vis.init(); diff -Nru spades-3.10.1+dfsg/src/projects/online_vis/online_visualizer.hpp spades-3.11.1+dfsg/src/projects/online_vis/online_visualizer.hpp --- spades-3.10.1+dfsg/src/projects/online_vis/online_visualizer.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/online_vis/online_visualizer.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -58,15 +58,15 @@ } inline void init() { - string p = path::append_path(cfg::get().load_from, "simplification"); // just for default + string p = fs::append_path(cfg::get().load_from, "simplification"); // just for default - path::make_dir("tmp"); + fs::make_dir("tmp"); DEBUG("Adding Commands"); AddBaseCommands(); AddSpecificCommands(); DEBUG("Commands added"); DEBUG("Adding auto-completion option"); - InitAutocompletion(command_mapping_.GetCommandNamesList()); + utils::InitAutocompletion(command_mapping_.GetCommandNamesList()); //stringstream ss("load default " + p); //const Command& load_command = command_mapping_.GetCommand("load"); //DEBUG("Loading current environment"); diff -Nru spades-3.10.1+dfsg/src/projects/online_vis/setting_commands.hpp spades-3.11.1+dfsg/src/projects/online_vis/setting_commands.hpp --- spades-3.10.1+dfsg/src/projects/online_vis/setting_commands.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/online_vis/setting_commands.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -120,7 +120,7 @@ if (!CheckCorrectness(args)) return; string folder_name = args[1]; - path::make_dirs(folder_name); + fs::make_dirs(folder_name); curr_env.set_folder(folder_name); } }; diff -Nru spades-3.10.1+dfsg/src/projects/scaffold_correction/main.cpp spades-3.11.1+dfsg/src/projects/scaffold_correction/main.cpp --- spades-3.10.1+dfsg/src/projects/scaffold_correction/main.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/scaffold_correction/main.cpp 2017-09-28 09:05:11.000000000 +0000 @@ -11,9 +11,9 @@ #include "utils/segfault_handler.hpp" #include "utils/stacktrace.hpp" -#include "utils/memory_limit.hpp" -#include 
"utils/copy_file.hpp" -#include "utils/perfcounter.hpp" +#include "utils/perf/memory_limit.hpp" +#include "utils/filesystem/copy_file.hpp" +#include "utils/perf/perfcounter.hpp" #include "scaffold_correction.hpp" #include "pipeline/config_struct.hpp" @@ -26,7 +26,7 @@ //FIXME code duplication void load_config(const vector& cfg_fns) { for (const auto& s : cfg_fns) { - path::CheckFileExistenceFATAL(s); + fs::CheckFileExistenceFATAL(s); } cfg::create_instance(cfg_fns); @@ -49,16 +49,16 @@ string log_props_file = cfg::get().log_filename; - if (!path::FileExists(log_props_file)) - log_props_file = path::append_path(path::parent_path(cfg_filename), cfg::get().log_filename); + if (!fs::FileExists(log_props_file)) + log_props_file = fs::append_path(fs::parent_path(cfg_filename), cfg::get().log_filename); - logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : ""); + logger *lg = create_logger(fs::FileExists(log_props_file) ? log_props_file : ""); lg->add_writer(std::make_shared()); attach_logger(lg); } int main(int argc, char** argv) { - perf_counter pc; + utils::perf_counter pc; const size_t GB = 1 << 30; @@ -68,7 +68,7 @@ try { using namespace debruijn_graph; - string cfg_dir = path::parent_path(argv[1]); + string cfg_dir = fs::parent_path(argv[1]); vector cfg_fns; for (int i = 1; i < argc; ++i) { @@ -83,7 +83,7 @@ // read configuration file (dataset path etc.) - limit_memory(cfg::get().max_memory * GB); + utils::limit_memory(cfg::get().max_memory * GB); // assemble it! INFO("Assembling dataset (" << cfg::get().dataset_file << ") with K=" << cfg::get().K); diff -Nru spades-3.10.1+dfsg/src/projects/scaffold_correction/scaffold_correction.hpp spades-3.11.1+dfsg/src/projects/scaffold_correction/scaffold_correction.hpp --- spades-3.10.1+dfsg/src/projects/scaffold_correction/scaffold_correction.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/scaffold_correction/scaffold_correction.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -6,6 +6,7 @@ #pragma once #include "io/reads/osequencestream.hpp" +#include "io/reads/single_read.hpp" #include "io/reads/file_reader.hpp" #include "pipeline/stage.hpp" #include "pipeline/graph_pack.hpp" @@ -254,12 +255,11 @@ } void OutputResults(const vector &results) { - io::osequencestream_simple oss(output_file_); + io::OutputSequenceStream oss(output_file_); for(size_t i = 0; i < results.size(); i++) { string sequence = results[i].GetSequenceString(); - if(sequence != "") { - oss.set_header(results[i].name()); - oss << sequence; + if (sequence != "") { + oss << io::SingleRead(results[i].name(), sequence); } } } diff -Nru spades-3.10.1+dfsg/src/projects/spades/chromosome_removal.cpp spades-3.11.1+dfsg/src/projects/spades/chromosome_removal.cpp --- spades-3.10.1+dfsg/src/projects/spades/chromosome_removal.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/spades/chromosome_removal.cpp 2017-09-28 09:05:12.000000000 +0000 @@ -15,7 +15,7 @@ namespace debruijn_graph { - +//TODO replace with standard methods void ChromosomeRemoval::CompressAll(Graph &g) { for (auto it = g.SmartVertexBegin(); ! 
it.IsEnd(); ++it) { if (g.IsDeadStart(*it) && g.IsDeadEnd(*it)) { @@ -26,27 +26,6 @@ } } -void ChromosomeRemoval::DeleteAndCompress(EdgeId e, Graph &g){ - auto start = g.EdgeStart(e); - auto end = g.EdgeEnd(e); - g.DeleteEdge(e); - bool is_cycle = (start == end || start == g.conjugate(end)); - if (g.IsDeadStart(start) && g.IsDeadEnd(start)) { - g.DeleteVertex(start); - } else { - g.CompressVertex(start); - } - if (is_cycle) { - return; - } - if (g.IsDeadStart(end) && g.IsDeadEnd(end)) { - g.DeleteVertex(end); - } else { - g.CompressVertex(end); - } -} - - size_t ChromosomeRemoval::CalculateComponentSize(EdgeId e, Graph &g_) { std::stack next; size_t deadend_count = 0; @@ -158,7 +137,7 @@ void ChromosomeRemoval::run(conj_graph_pack &gp, const char*) { //FIXME Seriously?! cfg::get().ds like hundred times... - OutputContigs(gp.g, cfg::get().output_dir + "before_chromosome_removal", false); + OutputEdgeSequences(gp.g, cfg::get().output_dir + "before_chromosome_removal"); INFO("Before iteration " << 0 << ", " << gp.g.size() << " vertices in graph"); double chromosome_coverage = RemoveLongGenomicEdges(gp, cfg::get().pd->long_edge_length, cfg::get().pd->relative_coverage ); PlasmidSimplify(gp, cfg::get().pd->long_edge_length); diff -Nru spades-3.10.1+dfsg/src/projects/spades/chromosome_removal.hpp spades-3.11.1+dfsg/src/projects/spades/chromosome_removal.hpp --- spades-3.10.1+dfsg/src/projects/spades/chromosome_removal.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/spades/chromosome_removal.hpp 2017-09-28 09:05:12.000000000 +0000 @@ -31,6 +31,5 @@ void PlasmidSimplify(conj_graph_pack &gp, size_t long_edge_bound, std::function removal_handler = 0); void CompressAll(Graph &g); - void DeleteAndCompress(EdgeId e, Graph &g); }; } diff -Nru spades-3.10.1+dfsg/src/projects/spades/CMakeLists.txt spades-3.11.1+dfsg/src/projects/spades/CMakeLists.txt --- spades-3.10.1+dfsg/src/projects/spades/CMakeLists.txt 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/spades/CMakeLists.txt 2017-09-28 09:05:12.000000000 +0000 @@ -17,6 +17,7 @@ contig_output_stage.cpp hybrid_aligning.cpp chromosome_removal.cpp + series_analysis.cpp ../mts/contig_abundance.cpp) target_include_directories(spades PRIVATE ${EXT_DIR}/include/ConsensusCore) diff -Nru spades-3.10.1+dfsg/src/projects/spades/contig_output_stage.cpp spades-3.11.1+dfsg/src/projects/spades/contig_output_stage.cpp --- spades-3.10.1+dfsg/src/projects/spades/contig_output_stage.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/spades/contig_output_stage.cpp 2017-09-28 09:05:12.000000000 +0000 @@ -5,23 +5,59 @@ //* See file LICENSE for details. 
//***************************************************************************
+#include "modules/path_extend/pe_resolver.hpp"
#include "contig_output_stage.hpp"
#include "assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.hpp"
namespace debruijn_graph {
+vector CreatePathsWriters(const std::string &fn_base,
+                          path_extend::FastgWriter &fastg_writer) {
+    using namespace path_extend;
+    vector writers;
+
+    writers.push_back(ContigWriter::BasicFastaWriter(fn_base + ".fasta"));
+    INFO("Outputting FastG paths to " << fn_base << ".paths");
+    writers.push_back([=](const ScaffoldStorage& scaffold_storage) {
+        fastg_writer.WritePaths(scaffold_storage, fn_base + ".paths");
+    });
+    return writers;
+}
+
+template
+path_extend::EdgeNamingF PlasmidNamingF(path_extend::EdgeNamingF naming_f,
+                                        const ConnectedComponentCounter &cc_counter) {
+    return [=, &cc_counter](const Graph &g, EdgeId e) {
+        return io::AddComponentId(naming_f(g, e), cc_counter.GetComponent(e));
+    };
+}
+
void ContigOutput::run(conj_graph_pack &gp, const char*) {
-    auto output_dir = cfg::get().output_dir + contig_name_prefix_;
+    using namespace path_extend;
+    auto output_dir = cfg::get().output_dir;
-    OutputContigs(gp.g, output_dir + "before_rr", false);
-    OutputContigsToFASTG(gp.g, output_dir + "assembly_graph", gp.components);
+    std::string gfa_fn = output_dir + "assembly_graph_with_scaffolds.gfa";
+    INFO("Writing GFA to " << gfa_fn);
-    if (output_paths_ && gp.contig_paths.size() != 0) {
-        DefaultContigCorrector corrector(gp.g);
-        DefaultContigConstructor constructor(gp.g, corrector);
+    std::ofstream os(gfa_fn);
+    GFAWriter gfa_writer(gp.g, os,
+                         cfg::get().pd ? PlasmidNamingF(IdNamingF(), gp.components)
+                                       : IdNamingF());
+    gfa_writer.WriteSegmentsAndLinks();
+
+    OutputEdgeSequences(gp.g, output_dir + "before_rr");
+
+    INFO("Outputting FastG graph to " << output_dir << "assembly_graph.fastg");
+    std::string fastg_fn = output_dir + "assembly_graph.fastg";
+
+    FastgWriter fastg_writer(gp.g,
+                             cfg::get().pd ? PlasmidNamingF(BasicNamingF(), gp.components)
+                                           : BasicNamingF());
+    fastg_writer.WriteSegmentsAndLinks(fastg_fn);
-        auto name_generator = path_extend::MakeContigNameGenerator(cfg::get().mode, gp);
-        path_extend::ContigWriter writer(gp.g, constructor, gp.components, name_generator);
+    if (output_paths_ && gp.contig_paths.size() != 0) {
+        auto name_generator = MakeContigNameGenerator(cfg::get().mode, gp);
+        ContigWriter writer(gp.g, name_generator);
        bool output_broken_scaffolds = cfg::get().pe_params.param_set.scaffolder_options.enabled &&
                                       cfg::get().use_scaffolder &&
@@ -37,18 +73,32 @@
            WARN("Unsupported contig output mode");
        }
-        path_extend::ScaffoldBreaker breaker(min_gap);
-        path_extend::PathContainer broken_scaffolds;
+        ScaffoldBreaker breaker(min_gap);
+        PathContainer broken_scaffolds;
        breaker.Break(gp.contig_paths, broken_scaffolds);
-        writer.OutputPaths(broken_scaffolds, output_dir + cfg::get().co.contigs_name);
-    }
-    writer.OutputPaths(gp.contig_paths, output_dir + cfg::get().co.scaffolds_name);
+        //FIXME don't we want to use FinalizePaths here?
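
A minimal, self-contained sketch of the scaffold-breaking idea used at this point in the stage: a path is split into separate contigs wherever the estimated gap reaches min_gap. All names below (Link, BreakAtGaps) are illustrative stand-ins, not the real path_extend types.

    #include <iostream>
    #include <string>
    #include <vector>

    struct Link { std::string edge; int gap_before; };   // gap precedes the edge
    using Path = std::vector<Link>;

    // Split a scaffold into contigs at every gap of at least min_gap.
    std::vector<Path> BreakAtGaps(const Path &scaffold, int min_gap) {
        std::vector<Path> contigs(1);
        for (const Link &l : scaffold) {
            if (!contigs.back().empty() && l.gap_before >= min_gap)
                contigs.emplace_back();                  // start a new contig
            contigs.back().push_back(l);
        }
        return contigs;
    }

    int main() {
        Path scaffold{{"e1", 0}, {"e2", 3}, {"e3", 120}, {"e4", 5}};
        for (const Path &c : BreakAtGaps(scaffold, 100)) {   // e3 opens a new contig
            for (const Link &l : c) std::cout << l.edge << ' ';
            std::cout << '\n';
        }
    }
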
+        GraphCoverageMap cover_map(gp.g, broken_scaffolds, true);
+        Deduplicate(gp.g, broken_scaffolds, cover_map,
+                    /*min_edge_len*/0,
+                    /*max_path_diff*/0);
+        broken_scaffolds.FilterEmptyPaths();
+        broken_scaffolds.SortByLength();
+
+        writer.OutputPaths(broken_scaffolds,
+                           CreatePathsWriters(output_dir + contigs_name_,
+                                              fastg_writer));
+    }
-    OutputContigsToGFA(gp.g, gp.contig_paths, output_dir + "assembly_graph");
+    auto writers = CreatePathsWriters(output_dir + cfg::get().co.scaffolds_name, fastg_writer);
+    writers.push_back([&](const ScaffoldStorage &storage) {
+        gfa_writer.WritePaths(storage);
+    });
+    writer.OutputPaths(gp.contig_paths, writers);
    } else {
-       OutputContigs(gp.g, output_dir + "simplified_contigs", cfg::get().use_unipaths);
-       OutputContigs(gp.g, output_dir + cfg::get().co.contigs_name, false);
+       //FIXME weird logic
+       OutputEdgeSequences(gp.g, output_dir + "simplified_contigs");
+       OutputEdgeSequences(gp.g, output_dir + contigs_name_);
    }
}
diff -Nru spades-3.10.1+dfsg/src/projects/spades/contig_output_stage.hpp spades-3.11.1+dfsg/src/projects/spades/contig_output_stage.hpp
--- spades-3.10.1+dfsg/src/projects/spades/contig_output_stage.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/spades/contig_output_stage.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -15,15 +15,16 @@
class ContigOutput : public spades::AssemblyStage {
private:
    bool output_paths_;
-   string contig_name_prefix_;
-
+   std::string contigs_name_;
public:
-   ContigOutput(bool output_paths = true, bool preliminary = false, const string& contig_name_prefix = "")
-       : AssemblyStage("Contig Output", preliminary ? "preliminary_contig_output" : "contig_output"),
-         output_paths_(output_paths), contig_name_prefix_(contig_name_prefix) { }
+   ContigOutput(bool output_paths = true, std::string contigs_name = cfg::get().co.contigs_name)
+       : AssemblyStage("Contig Output", "contig_output"), output_paths_(output_paths), contigs_name_(contigs_name) { }
-   void run(conj_graph_pack &gp, const char *);
+   void load(conj_graph_pack &, const std::string &, const char *) { }
+   void save(const conj_graph_pack &, const std::string &, const char *) const { }
+
+   void run(conj_graph_pack &gp, const char *);
};
-}
\ No newline at end of file
+}
diff -Nru spades-3.10.1+dfsg/src/projects/spades/distance_estimation.cpp spades-3.11.1+dfsg/src/projects/spades/distance_estimation.cpp
--- spades-3.10.1+dfsg/src/projects/spades/distance_estimation.cpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/spades/distance_estimation.cpp	2017-09-28 09:05:12.000000000 +0000
@@ -24,7 +24,7 @@
template
void estimate_with_estimator(const Graph &graph,
-                             const omnigraph::de::AbstractDistanceEstimator& estimator,
+                             const omnigraph::de::AbstractDistanceEstimator& estimator,
                             omnigraph::de::AbstractPairInfoChecker& checker,
                             PairedIndexT& clustered_index) {
    using config::estimation_mode;
@@ -107,7 +107,7 @@
    const config::debruijn_config& config = cfg::get();
    size_t delta = size_t(lib.data().insert_size_deviation);
    size_t linkage_distance = size_t(config.de.linkage_distance_coeff * lib.data().insert_size_deviation);
-   GraphDistanceFinder dist_finder(gp.g, (size_t)math::round(lib.data().mean_insert_size), lib.data().read_length, delta);
+   GraphDistanceFinder dist_finder(gp.g, (size_t)math::round(lib.data().mean_insert_size), lib.data().read_length, delta);
    size_t max_distance = size_t(config.de.max_distance_coeff * lib.data().insert_size_deviation);
    std::function weight_function;
@@ -131,27 +131,27 @@
    switch (config.est_mode) {
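
This switch on config.est_mode picks one of three distance estimators (simple, weighted, smoothing). A compact, runnable sketch of the same enum-to-strategy dispatch, with toy lambdas standing in for the real estimator classes; every name here is illustrative only.

    #include <functional>
    #include <iostream>

    enum class EstimationMode { simple, weighted, smoothing };

    using Estimator = std::function<double(double raw_distance)>;

    // Map the configured mode to a concrete estimation strategy.
    Estimator MakeEstimator(EstimationMode mode) {
        switch (mode) {
        case EstimationMode::simple:
            return [](double d) { return d; };                 // pass-through
        case EstimationMode::weighted:
            return [](double d) { return 0.9 * d; };           // toy weighting
        case EstimationMode::smoothing:
            return [](double d) { return (d + 100.0) / 2.0; }; // toy smoothing
        }
        return {};
    }

    int main() {
        for (auto mode : {EstimationMode::simple, EstimationMode::weighted, EstimationMode::smoothing})
            std::cout << MakeEstimator(mode)(120.0) << '\n';
    }
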
        case estimation_mode::simple: {
-           const AbstractDistanceEstimator&
+           const AbstractDistanceEstimator&
                    estimator =
-                   DistanceEstimator(gp.g, paired_index, dist_finder,
+                   DistanceEstimator(gp.g, paired_index, dist_finder,
                                      linkage_distance, max_distance);
            estimate_with_estimator(gp.g, estimator, checker, clustered_index);
            break;
        }
        case estimation_mode::weighted: {
-           const AbstractDistanceEstimator&
+           const AbstractDistanceEstimator&
                    estimator =
-                   WeightedDistanceEstimator(gp.g, paired_index,
+                   WeightedDistanceEstimator(gp.g, paired_index,
                                              dist_finder, weight_function, linkage_distance, max_distance);
            estimate_with_estimator(gp.g, estimator, checker, clustered_index);
            break;
        }
        case estimation_mode::smoothing: {
-           const AbstractDistanceEstimator&
+           const AbstractDistanceEstimator&
                    estimator =
-                   SmoothingDistanceEstimator(gp.g, paired_index,
+                   SmoothingDistanceEstimator(gp.g, paired_index,
                                               dist_finder, weight_function, linkage_distance, max_distance,
                                               config.ade.threshold, config.ade.range_coeff,
@@ -185,7 +185,7 @@
    double is_var = lib.data().insert_size_deviation;
    size_t delta = size_t(is_var);
    size_t linkage_distance = size_t(cfg::get().de.linkage_distance_coeff * is_var);
-   GraphDistanceFinder dist_finder(gp.g, (size_t) math::round(lib.data().mean_insert_size),
+   GraphDistanceFinder dist_finder(gp.g, (size_t) math::round(lib.data().mean_insert_size),
                                    lib.data().read_length, delta);
    size_t max_distance = size_t(cfg::get().de.max_distance_coeff_scaff * is_var);
    std::function weight_function;
@@ -205,8 +205,8 @@
    PairInfoWeightChecker checker(gp.g, 0.);
    DEBUG("Weight Filter Done");
-   const AbstractDistanceEstimator& estimator =
-           SmoothingDistanceEstimator(gp.g, paired_index, dist_finder,
+   const AbstractDistanceEstimator& estimator =
+           SmoothingDistanceEstimator(gp.g, paired_index, dist_finder,
                                       weight_function, linkage_distance, max_distance,
                                       cfg::get().ade.threshold, cfg::get().ade.range_coeff,
                                      cfg::get().ade.delta_coeff, cfg::get().ade.cutoff,
diff -Nru spades-3.10.1+dfsg/src/projects/spades/gap_closer.cpp spades-3.11.1+dfsg/src/projects/spades/gap_closer.cpp
--- spades-3.10.1+dfsg/src/projects/spades/gap_closer.cpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/spades/gap_closer.cpp	2017-09-28 09:05:12.000000000 +0000
@@ -173,7 +173,6 @@
    omnigraph::de::PairedInfoIndexT &tips_paired_idx_;
    const size_t min_intersection_;
    const size_t hamming_dist_bound_;
-   const int init_gap_val_;
    const omnigraph::de::DEWeight weight_threshold_;
    std::vector DiffPos(const Sequence &s1, const Sequence &s2) const {
@@ -390,7 +389,6 @@
            tips_paired_idx_(tips_paired_idx),
            min_intersection_(min_intersection),
            hamming_dist_bound_(hamming_dist_bound),
-           init_gap_val_(-10),
            weight_threshold_(weight_threshold) {
        VERIFY(min_intersection_ < g_.k());
        DEBUG("weight_threshold=" << weight_threshold_);
diff -Nru spades-3.10.1+dfsg/src/projects/spades/gap_closing.hpp spades-3.11.1+dfsg/src/projects/spades/gap_closing.hpp
--- spades-3.10.1+dfsg/src/projects/spades/gap_closing.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/spades/gap_closing.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -13,52 +13,42 @@
class GapJoiner {
    Graph& g_;
    omnigraph::EdgeRemover edge_remover_;
-   bool add_flanks_;
-   EdgeId ClipEnd(EdgeId e, size_t pos) {
-       VERIFY(pos > 0);
+   EdgeId ClipEnd(EdgeId e, size_t to_trim) {
+       VERIFY(to_trim < g_.length(e));
        VERIFY(omnigraph::TerminalVertexCondition(g_).Check(g_.EdgeEnd(e)));
        VERIFY(e != g_.conjugate(e));
-       if (pos == g_.length(e)) {
+       if (to_trim == 0) {
            return e;
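
The ClipEnd rewrite above changes its second argument from an absolute position pos (the kept prefix length, measured from the edge start) to to_trim (bases dropped from the edge end). The two agree when to_trim == length - pos, which is why the old pos == length early exit becomes to_trim == 0. A worked toy example on plain strings, with hypothetical helpers rather than the real graph API:

    #include <cassert>
    #include <string>

    // Old convention: keep the prefix [0, pos).
    std::string ClipEndOld(const std::string &e, size_t pos)     { return e.substr(0, pos); }
    // New convention: drop to_trim characters from the end.
    std::string ClipEndNew(const std::string &e, size_t to_trim) { return e.substr(0, e.size() - to_trim); }

    int main() {
        std::string edge(100, 'A');
        // old call site kept the first 90 bases; the new one trims the last 10
        assert(ClipEndOld(edge, 90) == ClipEndNew(edge, 10));
        // to_trim == 0 is now the "nothing to do" case that returns e unchanged
        assert(ClipEndNew(edge, 0) == edge);
    }
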
        } else {
-           auto split_res = g_.SplitEdge(e, pos);
+           auto split_res = g_.SplitEdge(e, g_.length(e) - to_trim);
            edge_remover_.DeleteEdge(split_res.second);
            return split_res.first;
        }
    }
-   EdgeId ClipStart(EdgeId e, size_t pos) {
-       return g_.conjugate(ClipEnd(g_.conjugate(e), g_.length(e) - pos));
+   EdgeId ClipStart(EdgeId e, size_t to_trim) {
+       return g_.conjugate(ClipEnd(g_.conjugate(e), to_trim));
    }
-   EdgeId AddEdge(VertexId v1, VertexId v2, const Sequence& gap_seq) {
-       if (!add_flanks_) {
-           VERIFY_MSG(g_.VertexNucls(v1) == gap_seq.Subseq(0, g_.k()),
-                      g_.VertexNucls(v1) << " not equal " << gap_seq.Subseq(0, g_.k()));
-           VERIFY_MSG(g_.VertexNucls(v2) == gap_seq.Subseq(gap_seq.size() - g_.k()),
-                      g_.VertexNucls(v2) << " not equal " << gap_seq.Subseq(gap_seq.size() - g_.k()));
-           return g_.AddEdge(v1, v2, gap_seq);
-       } else {
-           DEBUG("Adding gap seq " << gap_seq);
-           DEBUG("Between vertices " << g_.VertexNucls(v1) << " and " << g_.VertexNucls(v2));
-           return g_.AddEdge(v1, v2, g_.VertexNucls(v1) + gap_seq + g_.VertexNucls(v2));
-       }
+   EdgeId AddEdge(VertexId v1, VertexId v2, const Sequence &gap_seq) {
+       DEBUG("Adding gap seq " << gap_seq);
+       DEBUG("Between vertices " << g_.VertexNucls(v1) << " and " << g_.VertexNucls(v2));
+       return g_.AddEdge(v1, v2, g_.VertexNucls(v1) + gap_seq + g_.VertexNucls(v2));
    }
public:
-   GapJoiner(Graph& g, bool add_flanks = false) :
+   GapJoiner(Graph& g) :
            g_(g),
-           edge_remover_(g),
-           add_flanks_(add_flanks) {
+           edge_remover_(g) {
    }
    EdgeId operator() (const GapDescription& gap, bool compress = true) {
-       VERIFY(gap.start != gap.end && gap.start != g_.conjugate(gap.end));
+       VERIFY(gap.left() != gap.right() && gap.left() != g_.conjugate(gap.right()));
        DEBUG("Processing gap " << gap.str(g_));
-       EdgeId start = ClipEnd(gap.start, gap.edge_gap_start_position);
-       EdgeId end = ClipStart(gap.end, gap.edge_gap_end_position);
-       EdgeId new_edge = AddEdge(g_.EdgeEnd(start), g_.EdgeStart(end), gap.gap_seq);
+       EdgeId start = ClipEnd(gap.left(), gap.left_trim());
+       EdgeId end = ClipStart(gap.right(), gap.right_trim());
+       EdgeId new_edge = AddEdge(g_.EdgeEnd(start), g_.EdgeStart(end), gap.filling_seq());
        if (compress) {
            return omnigraph::Compressor(g_).CompressVertexEdgeId(g_.EdgeStart(new_edge));
diff -Nru spades-3.10.1+dfsg/src/projects/spades/hybrid_aligning.cpp spades-3.11.1+dfsg/src/projects/spades/hybrid_aligning.cpp
--- spades-3.10.1+dfsg/src/projects/spades/hybrid_aligning.cpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/spades/hybrid_aligning.cpp	2017-09-28 09:05:12.000000000 +0000
@@ -86,7 +86,7 @@
        DEBUG("Gap info successfully created");
        return GapDescription(left, right,
                              *gap_seq,
-                             left_offset,
+                             g_.length(left) - left_offset,
                              right_offset);
    } else {
        DEBUG("Something wrong with read subsequence");
@@ -416,7 +416,7 @@
    //FIXME make const
    auto& reads = cfg::get_writable().ds.reads[lib_id];
-   SequenceMapperNotifier notifier(gp);
+   SequenceMapperNotifier notifier(gp, cfg::get_writable().ds.reads.lib_count());
    //FIXME pretty awful, would be much better if listeners were shared ptrs
    LongReadMapper read_mapper(gp.g, gp.single_long_reads[lib_id],
                               ChooseProperReadPathExtractor(gp.g, reads.type()));
diff -Nru spades-3.10.1+dfsg/src/projects/spades/hybrid_gap_closer.hpp spades-3.11.1+dfsg/src/projects/spades/hybrid_gap_closer.hpp
--- spades-3.10.1+dfsg/src/projects/spades/hybrid_gap_closer.hpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/spades/hybrid_gap_closer.hpp	2017-09-28 09:05:12.000000000 +0000
@@ -59,7 +59,7 @@
DECL_LOGGER("GapStorage"); void HiddenAddGap(const GapDescription& p) { - inner_index_[p.start].push_back(p); + inner_index_[p.left()].push_back(p); } size_t FillIndex() { @@ -87,8 +87,8 @@ auto copy_dest = gaps.begin(); for (const info_it_pair& ep_gaps : ep_ranges) { if (filter_f(ep_gaps.first, ep_gaps.second)) { - DEBUG("Erasing candidates between " << g_.int_id(ep_gaps.first->start) << " and " - << g_.int_id(ep_gaps.first->end)); + DEBUG("Erasing candidates between " << g_.int_id(ep_gaps.first->left()) << " and " + << g_.int_id(ep_gaps.first->right())); } else { if (copy_dest == const_iterator_cast(gaps, ep_gaps.first)) { copy_dest = const_iterator_cast(gaps, ep_gaps.second); @@ -108,7 +108,7 @@ void FilterByEdgePair(const EdgePairPred &filter_f) { FilterByCandidates([=](gap_info_it info_start, gap_info_it /*info_end*/) { - return filter_f(EdgePair(info_start->start, info_start->end)); + return filter_f(EdgePair(info_start->left(), info_start->right())); }); } @@ -128,7 +128,7 @@ vector SecondEdges(const GapInfos& edge_gaps) const { vector jump_edges; for (auto it_pair : EdgePairGaps(edge_gaps)) { - jump_edges.push_back(it_pair.first->end); + jump_edges.push_back(it_pair.first->right()); } return jump_edges; }; @@ -196,8 +196,8 @@ void FilterIndex(size_t min_weight, size_t max_flank) { DEBUG("Filtering by maximal allowed flanking length " << max_flank); FilterByDescription([=](const GapDescription &gap) { - return gap.edge_gap_start_position + max_flank < g_.length(gap.start) - || gap.edge_gap_end_position > max_flank; + return gap.left_trim() > max_flank + || gap.right_trim() > max_flank; }); DEBUG("Filtering by weight " << min_weight); @@ -243,7 +243,7 @@ } void AddGap(const GapDescription& p) { - if (IsCanonical(g_, p.start, p.end)) { + if (IsCanonical(g_, p.left(), p.right())) { HiddenAddGap(p); } else { HiddenAddGap(p.conjugate(g_)); @@ -277,38 +277,12 @@ } } -// void LoadFromFile(const string s) { -// FILE* file = fopen((s).c_str(), "r"); -// int res; -// char ss[5000]; -// map tmp_map; -// for (auto iter = g.ConstEdgeBegin(); !iter.IsEnd(); ++iter) { -// tmp_map[g.int_id(*iter)] = *iter; -// } -// while (!feof(file)) { -// int first_id, second_id, first_ind, second_ind; -// int size; -// res = fscanf(file, "%d %d\n", &first_id, &size); -// VERIFY(res == 2); -// for (int i = 0; i < size; i++) { -// res = fscanf(file, "%d %d\n", &first_id, &first_ind); -// VERIFY(res == 2); -// res = fscanf(file, "%d %d\n", &second_id, &second_ind); -// VERIFY(res == 2); -// res = fscanf(file, "%s\n", ss); -// VERIFY(res == 1); -// GapDescription gap(tmp_map[first_id], tmp_map[second_id], Sequence(ss), first_ind, second_ind); -// this->AddGap(gap); -// } -// } -// } - //edge_gaps must be sorted vector EdgePairGaps(const GapInfos& edge_gaps) const { vector answer; auto ep_start = edge_gaps.begin(); for (auto it = ep_start; it != edge_gaps.end(); ++it) { - if (it->end != ep_start->end) { + if (it->right() != ep_start->right()) { answer.push_back({ep_start, it}); ep_start = it; } @@ -414,7 +388,8 @@ bool CheckGapsValidity(const vector& gaps) const { vector answer; return std::all_of(gaps.begin(), gaps.end(), [&](const GapDescription &gap) { - return IsCanonical(g_, gap.start, gap.end) && gap.start != gap.end && gap.start != g_.conjugate(gap.end); + return IsCanonical(g_, gap.left(), gap.right()) && + gap.left() != gap.right() && gap.left() != g_.conjugate(gap.right()); }); } @@ -431,7 +406,7 @@ vector EdgesNeedingSplit(const SplitInfo& left_split_info, const SplitInfo& right_split_info) const { vector 
answer; - for (EdgeId e : key_set(left_split_info)) + for (EdgeId e : utils::key_set(left_split_info)) if (right_split_info.count(e)) answer.push_back(e); return answer; @@ -445,63 +420,63 @@ return (left_split + right_split) / 2; } - bool Update(EdgeId& e, size_t& gap_pos, EdgePair split_orig_ep, EdgePair split_res, bool gap_start) const { + bool UpdateLeft(GapDescription &gap, EdgePair split_orig_ep, EdgePair split_res) const { + EdgeId e = gap.left(); + EdgeId split_orig = split_orig_ep.first; if (e == split_orig_ep.second) { split_orig = split_orig_ep.second; split_res = Conjugate(g_, split_res); } + if (e == split_orig) { - if (gap_start) { - e = split_res.second; - gap_pos = gap_pos - g_.length(split_res.first); - } else { - e = split_res.first; - } + VERIFY(gap.left_trim() < g_.length(split_res.second)); + gap.set_left(split_res.second); return true; } + return false; } - void UpdateGap(GapDescription& gap, EdgePair split_orig, EdgePair split_res) const { - bool u1 = Update(gap.start, gap.edge_gap_start_position, split_orig, split_res, true); - bool u2 = Update(gap.end, gap.edge_gap_end_position, split_orig, split_res, false); - VERIFY(u1 != u2); - } + bool UpdateRight(GapDescription &gap, EdgePair split_orig_ep, EdgePair split_res) const { + EdgeId e = gap.right(); - bool CheckInsert(EdgeId e, set& used_edges) const { - return used_edges.insert(e).second; - } + EdgeId split_orig = split_orig_ep.first; + if (e == split_orig_ep.second) { + split_orig = split_orig_ep.second; + split_res = Conjugate(g_, split_res); + } - bool CheckInsert(const vector edges, set& used_edges) const { - for (EdgeId e : edges) { - if (!CheckInsert(e, used_edges)) { - return false; - } + if (e == split_orig) { + VERIFY(gap.right_trim() < g_.length(split_res.first)); + gap.set_right(split_res.first); + return true; } - return true; + + return false; + } + + void UpdateGap(GapDescription& gap, EdgePair split_orig, EdgePair split_res) const { + bool u1 = UpdateLeft(gap, split_orig, split_res); + bool u2 = UpdateRight(gap, split_orig, split_res); + VERIFY(u1 != u2); } std::set RelevantEdges(const GapDescription& gap) const { std::set answer; - answer.insert(gap.start); - answer.insert(g_.conjugate(gap.start)); - answer.insert(gap.end); - answer.insert(g_.conjugate(gap.end)); + answer.insert(gap.left()); + answer.insert(g_.conjugate(gap.left())); + answer.insert(gap.right()); + answer.insert(g_.conjugate(gap.right())); return answer; } bool CheckGaps(const vector& gaps) const { set used_edges; - for (const auto& gap : gaps) { - const auto relevant = RelevantEdges(gap); - //TODO check the semantics of all_of - if (!std::all_of(relevant.begin(), relevant.end(), [&](const EdgeId& e) { - return used_edges.insert(e).second; - })) { - return false; - } - } + for (const auto& gap : gaps) + for (EdgeId e : RelevantEdges(gap)) + if (!used_edges.insert(e).second) + return false; return true; } @@ -511,8 +486,8 @@ for (size_t i = 0; i < canonical_gaps.size(); ++i) { const auto& gap = canonical_gaps[i]; DEBUG("Processing gap " << gap.str(g_)); - Add(i, gap.start, gap.edge_gap_start_position, right_split_pos, left_split_pos); - Add(i, gap.end, gap.edge_gap_end_position, left_split_pos, right_split_pos); + Add(i, gap.left(), g_.length(gap.left()) - gap.left_trim(), right_split_pos, left_split_pos); + Add(i, gap.right(), gap.right_trim(), left_split_pos, right_split_pos); } set to_ignore; @@ -545,7 +520,7 @@ }; public: - MultiGapJoiner(Graph& g) : g_(g), inner_joiner_(g, true) { + MultiGapJoiner(Graph& g) : g_(g), 
inner_joiner_(g) { } //Resulting graph should be condensed @@ -587,39 +562,39 @@ return ss.str(); } - GapDescription ConstructConsensus(EdgeId start, - EdgeId end, - size_t edge_gap_start_position, - size_t edge_gap_end_position, + GapDescription ConstructConsensus(EdgeId left, + EdgeId right, + size_t left_trim, + size_t right_trim, const vector& gap_variants) const { DEBUG(gap_variants.size() << " gap closing variants, lengths: " << PrintLengths(gap_variants)); DEBUG("var size original " << gap_variants.size()); vector new_gap_variants(gap_variants.begin(), gap_variants.end()); new_gap_variants.resize(std::min(max_consensus_reads_, gap_variants.size())); auto s = consensus_(new_gap_variants); - DEBUG("consenus for " << g_.int_id(start) - << " and " << g_.int_id(end) + DEBUG("consenus for " << g_.int_id(left) + << " and " << g_.int_id(right) << " found: '" << s << "'"); - return GapDescription(start, end, + return GapDescription(left, right, Sequence(s), - edge_gap_start_position, edge_gap_end_position); + left_trim, right_trim); } //all gaps guaranteed to correspond to a single edge pair GapInfos PadGaps(gap_info_it start, gap_info_it end) const { - size_t start_min = std::numeric_limits::max(); - size_t end_max = 0; + size_t start_trim = 0; + size_t end_trim = 0; size_t long_seqs = 0; size_t short_seqs = 0; for (auto it = start; it != end; ++it) { const auto& gap = *it; - if (gap.gap_seq.size() > long_seq_limit_) + if (gap.filling_seq().size() > long_seq_limit_) long_seqs++; else short_seqs++; - start_min = std::min(start_min, gap.edge_gap_start_position); - end_max = std::max(end_max, gap.edge_gap_end_position); + start_trim = std::max(start_trim, gap.left_trim()); + end_trim = std::max(end_trim, gap.right_trim()); } const bool exclude_long_seqs = (short_seqs >= min_weight_ && short_seqs > long_seqs); @@ -628,19 +603,20 @@ for (auto it = start; it != end; ++it) { const auto& gap = *it; - if (exclude_long_seqs && gap.gap_seq.size() > long_seq_limit_) + if (exclude_long_seqs && gap.filling_seq().size() > long_seq_limit_) continue; - string s = g_.EdgeNucls(gap.start).Subseq(start_min + g_.k(), gap.edge_gap_start_position + g_.k()).str(); - s += gap.gap_seq.str(); - s += g_.EdgeNucls(gap.end).Subseq(gap.edge_gap_end_position, end_max).str(); - answer.push_back(GapDescription(gap.start, gap.end, Sequence(s), start_min, end_max)); + size_t start_nucl_size = g_.length(gap.left()) + g_.k(); + string s = g_.EdgeNucls(gap.left()).Subseq(start_nucl_size - start_trim, start_nucl_size - gap.left_trim()).str(); + s += gap.filling_seq().str(); + s += g_.EdgeNucls(gap.right()).Subseq(gap.right_trim(), end_trim).str(); + answer.push_back(GapDescription(gap.left(), gap.right(), Sequence(s), start_trim, end_trim)); } return answer; } GapDescription ConstructConsensus(gap_info_it start_it, gap_info_it end_it) const { - DEBUG("Considering extension " << g_.str(start_it->end)); + DEBUG("Considering extension " << g_.str(start_it->right())); size_t cur_len = end_it - start_it; //low weight connections filtered earlier @@ -656,7 +632,7 @@ vector gap_variants; std::transform(padded_gaps.begin(), padded_gaps.end(), std::back_inserter(gap_variants), [](const GapDescription& gap) { - return gap.gap_seq.str(); + return gap.filling_seq().str(); }); //for (auto it = start_it; it != end_it; ++it) { @@ -667,16 +643,16 @@ //} auto padded_gap = padded_gaps.front(); - return ConstructConsensus(padded_gap.start, padded_gap.end, - padded_gap.edge_gap_start_position, - padded_gap.edge_gap_end_position, + return 
ConstructConsensus(padded_gap.left(), padded_gap.right(), + padded_gap.left_trim(), + padded_gap.right_trim(), gap_variants); } GapDescription ConstructConsensus(EdgeId e) const { DEBUG("Constructing consensus for edge " << g_.str(e)); vector closures; - for (const auto& edge_pair_gaps : storage_.EdgePairGaps(get(storage_.inner_index(), e))) { + for (const auto& edge_pair_gaps : storage_.EdgePairGaps(utils::get(storage_.inner_index(), e))) { auto consensus = ConstructConsensus(edge_pair_gaps.first, edge_pair_gaps.second); if (consensus != INVALID_GAP) { closures.push_back(consensus); @@ -733,7 +709,7 @@ gap_joiner(ConstructConsensus()); - CompressAllVertices(g_, true, /*chunk_cnt*/100); + CompressAllVertices(g_, /*chunk_cnt*/100); return fate_tracker.Old2NewMapping(); }; diff -Nru spades-3.10.1+dfsg/src/projects/spades/launch.hpp spades-3.11.1+dfsg/src/projects/spades/launch.hpp --- spades-3.10.1+dfsg/src/projects/spades/launch.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/spades/launch.hpp 2017-09-28 09:05:12.000000000 +0000 @@ -96,13 +96,12 @@ SPAdes.add(new debruijn_graph::MismatchCorrection()); if (cfg::get().rr_enable) { if (two_step_rr) { - string prelim_prefix = "preliminary_"; if (cfg::get().use_intermediate_contigs) SPAdes.add(new debruijn_graph::PairInfoCount(true)) .add(new debruijn_graph::DistanceEstimation(true)) .add(new debruijn_graph::RepeatResolution(true)) - .add(new debruijn_graph::ContigOutput(true, true, prelim_prefix)) - .add(new debruijn_graph::SecondPhaseSetup(prelim_prefix)); + .add(new debruijn_graph::ContigOutput()) + .add(new debruijn_graph::SecondPhaseSetup()); SPAdes.add(new debruijn_graph::Simplification()); } @@ -120,10 +119,12 @@ //No graph modification allowed after HybridLibrariesAligning stage! 
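
The launch code around this hunk registers assembly stages by chaining add() calls on the SPAdes stage manager. A minimal sketch of such a fluent pipeline, using a hypothetical Pipeline/Stage pair rather than the real spades::StageManager:

    #include <iostream>
    #include <memory>
    #include <string>
    #include <vector>

    struct Stage {
        explicit Stage(std::string n) : name(std::move(n)) {}
        virtual ~Stage() = default;
        virtual void run() { std::cout << "running " << name << '\n'; }
        std::string name;
    };

    class Pipeline {
        std::vector<std::unique_ptr<Stage>> stages_;
    public:
        Pipeline &add(Stage *s) {          // takes ownership; returns *this to enable chaining
            stages_.emplace_back(s);
            return *this;
        }
        void run() { for (auto &s : stages_) s->run(); }
    };

    int main() {
        Pipeline spades;
        spades.add(new Stage("contig_output"))
              .add(new Stage("pair_info_count"))
              .add(new Stage("distance_estimation"))
              .add(new Stage("repeat_resolution"));
        spades.run();                      // stages execute in registration order
    }
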
-       SPAdes.add(new debruijn_graph::ContigOutput(false, false, "pre_pe_"))
+       SPAdes.add(new debruijn_graph::ContigOutput(false, "intermediate_contigs"))
              .add(new debruijn_graph::PairInfoCount())
              .add(new debruijn_graph::DistanceEstimation())
              .add(new debruijn_graph::RepeatResolution());
+   } else {
+       SPAdes.add(new debruijn_graph::ContigOutput(false));
    }
    SPAdes.add(new debruijn_graph::ContigOutput());
@@ -131,7 +132,7 @@
    SPAdes.run(conj_gp, cfg::get().entry_point.c_str());
    // For informing spades.py about estimated params
-   debruijn_graph::config::write_lib_data(path::append_path(cfg::get().output_dir, "final"));
+   debruijn_graph::config::write_lib_data(fs::append_path(cfg::get().output_dir, "final"));
    INFO("SPAdes finished");
}
diff -Nru spades-3.10.1+dfsg/src/projects/spades/main.cpp spades-3.11.1+dfsg/src/projects/spades/main.cpp
--- spades-3.10.1+dfsg/src/projects/spades/main.cpp	2017-02-28 14:55:37.000000000 +0000
+++ spades-3.11.1+dfsg/src/projects/spades/main.cpp	2017-09-28 09:05:12.000000000 +0000
@@ -10,15 +10,15 @@
 */
#include "utils/logger/log_writers.hpp"
-#include "utils/memory_limit.hpp"
+#include "utils/perf/memory_limit.hpp"
#include "utils/segfault_handler.hpp"
#include "launch.hpp"
-#include "utils/copy_file.hpp"
+#include "utils/filesystem/copy_file.hpp"
#include "version.hpp"
void load_config(const vector& cfg_fns) {
    for (const auto& s : cfg_fns) {
-       path::CheckFileExistenceFATAL(s);
+       fs::CheckFileExistenceFATAL(s);
    }
    cfg::create_instance(cfg_fns);
@@ -41,16 +41,17 @@
    string log_props_file = cfg::get().log_filename;
-   if (!path::FileExists(log_props_file))
-       log_props_file = path::append_path(dir, cfg::get().log_filename);
+   if (!fs::FileExists(log_props_file))
+       log_props_file = fs::append_path(dir, cfg::get().log_filename);
-   logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : "");
+   logger *lg = create_logger(fs::FileExists(log_props_file) ? log_props_file : "");
    lg->add_writer(std::make_shared());
+   //lg->add_writer(std::make_shared(std::make_shared()));
    attach_logger(lg);
}
int main(int argc, char **argv) {
-   perf_counter pc;
+   utils::perf_counter pc;
    const size_t GB = 1 << 30;
@@ -60,7 +61,7 @@
    try {
        using namespace debruijn_graph;
-       string cfg_dir = path::parent_path(argv[1]);
+       string cfg_dir = fs::parent_path(argv[1]);
        vector cfg_fns;
        for (int i = 1; i < argc; ++i) {
@@ -79,7 +80,7 @@
        // read configuration file (dataset path etc.)
-       limit_memory(cfg::get().max_memory * GB);
+       utils::limit_memory(cfg::get().max_memory * GB);
        // assemble it!
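
utils::limit_memory above caps the assembler's RAM budget before the run starts. One plausible way to implement such a cap on POSIX systems is setrlimit(RLIMIT_AS, ...); the sketch below shows that technique as an assumption about the general approach, not a copy of the actual SPAdes implementation.

    #include <sys/resource.h>
    #include <cstddef>
    #include <cstdio>

    // Cap the process address space; allocations beyond `bytes` will fail.
    bool LimitMemory(size_t bytes) {
        struct rlimit rl;
        if (getrlimit(RLIMIT_AS, &rl) != 0)   // read the current limit first
            return false;
        rl.rlim_cur = bytes;                  // lower only the soft limit
        return setrlimit(RLIMIT_AS, &rl) == 0;
    }

    int main() {
        const size_t GB = 1 << 30;
        if (!LimitMemory(250 * GB))           // mirrors max_memory * GB above
            std::perror("setrlimit");
    }
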
INFO("Starting SPAdes, built from " diff -Nru spades-3.10.1+dfsg/src/projects/spades/pair_info_count.cpp spades-3.11.1+dfsg/src/projects/spades/pair_info_count.cpp --- spades-3.10.1+dfsg/src/projects/spades/pair_info_count.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/spades/pair_info_count.cpp 2017-09-28 09:05:11.000000000 +0000 @@ -14,6 +14,8 @@ #include "modules/alignment/bwa_sequence_mapper.hpp" #include "paired_info/pair_info_filler.hpp" #include "modules/path_extend/split_graph_pair_info.hpp" +#include "modules/alignment/rna/ss_coverage_filler.hpp" + #include "adt/bf.hpp" #include "adt/hll.hpp" @@ -110,6 +112,64 @@ EdgePairCounter counter_; }; +static bool HasGoodRRLibs() { + for (const auto &lib : cfg::get().ds.reads) { + if (lib.is_contig_lib()) + continue; + + if (lib.is_paired() && + lib.data().mean_insert_size == 0.0) + continue; + + if (lib.is_repeat_resolvable()) + return true; + } + + return false; +} + +static bool HasOnlyMP() { + for (const auto &lib : cfg::get().ds.reads) { + if (lib.type() == io::LibraryType::PathExtendContigs) + continue; + + if (lib.type() != io::LibraryType::MatePairs && + lib.type() != io::LibraryType::HQMatePairs) + return false; + } + + return true; +} + +static bool ShouldObtainLibCoverage() { + return cfg::get().calculate_coverage_for_each_lib; +} + +//todo improve logic +static bool ShouldObtainSingleReadsPaths(size_t ilib) { + using config::single_read_resolving_mode; + switch (cfg::get().single_reads_rr) { + case single_read_resolving_mode::all: + return true; + case single_read_resolving_mode::only_single_libs: + //Map when no PacBio/paried libs or only mate-pairs or single lib itself + if (!HasGoodRRLibs() || HasOnlyMP() || + cfg::get().ds.reads[ilib].type() == io::LibraryType::SingleReads) { + if (cfg::get().mode != debruijn_graph::config::pipeline_type::meta) { + return true; + } else { + WARN("Single reads are not used in metagenomic mode"); + } + } + break; + case single_read_resolving_mode::none: + break; + default: + VERIFY_MSG(false, "Invalid mode value"); + } + return false; +} + static bool CollectLibInformation(const conj_graph_pack &gp, size_t &edgepairs, size_t ilib, size_t edge_length_threshold) { @@ -117,7 +177,7 @@ InsertSizeCounter hist_counter(gp, edge_length_threshold); EdgePairCounterFiller pcounter(cfg::get().max_threads); - SequenceMapperNotifier notifier(gp); + SequenceMapperNotifier notifier(gp, cfg::get_writable().ds.reads.lib_count()); notifier.Subscribe(ilib, &hist_counter); notifier.Subscribe(ilib, &pcounter); @@ -157,19 +217,30 @@ } // FIXME: This needs to be static -void ProcessSingleReads(conj_graph_pack &gp, +static void ProcessSingleReads(conj_graph_pack &gp, size_t ilib, bool use_binary = true, bool map_paired = false) { //FIXME make const auto& reads = cfg::get_writable().ds.reads[ilib]; - SequenceMapperNotifier notifier(gp); - //FIXME pretty awful, would be much better if listeners were shared ptrs + SequenceMapperNotifier notifier(gp, cfg::get_writable().ds.reads.lib_count()); + LongReadMapper read_mapper(gp.g, gp.single_long_reads[ilib], ChooseProperReadPathExtractor(gp.g, reads.type())); - notifier.Subscribe(ilib, &read_mapper); + if (ShouldObtainSingleReadsPaths(ilib) || reads.is_contig_lib()) { + //FIXME pretty awful, would be much better if listeners were shared ptrs + notifier.Subscribe(ilib, &read_mapper); + cfg::get_writable().ds.reads[ilib].data().single_reads_mapped = true; + } + + SSCoverageFiller ss_coverage_filler(gp.g, gp.ss_coverage[ilib], 
!cfg::get().ss.ss_enabled); + if (cfg::get().calculate_coverage_for_each_lib) { + INFO("Will calculate lib coverage as well"); + map_paired = true; + notifier.Subscribe(ilib, &ss_coverage_filler); + } auto mapper_ptr = ChooseProperMapper(gp, reads, cfg::get().bwa.bwa_enable); if (use_binary) { @@ -180,9 +251,9 @@ map_paired, /*handle Ns*/false); notifier.ProcessLibrary(single_streams, ilib, *mapper_ptr); } - cfg::get_writable().ds.reads[ilib].data().single_reads_mapped = true; } + static void ProcessPairedReads(conj_graph_pack &gp, std::unique_ptr filter, unsigned filter_threshold, size_t ilib) { @@ -197,7 +268,7 @@ round_thr = unsigned(std::min(cfg::get().de.max_distance_coeff * data.insert_size_deviation * cfg::get().de.rounding_coeff, cfg::get().de.rounding_thr)); - SequenceMapperNotifier notifier(gp); + SequenceMapperNotifier notifier(gp, cfg::get_writable().ds.reads.lib_count()); INFO("Left insert size quantile " << data.insert_size_left_quantile << ", right insert size quantile " << data.insert_size_right_quantile << ", filtering threshold " << filter_threshold << @@ -237,60 +308,6 @@ cfg::get_writable().ds.reads[ilib].data().pi_threshold = split_graph.GetThreshold(); } -static bool HasGoodRRLibs() { - for (const auto &lib : cfg::get().ds.reads) { - if (lib.is_contig_lib()) - continue; - - if (lib.is_paired() && - lib.data().mean_insert_size == 0.0) - continue; - - if (lib.is_repeat_resolvable()) - return true; - } - - return false; -} - -static bool HasOnlyMP() { - for (const auto &lib : cfg::get().ds.reads) { - if (lib.type() == io::LibraryType::PathExtendContigs) - continue; - - if (lib.type() != io::LibraryType::MatePairs && - lib.type() != io::LibraryType::HQMatePairs) - return false; - } - - return true; -} - -//todo improve logic -static bool ShouldMapSingleReads(size_t ilib) { - using config::single_read_resolving_mode; - switch (cfg::get().single_reads_rr) { - case single_read_resolving_mode::all: - return true; - case single_read_resolving_mode::only_single_libs: - //Map when no PacBio/paried libs or only mate-pairs or single lib itself - if (!HasGoodRRLibs() || HasOnlyMP() || - cfg::get().ds.reads[ilib].type() == io::LibraryType::SingleReads) { - if (cfg::get().mode != debruijn_graph::config::pipeline_type::meta) { - return true; - } else { - WARN("Single reads are not used in metagenomic mode"); - } - } - break; - case single_read_resolving_mode::none: - break; - default: - VERIFY_MSG(false, "Invalid mode value"); - } - return false; -} - void PairInfoCount::run(conj_graph_pack &gp, const char *) { gp.InitRRIndices(); gp.EnsureBasicMapping(); @@ -350,7 +367,7 @@ INFO("Filtering data for library #" << i); { - SequenceMapperNotifier notifier(gp); + SequenceMapperNotifier notifier(gp, cfg::get_writable().ds.reads.lib_count()); DEFilter filter_counter(*filter, gp.g); notifier.Subscribe(i, &filter_counter); @@ -367,8 +384,8 @@ } } - if (ShouldMapSingleReads(i)) { - cfg::get_writable().use_single_reads = true; + if (ShouldObtainSingleReadsPaths(i) || ShouldObtainLibCoverage()) { + cfg::get_writable().use_single_reads |= ShouldObtainSingleReadsPaths(i); INFO("Mapping single reads of library #" << i); ProcessSingleReads(gp, i, /*use_binary*/true, /*map_paired*/true); INFO("Total paths obtained from single reads: " << gp.single_long_reads[i].size()); diff -Nru spades-3.10.1+dfsg/src/projects/spades/repeat_resolving.cpp spades-3.11.1+dfsg/src/projects/spades/repeat_resolving.cpp --- spades-3.10.1+dfsg/src/projects/spades/repeat_resolving.cpp 2017-02-28 14:55:37.000000000 +0000 +++ 
spades-3.11.1+dfsg/src/projects/spades/repeat_resolving.cpp 2017-09-28 09:05:12.000000000 +0000 @@ -16,6 +16,7 @@ static void PEResolving(conj_graph_pack& gp) { path_extend::PathExtendParamsContainer params(cfg::get().ds, cfg::get().pe_params, + cfg::get().ss, cfg::get().output_dir, cfg::get().mode, cfg::get().uneven_depth, diff -Nru spades-3.10.1+dfsg/src/projects/spades/second_phase_setup.cpp spades-3.11.1+dfsg/src/projects/spades/second_phase_setup.cpp --- spades-3.10.1+dfsg/src/projects/spades/second_phase_setup.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/spades/second_phase_setup.cpp 2017-09-28 09:05:11.000000000 +0000 @@ -21,10 +21,10 @@ gp.ClearRRIndices(); gp.ClearPaths(); - std::string old_pe_contigs_filename = cfg::get().output_dir + contig_name_prefix_ + "final_contigs.fasta"; + std::string old_pe_contigs_filename = cfg::get().output_dir + "final_contigs.fasta"; std::string new_pe_contigs_filename = cfg::get().output_dir + "first_pe_contigs.fasta"; - VERIFY(path::check_existence(old_pe_contigs_filename)); + VERIFY(fs::check_existence(old_pe_contigs_filename)); INFO("Moving preliminary contigs from " << old_pe_contigs_filename << " to " << new_pe_contigs_filename); int code = rename(old_pe_contigs_filename.c_str(), new_pe_contigs_filename.c_str()); VERIFY(code == 0); diff -Nru spades-3.10.1+dfsg/src/projects/spades/second_phase_setup.hpp spades-3.11.1+dfsg/src/projects/spades/second_phase_setup.hpp --- spades-3.10.1+dfsg/src/projects/spades/second_phase_setup.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/spades/second_phase_setup.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -12,12 +12,9 @@ //todo rename class SecondPhaseSetup : public spades::AssemblyStage { -private: - string contig_name_prefix_; - public: - SecondPhaseSetup(const string& contig_name_prefix = "") - : AssemblyStage("Second Phase Setup", "second_phase_setup"),contig_name_prefix_(contig_name_prefix) { } + SecondPhaseSetup() + : AssemblyStage("Second Phase Setup", "second_phase_setup") { } void run(conj_graph_pack &gp, const char *); }; diff -Nru spades-3.10.1+dfsg/src/projects/spades/series_analysis.cpp spades-3.11.1+dfsg/src/projects/spades/series_analysis.cpp --- spades-3.10.1+dfsg/src/projects/spades/series_analysis.cpp 1970-01-01 00:00:00.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/spades/series_analysis.cpp 2017-09-28 09:05:12.000000000 +0000 @@ -0,0 +1,321 @@ +//*************************************************************************** +//* Copyright (c) 2016-2017 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#include "assembly_graph/handlers/id_track_handler.hpp" +#include "assembly_graph/graph_support/graph_processing_algorithm.hpp" +#include "assembly_graph/graph_support/basic_edge_conditions.hpp" +#include "modules/simplification/tip_clipper.hpp" +#include "projects/mts/contig_abundance.hpp" +#include "io/reads/osequencestream.hpp" +#include "series_analysis.hpp" + +#include "llvm/Support/YAMLParser.h" +#include "llvm/Support/YAMLTraits.h" + +namespace debruijn_graph { + +struct SeriesAnalysisConfig { + uint k; + uint sample_cnt; + uint frag_size; + uint min_len; + + std::string kmer_mult, bin, bin_prof, edges_sqn, edges_mpl, edge_fragments_mpl; +}; + +} + +namespace llvm { namespace yaml { + +template<> struct MappingTraits { + static void mapping(IO& io, debruijn_graph::SeriesAnalysisConfig& cfg) { + io.mapRequired("k", cfg.k); + io.mapRequired("sample_cnt", cfg.sample_cnt); + io.mapRequired("kmer_mult", cfg.kmer_mult); + io.mapRequired("bin", cfg.bin); + io.mapRequired("bin_prof", cfg.bin_prof); + io.mapRequired("min_len", cfg.min_len); + io.mapRequired("edges_sqn", cfg.edges_sqn); + io.mapRequired("edges_mpl", cfg.edges_mpl); + io.mapRequired("edge_fragments_mpl", cfg.edge_fragments_mpl); + io.mapRequired("frag_size", cfg.frag_size); + } +}; + +} } + +namespace debruijn_graph { + +template +shared_ptr> DefaultGPColorer( + const graph_pack& gp) { + io::SingleRead genome("ref", gp.genome.str()); + auto mapper = MapperInstance(gp); + auto path1 = mapper->MapRead(genome).path(); + auto path2 = mapper->MapRead(!genome).path(); + return visualization::graph_colorer::DefaultColorer(gp.g, path1, path2); +} + +inline double l2_norm(const AbundanceVector& v) { + double s = 0.; + for (auto val : v) { + s += val * val; + } + return std::sqrt(s); +} + +inline double cosine_sim(const AbundanceVector& v1, const AbundanceVector& v2) { + double s = 0.; + for (size_t i = 0; i < v1.size(); ++i) { + s += v1[i] * v2[i]; + } + return s / (l2_norm(v1) * l2_norm(v2)); +} + +template +class EdgeAbundance: public omnigraph::GraphActionHandler { + typedef map Storage; + typedef Storage::const_iterator const_iterator; + Storage edge_abundance_; + const ContigAbundanceCounter& abundance_counter_; + +public: + EdgeAbundance(const Graph& g, const ContigAbundanceCounter& abundance_counter) : + omnigraph::GraphActionHandler(g, "EdgeAbundance"), + abundance_counter_(abundance_counter){} + + void Fill() { + for (auto it = this->g().ConstEdgeBegin(true); !it.IsEnd(); ++it) { + HandleAdd(*it); + } + } + + virtual void HandleAdd(EdgeId e) override { + auto ab = abundance_counter_(this->g().EdgeNucls(e).str()); + if (!ab) { + INFO("Couldn't estimate abundance of edge " << this->g().str(e)); + } else { + edge_abundance_[e] = *ab; + } + } + + const_iterator begin() const { + return edge_abundance_.begin(); + } + + const_iterator end() const { + return edge_abundance_.end(); + } + + const_iterator find(EdgeId e) const { + return edge_abundance_.find(e); + } + + size_t count(EdgeId e) const { + return edge_abundance_.count(e); + } + +private: + DECL_LOGGER("EdgeAbundance"); +}; + +template +class AggressiveClearing: public omnigraph::EdgeProcessingAlgorithm { + typedef typename Graph::EdgeId EdgeId; + const EdgeAbundance& edge_abundance_; + const AbundanceVector base_profile_; + const double similarity_threshold_; + const double norm_ratio_threshold_; + EdgeRemover edge_remover_; + func::TypedPredicate topological_condition_; + +protected: + 
virtual bool ProcessEdge(EdgeId e) override { + DEBUG("Processing edge " << this->g().str(e)); + if (!topological_condition_(e)) { + DEBUG("Topological condition failed"); + return false; + } + auto it = edge_abundance_.find(e); + if (it == edge_abundance_.end()) { + DEBUG("Edge " << this->g().str(e) << " did not have valid abundance profile"); + return false; + } + const auto& profile = it->second; + DEBUG("Edge profile " << PrintVector(profile)); + double sim = cosine_sim(profile, base_profile_); + double norm_ratio = l2_norm(profile) / l2_norm(base_profile_); + + DEBUG("Similarity between edge and base profiles " << sim); + DEBUG("Norm ratio " << norm_ratio); + if (math::ls(norm_ratio, norm_ratio_threshold_) + || math::ls(sim, similarity_threshold_)) { + DEBUG("Removing edge " << this->g().str(e)); + + edge_remover_.DeleteEdge(e); + return true; + } + return false; + } + +public: + AggressiveClearing(Graph &g, + const EdgeAbundance& edge_abundance, + const AbundanceVector& base_profile, + double similarity_threshold, + double norm_ratio_threshold, + const std::function &removal_handler = 0) : + EdgeProcessingAlgorithm(g, true), + edge_abundance_(edge_abundance), + base_profile_(base_profile), + similarity_threshold_(similarity_threshold), + norm_ratio_threshold_(norm_ratio_threshold), + edge_remover_(g, removal_handler), + topological_condition_(func::Or(AlternativesPresenceCondition(g), TipCondition(g))) { + DEBUG("Base profile " << PrintVector(base_profile_)); + } +private: + DECL_LOGGER("AggressiveClearing"); +}; + +boost::optional InferAbundance(const std::string& bin_mult_fn, + const std::string& b_id) { + fs::CheckFileExistenceFATAL(bin_mult_fn); + + ifstream is(bin_mult_fn); + std::vector abundances; + std::string name; + while (true) { + is >> name; + if (!is.fail()) { + AbundanceVector vec(SampleCount(), 0.0); + for (size_t i = 0; i < SampleCount(); ++i) { + is >> vec[i]; + VERIFY(!is.fail()); + } + if (name == b_id) { + abundances.push_back(vec); + } + } else { + INFO("Read " << abundances.size() << " profiles for bin " << b_id); + break; + } + } + return boost::optional(MeanVector(abundances)); +} + +void PrintEdgeFragmentProfiles(const conj_graph_pack &gp, const ContigAbundanceCounter &abundance_counter, + size_t split_length, size_t min_len, std::ostream &os) { + for (auto it = gp.g.ConstEdgeBegin(true); !it.IsEnd(); ++it) { + EdgeId e = *it; + io::SingleRead full_contig(std::to_string(gp.g.int_id(e)), gp.g.EdgeNucls(e).str()); + for (size_t i = 0; i < full_contig.size(); i += split_length) { + if (full_contig.size() - i < min_len) { + DEBUG("Fragment shorter than min_length_bound " << min_len); + break; + } + + io::SingleRead contig = full_contig.Substr(i, std::min(i + split_length, full_contig.size())); + + DEBUG("Processing fragment # " << (i / split_length) << " with id " << contig.name()); + + auto abundance_vec = abundance_counter(contig.GetSequenceString(), contig.name()); + + if (abundance_vec) { + size_t len = contig.GetSequenceString().size(); + os << contig.name() << " " << len << " " << PrintVector(*abundance_vec) << std::endl; + //copy(abundance_vec->begin(), abundance_vec->begin() + config.sample_cnt, + // ostream_iterator(ss, " ")); + DEBUG("Successfully estimated abundance of " << contig.name()); + } else { + DEBUG("Failed to estimate abundance of " << contig.name()); + } + } + } +} + +void SeriesAnalysis::run(conj_graph_pack &gp, const char *) { + std::string cfg = cfg::get().series_analysis; + INFO("Series analysis enabled with config " << cfg); + + auto 
buf = llvm::MemoryBuffer::getFile(cfg); + VERIFY_MSG(buf, "Failed to load config file " + cfg); + + llvm::yaml::Input yin(*buf.get()); + SeriesAnalysisConfig config; + yin >> config; + + SetSampleCount(config.sample_cnt); + + ContigAbundanceCounter abundance_counter(config.k, + make_shared<TrivialClusterAnalyzer>(), + cfg::get().tmp_dir); + + DEBUG("Initiating abundance counter"); + abundance_counter.Init(config.kmer_mult); + DEBUG("Abundance counter ready"); + + if (!config.edges_sqn.empty()) { + io::OutputSequenceStream oss(config.edges_sqn); + for (auto it = gp.g.ConstEdgeBegin(true); !it.IsEnd(); ++it) { + EdgeId e = *it; + string s = gp.g.EdgeNucls(e).str(); + oss << io::SingleRead(io::MakeContigId(gp.g.int_id(e), s.size()), s); + } + } + + if (!config.edges_mpl.empty()) { + ofstream os(config.edges_mpl); + PrintEdgeFragmentProfiles(gp, abundance_counter, -1ul, config.min_len, os); + } + + if (!config.edge_fragments_mpl.empty()) { + ofstream os(config.edge_fragments_mpl); + PrintEdgeFragmentProfiles(gp, abundance_counter, config.frag_size, config.min_len, os); + } + +// boost::optional<AbundanceVector> bin_profile = InferAbundance(config.bin_prof, config.bin); +// if (!bin_profile) { +// ERROR("Couldn't estimate profile of bin"); +// return; +// } +// +// EdgeAbundance<Graph> edge_abundance(gp.g, abundance_counter); +// edge_abundance.Fill(); +// +// gp.EnsureBasicMapping(); +// gp.FillQuality(); +// visualization::graph_labeler::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos); +// auto colorer = DefaultGPColorer(gp); +// +// /* +// fs::make_dir(cfg::get().output_dir + "pictures/"); +// QualityEdgeLocalityPrintingRH<Graph> qual_removal_handler(gp.g, gp.edge_qual, labeler, colorer, +// cfg::get().output_dir + "pictures/"); +// +// INFO("Launching aggressive graph clearing"); +// //positive quality edges removed (folder colored_edges_deleted) +// AggressiveClearing<Graph> clearing(gp.g, edge_abundance, +// *bin_profile, 0.8, 0.3, [&](EdgeId e) { +// qual_removal_handler.HandleDelete(e);}); +// clearing.Run(); +// INFO("Graph clearing finished"); +// */ +// +// INFO("Drawing edges with failed abundance estimate") +// fs::make_dir(cfg::get().output_dir + "pictures_no_ab/"); +// QualityEdgeLocalityPrintingRH<Graph> qual_removal_handler2(gp.g, gp.edge_qual, labeler, colorer, +// cfg::get().output_dir + "pictures_no_ab/"); +// +// for (auto it = gp.g.ConstEdgeBegin(true); !it.IsEnd(); ++it) { +// EdgeId e = *it; +// if (edge_abundance.count(e) == 0) { +// qual_removal_handler2.HandleDelete(e); +// } +// } +} + +}
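The AggressiveClearing pass above clears an edge that passes the topological check whenever its abundance profile is either much weaker than the bin profile (norm ratio below the threshold) or poorly correlated with it (cosine similarity below the threshold); the commented-out invocation in run() uses 0.8 and 0.3. A minimal Python sketch of the same decision rule, with illustrative names and plain lists standing in for AbundanceVector:

    import math

    def l2_norm(v):
        # Euclidean norm of a per-sample abundance profile
        return math.sqrt(sum(x * x for x in v))

    def cosine_sim(v1, v2):
        return sum(a * b for a, b in zip(v1, v2)) / (l2_norm(v1) * l2_norm(v2))

    def should_clear(profile, base_profile,
                     similarity_threshold=0.8, norm_ratio_threshold=0.3):
        # Mirrors AggressiveClearing::ProcessEdge: remove the edge if its
        # profile is much shallower than the bin profile OR points in a
        # clearly different direction across the sample series.
        norm_ratio = l2_norm(profile) / l2_norm(base_profile)
        sim = cosine_sim(profile, base_profile)
        return norm_ratio < norm_ratio_threshold or sim < similarity_threshold

    # An edge at a tenth of the bin's depth is cleared even though the
    # two profiles are perfectly correlated:
    assert should_clear([1.0, 0.9], [10.0, 9.0])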
diff -Nru spades-3.10.1+dfsg/src/projects/spades/series_analysis.hpp spades-3.11.1+dfsg/src/projects/spades/series_analysis.hpp --- spades-3.10.1+dfsg/src/projects/spades/series_analysis.hpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/spades/series_analysis.hpp 2017-09-28 09:05:11.000000000 +0000 @@ -1,239 +1,11 @@ #pragma once #include "pipeline/stage.hpp" -#include "assembly_graph/graph_support/graph_processing_algorithm.hpp" -#include "assembly_graph/graph_support/basic_edge_conditions.hpp" -#include "modules/simplification/tip_clipper.hpp" -#include "projects/mts/contig_abundance.hpp" -#include "io/reads/osequencestream.hpp" - -#include "llvm/Support/YAMLParser.h" -#include "llvm/Support/YAMLTraits.h" namespace debruijn_graph { -struct SeriesAnalysisConfig { - uint k; - uint sample_cnt; - uint frag_size; - uint min_len; - - std::string kmer_mult, bin, bin_prof, edges_sqn, edges_mpl, edge_fragments_mpl; -}; - -} - -namespace llvm { namespace yaml { - -template<> struct MappingTraits<debruijn_graph::SeriesAnalysisConfig> { - static void mapping(IO& io,
debruijn_graph::SeriesAnalysisConfig& cfg) { - io.mapRequired("k", cfg.k); - io.mapRequired("sample_cnt", cfg.sample_cnt); - io.mapRequired("kmer_mult", cfg.kmer_mult); - io.mapRequired("bin", cfg.bin); - io.mapRequired("bin_prof", cfg.bin_prof); - io.mapRequired("min_len", cfg.min_len); - io.mapRequired("edges_sqn", cfg.edges_sqn); - io.mapRequired("edges_mpl", cfg.edges_mpl); - io.mapRequired("edge_fragments_mpl", cfg.edge_fragments_mpl); - io.mapRequired("frag_size", cfg.frag_size); - } -}; - -} } - -namespace debruijn_graph { - -template<class graph_pack> -shared_ptr<visualization::graph_colorer::GraphColorer<typename graph_pack::graph_t>> DefaultGPColorer( - const graph_pack& gp) { - io::SingleRead genome("ref", gp.genome.str()); - auto mapper = MapperInstance(gp); - auto path1 = mapper->MapRead(genome).path(); - auto path2 = mapper->MapRead(!genome).path(); - return visualization::graph_colorer::DefaultColorer(gp.g, path1, path2); -} - -inline double l2_norm(const AbundanceVector& v) { - double s = 0.; - for (auto val : v) { - s += val * val; - } - return std::sqrt(s); -} - -inline double cosine_sim(const AbundanceVector& v1, const AbundanceVector& v2) { - double s = 0.; - for (size_t i = 0; i < v1.size(); ++i) { - s += v1[i] * v2[i]; - } - return s / (l2_norm(v1) * l2_norm(v2)); -} - -template<class Graph> -class EdgeAbundance: public omnigraph::GraphActionHandler<Graph> { - typedef map<EdgeId, AbundanceVector> Storage; - typedef Storage::const_iterator const_iterator; - Storage edge_abundance_; - const ContigAbundanceCounter& abundance_counter_; - -public: - EdgeAbundance(const Graph& g, const ContigAbundanceCounter& abundance_counter) : - omnigraph::GraphActionHandler<Graph>(g, "EdgeAbundance"), - abundance_counter_(abundance_counter){} - - void Fill() { - for (auto it = this->g().ConstEdgeBegin(true); !it.IsEnd(); ++it) { - HandleAdd(*it); - } - } - - virtual void HandleAdd(EdgeId e) override { - auto ab = abundance_counter_(this->g().EdgeNucls(e).str()); - if (!ab) { - INFO("Couldn't estimate abundance of edge " << this->g().str(e)); - } else { - edge_abundance_[e] = *ab; - } - } - - const_iterator begin() const { - return edge_abundance_.begin(); - } - - const_iterator end() const { - return edge_abundance_.end(); - } - - const_iterator find(EdgeId e) const { - return edge_abundance_.find(e); - } - - size_t count(EdgeId e) const { - return edge_abundance_.count(e); - } - -private: - DECL_LOGGER("EdgeAbundance"); -}; - -template<class Graph> -class AggressiveClearing: public omnigraph::EdgeProcessingAlgorithm<Graph> { - typedef typename Graph::EdgeId EdgeId; - const EdgeAbundance<Graph>& edge_abundance_; - const AbundanceVector base_profile_; - const double similarity_threshold_; - const double norm_ratio_threshold_; - EdgeRemover<Graph> edge_remover_; - func::TypedPredicate<EdgeId> topological_condition_; - -protected: - virtual bool ProcessEdge(EdgeId e) override { - DEBUG("Processing edge " << this->g().str(e)); - if (!topological_condition_(e)) { - DEBUG("Topological condition failed"); - return false; - } - auto it = edge_abundance_.find(e); - if (it == edge_abundance_.end()) { - DEBUG("Edge " << this->g().str(e) << " did not have valid abundance profile"); - return false; - } - const auto& profile = it->second; - DEBUG("Edge profile " << PrintVector(profile)); - double sim = cosine_sim(profile, base_profile_); - double norm_ratio = l2_norm(profile) / l2_norm(base_profile_); - - DEBUG("Similarity between edge and base profiles " << sim); - DEBUG("Norm ratio " << norm_ratio); - if (math::ls(norm_ratio, norm_ratio_threshold_) - || math::ls(sim, similarity_threshold_)) { - DEBUG("Removing edge " << this->g().str(e)); - - edge_remover_.DeleteEdge(e); - return true; - } - return false; - } - -public: - AggressiveClearing(Graph &g, - const EdgeAbundance<Graph>& edge_abundance, - const AbundanceVector& base_profile, - double similarity_threshold, - double norm_ratio_threshold, - const std::function<void(EdgeId)> &removal_handler = 0) : - EdgeProcessingAlgorithm<Graph>(g, true), - edge_abundance_(edge_abundance), - base_profile_(base_profile), - similarity_threshold_(similarity_threshold), - norm_ratio_threshold_(norm_ratio_threshold), - edge_remover_(g, removal_handler), - topological_condition_(func::Or(AlternativesPresenceCondition<Graph>(g), TipCondition<Graph>(g))) { - DEBUG("Base profile " << PrintVector(base_profile_)); - } -private: - DECL_LOGGER("AggressiveClearing"); -}; - class SeriesAnalysis : public spades::AssemblyStage { - boost::optional<AbundanceVector> InferAbundance(const std::string& bin_mult_fn, - const std::string& b_id) const { - path::CheckFileExistenceFATAL(bin_mult_fn); - - ifstream is(bin_mult_fn); - vector<AbundanceVector> abundances; - while (true) { - string name; - is >> name; - if (!is.fail()) { - AbundanceVector vec(SampleCount(), 0.0); - for (size_t i = 0; i < SampleCount(); ++i) { - is >> vec[i]; - VERIFY(!is.fail()); - } - if (name == b_id) { - abundances.push_back(vec); - } - } else { - INFO("Read " << abundances.size() << " profiles for bin " << b_id); - break; - } - } - return boost::optional<AbundanceVector>(MeanVector(abundances)); - } - - void PrintEdgeFragmentProfiles(const conj_graph_pack &gp, const ContigAbundanceCounter &abundance_counter, - size_t split_length, size_t min_len, std::ostream &os) const { - for (auto it = gp.g.ConstEdgeBegin(true); !it.IsEnd(); ++it) { - EdgeId e = *it; - io::SingleRead full_contig(ToString(gp.g.int_id(e)), gp.g.EdgeNucls(e).str()); - for (size_t i = 0; i < full_contig.size(); i += split_length) { - if (full_contig.size() - i < min_len) { - DEBUG("Fragment shorter than min_length_bound " << min_len); - break; - } - - io::SingleRead contig = full_contig.Substr(i, std::min(i + split_length, full_contig.size())); - - DEBUG("Processing fragment # " << (i / split_length) << " with id " << contig.name()); - - auto abundance_vec = abundance_counter(contig.GetSequenceString(), contig.name()); - - if (abundance_vec) { - size_t len = contig.GetSequenceString().size(); - os << contig.name() << " " << len << " " << PrintVector(*abundance_vec) << std::endl; - //copy(abundance_vec->begin(), abundance_vec->begin() + config.sample_cnt, - // ostream_iterator(ss, " ")); - DEBUG("Successfully estimated abundance of " << contig.name()); - } else { - DEBUG("Failed to estimate abundance of " << contig.name()); - } - } - } - } - public: SeriesAnalysis() : AssemblyStage("Series Analysis", "series_analysis") { } @@ -241,83 +13,7 @@ void save(const conj_graph_pack &, const std::string &, const char *) const { } - void run(conj_graph_pack &gp, const char *) { - std::string cfg = cfg::get().series_analysis; - INFO("Series analysis enabled with config " << cfg); - - auto Buf = llvm::MemoryBuffer::getFile(cfg); - VERIFY_MSG(Buf, "Failed to load config file " + cfg); - - llvm::yaml::Input yin(*Buf.get()); - SeriesAnalysisConfig config; - yin >> config; - - SetSampleCount(config.sample_cnt); - - ContigAbundanceCounter abundance_counter(config.k, - SingleClusterAnalyzer(2., 0.4), - cfg::get().tmp_dir); - - DEBUG("Initiating abundance counter"); - abundance_counter.Init(config.kmer_mult); - DEBUG("Abundance counter ready"); - - if (!config.edges_sqn.empty()) { - io::osequencestream oss(config.edges_sqn); - for (auto it = gp.g.ConstEdgeBegin(true); !it.IsEnd(); ++it) { - EdgeId e = *it; - string s
= gp.g.EdgeNucls(e).str(); - oss << io::SingleRead(io::MakeContigId(gp.g.int_id(e), s.size()), s); - } - } - - if (!config.edges_mpl.empty()) { - ofstream os(config.edges_mpl); - PrintEdgeFragmentProfiles(gp, abundance_counter, -1ul, config.min_len, os); - } - - if (!config.edge_fragments_mpl.empty()) { - ofstream os(config.edge_fragments_mpl); - PrintEdgeFragmentProfiles(gp, abundance_counter, config.frag_size, config.min_len, os); - } - - boost::optional<AbundanceVector> bin_profile = InferAbundance(config.bin_prof, config.bin); - if (!bin_profile) { - ERROR("Couldn't estimate profile of bin"); - return; - } - - EdgeAbundance<Graph> edge_abundance(gp.g, abundance_counter); - edge_abundance.Fill(); - - gp.EnsureBasicMapping(); - gp.FillQuality(); - visualization::graph_labeler::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos); - auto colorer = DefaultGPColorer(gp); - path::make_dir(cfg::get().output_dir + "pictures/"); - QualityEdgeLocalityPrintingRH<Graph> qual_removal_handler(gp.g, gp.edge_qual, labeler, colorer, - cfg::get().output_dir + "pictures/"); - - INFO("Launching aggressive graph clearing"); - //positive quality edges removed (folder colored_edges_deleted) - AggressiveClearing<Graph> clearing(gp.g, edge_abundance, - *bin_profile, 0.8, 0.3, [&](EdgeId e) { - qual_removal_handler.HandleDelete(e);}); - clearing.Run(); - INFO("Graph clearing finished"); - - INFO("Drawing edges with failed abundance estimate") - path::make_dir(cfg::get().output_dir + "pictures_no_ab/"); - QualityEdgeLocalityPrintingRH<Graph> qual_removal_handler2(gp.g, gp.edge_qual, labeler, colorer, - cfg::get().output_dir + "pictures_no_ab/"); - - for (auto it = gp.g.ConstEdgeBegin(true); !it.IsEnd(); ++it) { - EdgeId e = *it; - if (edge_abundance.count(e) == 0) { - qual_removal_handler2.HandleDelete(e); - } - } - } + void run(conj_graph_pack &gp, const char *); }; } diff -Nru spades-3.10.1+dfsg/src/projects/truseq_analysis/AlignmentAnalyserNew.cpp spades-3.11.1+dfsg/src/projects/truseq_analysis/AlignmentAnalyserNew.cpp --- spades-3.10.1+dfsg/src/projects/truseq_analysis/AlignmentAnalyserNew.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/truseq_analysis/AlignmentAnalyserNew.cpp 2017-09-28 09:05:12.000000000 +0000 @@ -14,7 +14,6 @@ #include "AlignmentAnalyserNew.hpp" namespace alignment_analysis { - using omnigraph::Range; size_t AlignmentAnalyserNew::StepBack(const vector<ConsistentMapping> &path) const { size_t cur_step = 0; diff -Nru spades-3.10.1+dfsg/src/projects/truseq_analysis/consistent_mapping.cpp spades-3.11.1+dfsg/src/projects/truseq_analysis/consistent_mapping.cpp --- spades-3.10.1+dfsg/src/projects/truseq_analysis/consistent_mapping.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/truseq_analysis/consistent_mapping.cpp 2017-09-28 09:05:12.000000000 +0000 @@ -10,7 +10,6 @@ #include "consistent_mapping.h" namespace alignment_analysis { - using omnigraph::Range; using omnigraph::MappingRange; bool ConsistentMapping::CheckConnect(EdgeId e, Range r) const { @@ -242,4 +241,4 @@ os << ")"; return os; } -} \ No newline at end of file +} diff -Nru spades-3.10.1+dfsg/src/projects/truseq_analysis/consistent_mapping.h spades-3.11.1+dfsg/src/projects/truseq_analysis/consistent_mapping.h --- spades-3.10.1+dfsg/src/projects/truseq_analysis/consistent_mapping.h 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/truseq_analysis/consistent_mapping.h 2017-09-28 09:05:12.000000000 +0000 @@ -13,9 +13,9 @@ typedef Graph::EdgeId EdgeId; struct EdgeRange { - EdgeRange(const EdgeId &first, const omnigraph::Range &second)
: first(first), second(second) { } + EdgeRange(const EdgeId &first, const Range &second) : first(first), second(second) { } EdgeId first; - omnigraph::Range second; + Range second; }; ostream & operator<<(ostream& os, const EdgeRange& er); @@ -27,9 +27,9 @@ ConsistentMapping(const Graph &graph, const omnigraph::MappingPath<EdgeId> &path); - ConsistentMapping(Graph const &graph, omnigraph::Range r, const vector<EdgeRange> &path); + ConsistentMapping(Graph const &graph, Range r, const vector<EdgeRange> &path); - bool CheckConnect(EdgeId e, omnigraph::Range r) const; + bool CheckConnect(EdgeId e, Range r) const; bool CheckConnect(const EdgeRange &er) const; @@ -45,7 +45,7 @@ void ForceJoin(const ConsistentMapping &other, const vector<EdgeId> &path); - omnigraph::Range const &GetInitialRange() const; + Range const &GetInitialRange() const; const vector<EdgeRange> &GetMappedPath() const; @@ -81,7 +81,7 @@ vector<EdgeRange> GenerateMappingPath(const vector<EdgeId> &path) const; const Graph &graph_; - omnigraph::Range initial_range; + Range initial_range; vector<EdgeRange> mapped_path; DECL_LOGGER("ConsistentMapping"); }; diff -Nru spades-3.10.1+dfsg/src/projects/truseq_analysis/main.cpp spades-3.11.1+dfsg/src/projects/truseq_analysis/main.cpp --- spades-3.10.1+dfsg/src/projects/truseq_analysis/main.cpp 2017-02-28 14:55:37.000000000 +0000 +++ spades-3.11.1+dfsg/src/projects/truseq_analysis/main.cpp 2017-09-28 09:05:12.000000000 +0000 @@ -10,13 +10,13 @@ */ #include "utils/logger/log_writers.hpp" #include "utils/segfault_handler.hpp" -#include "utils/memory_limit.hpp" -#include "utils/copy_file.hpp" +#include "utils/perf/memory_limit.hpp" +#include "utils/filesystem/copy_file.hpp" #include "pipeline/config_struct.hpp" #include "analysis_pipeline.hpp" void load_config(string cfg_filename) { - path::CheckFileExistenceFATAL(cfg_filename); + fs::CheckFileExistenceFATAL(cfg_filename); cfg::create_instance(cfg_filename); @@ -38,16 +38,16 @@ string log_props_file = cfg::get().log_filename; - if (!path::FileExists(log_props_file)) - log_props_file = path::append_path(path::parent_path(cfg_filename), cfg::get().log_filename); + if (!fs::FileExists(log_props_file)) + log_props_file = fs::append_path(fs::parent_path(cfg_filename), cfg::get().log_filename); - logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : ""); + logger *lg = create_logger(fs::FileExists(log_props_file) ? log_props_file : ""); lg->add_writer(std::make_shared<console_writer>()); attach_logger(lg); } int main(int /*argc*/, char** argv) { - perf_counter pc; + utils::perf_counter pc; const size_t GB = 1 << 30; @@ -67,7 +67,7 @@ // read configuration file (dataset path etc.) - limit_memory(cfg::get().max_memory * GB); + utils::limit_memory(cfg::get().max_memory * GB); // assemble it!
INFO("Assembling dataset (" << cfg::get().dataset_file << ") with K=" << cfg::get().K); diff -Nru spades-3.10.1+dfsg/src/spades_pipeline/CMakeLists.txt spades-3.11.1+dfsg/src/spades_pipeline/CMakeLists.txt --- spades-3.10.1+dfsg/src/spades_pipeline/CMakeLists.txt 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/spades_pipeline/CMakeLists.txt 2017-09-28 09:05:14.000000000 +0000 @@ -16,6 +16,6 @@ DESTINATION share/spades/spades_pipeline/truspades COMPONENT runtime) # Common module -install(FILES common/alignment.py common/parallel_launcher.py common/sam_parser.py common/SeqIO.py +install(FILES common/alignment.py common/parallel_launcher.py common/sam_parser.py common/SeqIO.py common/__init__.py DESTINATION share/spades/spades_pipeline/common COMPONENT runtime) diff -Nru spades-3.10.1+dfsg/src/spades_pipeline/common/SeqIO.py spades-3.11.1+dfsg/src/spades_pipeline/common/SeqIO.py --- spades-3.10.1+dfsg/src/spades_pipeline/common/SeqIO.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/spades_pipeline/common/SeqIO.py 2017-09-28 09:05:14.000000000 +0000 @@ -6,12 +6,17 @@ import itertools import sys +import gzip +import codecs + +fasta_ext = ['.fa', '.fas', '.fasta', '.seq', '.fsa', '.fna', '.ffn', '.frn'] +fastq_ext = ['.fq', 'fastq'] def Open(f, mode): if f.endswith(".gz"): - return gzip.open(f, mode) + return codecs.getreader('UTF-8')(gzip.open(f, mode)) else: - return open(f, mode) + return codecs.open(f, mode, encoding='utf-8') class Reader: def __init__(self, handler): @@ -145,3 +150,28 @@ r -= 1 if r > l: write(SeqRecord(contig.seq[l:r], contig.id)) + + +def is_fasta(file_name): + for ext in fasta_ext: + if ext in file_name: + return True + + return False + + +def is_fastq(file_name): + for ext in fastq_ext: + if ext in file_name: + return True + + return False + + +def get_read_file_type(file_name): + if is_fastq(file_name): + return 'fastq' + elif is_fasta(file_name): + return 'fasta' + else: + return None diff -Nru spades-3.10.1+dfsg/src/spades_pipeline/corrector_logic.py spades-3.11.1+dfsg/src/spades_pipeline/corrector_logic.py --- spades-3.10.1+dfsg/src/spades_pipeline/corrector_logic.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/spades_pipeline/corrector_logic.py 2017-09-28 09:05:14.000000000 +0000 @@ -27,12 +27,13 @@ data = pyyaml.load(open(filename, 'r')) data["dataset"] = cfg.dataset data["output_dir"] = cfg.output_dir - data["work_dir"] = process_cfg.process_spaces(cfg.tmp_dir) + data["work_dir"] = cfg.tmp_dir #data["hard_memory_limit"] = cfg.max_memory data["max_nthreads"] = cfg.max_threads data["bwa"] = cfg.bwa file_c = open(filename, 'w') - pyyaml.dump(data, file_c, default_flow_style = False, default_style='"', width=100500) + pyyaml.dump(data, file_c, + default_flow_style=False, default_style='"', width=float("inf")) file_c.close() diff -Nru spades-3.10.1+dfsg/src/spades_pipeline/hammer_logic.py spades-3.11.1+dfsg/src/spades_pipeline/hammer_logic.py --- spades-3.10.1+dfsg/src/spades_pipeline/hammer_logic.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/spades_pipeline/hammer_logic.py 2017-09-28 09:05:14.000000000 +0000 @@ -93,7 +93,8 @@ data["output_dir"] = cfg.output_dir data["hard_memory_limit"] = cfg.max_memory data["max_nthreads"] = cfg.max_threads - pyyaml.dump(data, open(filename, 'w'), default_flow_style = False, default_style='"', width=100500) + pyyaml.dump(data, open(filename, 'w'), + default_flow_style=False, default_style='"', width=float("inf")) def run_hammer(corrected_dataset_yaml_filename, 
diff -Nru spades-3.10.1+dfsg/src/spades_pipeline/corrector_logic.py spades-3.11.1+dfsg/src/spades_pipeline/corrector_logic.py --- spades-3.10.1+dfsg/src/spades_pipeline/corrector_logic.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/spades_pipeline/corrector_logic.py 2017-09-28 09:05:14.000000000 +0000 @@ -27,12 +27,13 @@ data = pyyaml.load(open(filename, 'r')) data["dataset"] = cfg.dataset data["output_dir"] = cfg.output_dir - data["work_dir"] = process_cfg.process_spaces(cfg.tmp_dir) + data["work_dir"] = cfg.tmp_dir #data["hard_memory_limit"] = cfg.max_memory data["max_nthreads"] = cfg.max_threads data["bwa"] = cfg.bwa file_c = open(filename, 'w') - pyyaml.dump(data, file_c, default_flow_style = False, default_style='"', width=100500) + pyyaml.dump(data, file_c, + default_flow_style=False, default_style='"', width=float("inf")) file_c.close() diff -Nru spades-3.10.1+dfsg/src/spades_pipeline/hammer_logic.py spades-3.11.1+dfsg/src/spades_pipeline/hammer_logic.py --- spades-3.10.1+dfsg/src/spades_pipeline/hammer_logic.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/spades_pipeline/hammer_logic.py 2017-09-28 09:05:14.000000000 +0000 @@ -93,7 +93,8 @@ data["output_dir"] = cfg.output_dir data["hard_memory_limit"] = cfg.max_memory data["max_nthreads"] = cfg.max_threads - pyyaml.dump(data, open(filename, 'w'), default_flow_style = False, default_style='"', width=100500) + pyyaml.dump(data, open(filename, 'w'), + default_flow_style=False, default_style='"', width=float("inf")) def run_hammer(corrected_dataset_yaml_filename, configs_dir, execution_home, cfg, @@ -109,7 +110,8 @@ not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES) to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES) to_correct_dataset_yaml_filename = os.path.join(cfg.output_dir, "to_correct.yaml") - pyyaml.dump(to_correct_dataset_data, open(to_correct_dataset_yaml_filename, 'w'), default_flow_style = False, default_style='"', width=100500) + pyyaml.dump(to_correct_dataset_data, open(to_correct_dataset_yaml_filename, 'w'), + default_flow_style=False, default_style='"', width=float("inf")) cfg.dataset_yaml_filename = to_correct_dataset_yaml_filename else: not_used_dataset_data = None @@ -154,7 +156,8 @@ is_changed = True corrected_dataset_data += not_used_dataset_data if is_changed: - pyyaml.dump(corrected_dataset_data, open(corrected_dataset_yaml_filename, 'w'), default_flow_style = False, default_style='"', width=100500) + pyyaml.dump(corrected_dataset_data, open(corrected_dataset_yaml_filename, 'w'), + default_flow_style=False, default_style='"', width=float("inf")) log.info("\n== Dataset description file was created: " + corrected_dataset_yaml_filename + "\n") if os.path.isdir(cfg.tmp_dir): diff -Nru spades-3.10.1+dfsg/src/spades_pipeline/options_storage.py spades-3.11.1+dfsg/src/spades_pipeline/options_storage.py --- spades-3.10.1+dfsg/src/spades_pipeline/options_storage.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/spades_pipeline/options_storage.py 2017-09-28 09:05:14.000000000 +0000 @@ -12,7 +12,7 @@ import support from os.path import basename -SUPPORTED_PYTHON_VERSIONS = ['2.4', '2.5', '2.6', '2.7', '3.2', '3.3', '3.4', '3.5'] +SUPPORTED_PYTHON_VERSIONS = ['2.4-2.7', '3.2+'] # major.minor format only, closed ('-') and open ('+') ranges allowed # allowed reads extensions for BayesHammer and for the whole SPAdes pipeline BH_ALLOWED_READS_EXTENSIONS = ['.fq', '.fastq', '.bam'] CONTIGS_ALLOWED_READS_EXTENSIONS = ['.fa', '.fasta'] @@ -33,11 +33,12 @@ contigs_name = "contigs.fasta" scaffolds_name = "scaffolds.fasta" assembly_graph_name = "assembly_graph.fastg" -assembly_graph_name_gfa = "assembly_graph.gfa" +assembly_graph_name_gfa = "assembly_graph_with_scaffolds.gfa" contigs_paths = "contigs.paths" scaffolds_paths = "scaffolds.paths" transcripts_name = "transcripts.fasta" transcripts_paths = "transcripts.paths" +filtering_types = ["hard", "soft", "default"] #other constants MIN_K = 1 @@ -87,6 +88,7 @@ cov_cutoff = 'off' # default is 'off' # hidden options +save_gp = False mismatch_corrector = None reference = None series_analysis = None @@ -95,6 +97,7 @@ bh_heap_check = None spades_heap_check = None read_buffer_size = None +lcer_cutoff = None ### END OF OPTIONS # for restarting SPAdes @@ -114,6 +117,7 @@ restart_reference = None restart_configs_dir = None restart_read_buffer_size = None +restart_fast = None # for running to specific check-point stop_after = None @@ -124,17 +128,22 @@ correct_scaffolds = False run_truseq_postprocessing = False +#rna options +strand_specific = None # None, True, False are possible +fast = None + dict_of_prefixes = dict() dict_of_rel2abs = dict() # list of spades.py options long_options = "12= threads= memory= tmp-dir= iterations= phred-offset= sc iontorrent meta large-genome rna plasmid "\ + "ss-fr ss-rf fast fast:false "\ "only-error-correction only-assembler "\ "disable-gzip-output disable-gzip-output:false disable-rr disable-rr:false " \ "help version test debug debug:false reference= series-analysis=
config-file= dataset= "\ "bh-heap-check= spades-heap-check= read-buffer-size= help-hidden "\ - "mismatch-correction mismatch-correction:false careful careful:false "\ - "continue restart-from= diploid truseq cov-cutoff= configs-dir= stop-after=".split() + "mismatch-correction mismatch-correction:false careful careful:false save-gp save-gp:false "\ + "continue restart-from= diploid truseq cov-cutoff= hidden-cov-cutoff= configs-dir= stop-after=".split() short_options = "o:1:2:s:k:t:m:i:hv" # adding multiple paired-end, mate-pair and other (long reads) libraries support @@ -166,13 +175,13 @@ def version(spades_version, mode=None): - sys.stderr.write("SPAdes v" + str(spades_version)) + sys.stdout.write("SPAdes v" + str(spades_version)) if mode is None: mode = get_mode() if mode is not None: - sys.stderr.write(" [" + mode + "SPAdes mode]") - sys.stderr.write("\n") - sys.stderr.flush() + sys.stdout.write(" [" + mode + "SPAdes mode]") + sys.stdout.write("\n") + sys.stdout.flush() def usage(spades_version, show_hidden=False, mode=None): @@ -247,12 +256,15 @@ sys.stderr.write("--sanger\t\tfile with Sanger reads\n") sys.stderr.write("--pacbio\t\tfile with PacBio reads\n") sys.stderr.write("--nanopore\t\tfile with Nanopore reads\n") - sys.stderr.write("--tslr\t\tfile with TSLR-contigs\n") + if not mode == "rna": + sys.stderr.write("--tslr\t\tfile with TSLR-contigs\n") sys.stderr.write("--trusted-contigs\t\tfile with trusted contigs\n") sys.stderr.write("--untrusted-contigs\t\tfile with untrusted contigs\n") if mode == "dip": sys.stderr.write("Input haplocontigs:" + "\n") sys.stderr.write("--hap\t\tfile with haplocontigs" + "\n") + if mode == "rna": + sys.stderr.write("--ss-<type>\tstrand specific data, <type> = fr (normal) and rf (antisense)\n") sys.stderr.write("" + "\n") sys.stderr.write("Pipeline options:" + "\n") @@ -281,6 +293,8 @@ sys.stderr.write("" + "\n") sys.stderr.write("Advanced options:" + "\n") sys.stderr.write("--dataset\t\tfile with dataset description in YAML format" + "\n") + if mode == "rna": + sys.stderr.write("--fast\t\t\t\tspeeds up isoform detection, but may miss short and low-expressed isoforms\n") sys.stderr.write("-t/--threads\t\t\tnumber of threads" + "\n") sys.stderr.write("\t\t\t\t[default: %s]\n" % THREADS) sys.stderr.write("-m/--memory\t\t\tRAM limit for SPAdes in Gb"\ @@ -294,7 +308,7 @@ sys.stderr.write("\t\t\t\tless than " + str(MAX_K + 1) + ") [default: 'auto']" + "\n") else: sys.stderr.write("-k\t\t\t\tk-mer size (must be odd and less than " + str(MAX_K + 1) + ") " \ - "[default: " + str(K_MERS_RNA[0]) + "]\n") + "[default: 'auto']\n") if mode not in ["rna", "meta"]: sys.stderr.write("--cov-cutoff\t\t\tcoverage cutoff value (a positive float number, " @@ -323,6 +337,9 @@ sys.stderr.write("--spades-heap-check\t\tsets HEAPCHECK environment variable"\ " for SPAdes" + "\n") sys.stderr.write("--large-genome\tEnables optimizations for large genomes \n") + sys.stderr.write("--save-gp\tEnables saving graph pack before repeat resolution (even without --debug) \n") + sys.stderr.write("--hidden-cov-cutoff\t\t\tcoverage cutoff value deeply integrated in simplification"\ + " (a positive float number). Base coverage! Will be adjusted depending on K and RL!
\n") sys.stderr.write("--help-hidden\tprints this usage message with all hidden options" + "\n") if show_hidden and mode == "dip": @@ -354,6 +371,7 @@ global qvoffset global cov_cutoff global tmp_dir + global fast if threads is None: threads = THREADS @@ -380,12 +398,15 @@ cov_cutoff = 'off' if tmp_dir is None: tmp_dir = os.path.join(output_dir, TMP_DIR) + if fast is None: + fast = False def set_test_options(): global output_dir global single_cell global test_mode + global meta output_dir = os.path.abspath('spades_test') single_cell = False @@ -406,6 +427,8 @@ support.error("you cannot specify --only-assembler with --restart-from option!", log) if only_error_correction: support.error("you cannot specify --only-error-correction with --restart-from option!", log) + if strand_specific is not None: + support.error("you cannot specify strand specificity (--ss-rf or --ss-fr) with --restart-from option!", log) global restart_k_mers global restart_careful @@ -421,6 +444,7 @@ global restart_reference global restart_configs_dir global restart_read_buffer_size + global restart_fast restart_k_mers = k_mers restart_careful = careful @@ -436,6 +460,7 @@ restart_reference = reference restart_configs_dir = configs_dir restart_read_buffer_size = read_buffer_size + restart_fast = fast def load_restart_options(): @@ -454,6 +479,7 @@ global configs_dir global read_buffer_size global original_k_mers + global fast if restart_k_mers: original_k_mers = k_mers @@ -487,6 +513,8 @@ configs_dir = restart_configs_dir if restart_read_buffer_size is not None: read_buffer_size = restart_read_buffer_size + if restart_fast is not None: + fast = restart_fast def enable_truseq_mode(): diff -Nru spades-3.10.1+dfsg/src/spades_pipeline/spades_logic.py spades-3.11.1+dfsg/src/spades_pipeline/spades_logic.py --- spades-3.10.1+dfsg/src/spades_pipeline/spades_logic.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/spades_pipeline/spades_logic.py 2017-09-28 09:05:14.000000000 +0000 @@ -43,6 +43,7 @@ # subst_dict["topology_simplif_enabled"] = bool_to_str(last_one) subst_dict["max_threads"] = cfg.max_threads subst_dict["max_memory"] = cfg.max_memory + subst_dict["save_gp"] = bool_to_str(cfg.save_gp) if (not last_one): subst_dict["correct_mismatches"] = bool_to_str(False) if "resolving_mode" in cfg.__dict__: @@ -58,6 +59,9 @@ subst_dict["coverage_threshold"] = 0.0 else: subst_dict["coverage_threshold"] = cfg.cov_cutoff + if cfg.lcer_cutoff is not None: + subst_dict["lcer_enabled"] = bool_to_str(True) + subst_dict["lcer_coverage_threshold"] = cfg.lcer_cutoff #TODO: make something about spades.py and config param substitution if "bwa_paired" in cfg.__dict__: @@ -68,6 +72,15 @@ process_cfg.substitute_params(filename, subst_dict, log) +def prepare_config_rnaspades(filename, log): + if not options_storage.rna: + return + subst_dict = dict() + subst_dict["ss_enabled"] = bool_to_str(options_storage.strand_specific is not None) + subst_dict["antisense"] = bool_to_str(options_storage.strand_specific) + process_cfg.substitute_params(filename, subst_dict, log) + + def get_read_length(output_dir, K, ext_python_modules_home, log): est_params_filename = os.path.join(output_dir, "K%d" % K, "final.lib_data") max_read_length = 0 @@ -119,6 +132,7 @@ original_k_mers = [k for k in original_k_mers if k < RL] return original_k_mers + def add_configs(command, configs_dir): #Order matters here! 
mode_config_mapping = [("single_cell", "mda_mode"), @@ -127,13 +141,22 @@ ("rna", "rna_mode"), ("large_genome", "large_genome_mode"), ("plasmid", "plasmid_mode"), - ("careful", "careful_mode"), + #("careful", "careful_mode"), ("diploid_mode", "diploid_mode")] for (mode, config) in mode_config_mapping: if options_storage.__dict__[mode]: if mode == "rna" or mode == "meta": command.append(os.path.join(configs_dir, "mda_mode.info")) command.append(os.path.join(configs_dir, config + ".info")) + if options_storage.__dict__["careful"]: + if options_storage.__dict__["single_cell"]: + command.append(os.path.join(configs_dir, "careful_mda_mode.info")) + else: + command.append(os.path.join(configs_dir, "careful_mode.info")) + + # special case: extra config + if options_storage.rna and options_storage.fast: + command.append(os.path.join(configs_dir, "rna_fast_mode.info")) def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one): @@ -178,6 +201,7 @@ #FIXME why here??? process_cfg.substitute_params(os.path.join(dst_configs, "pe_params.info"), {"scaffolding_mode": cfg.scaffolding_mode}, log) + prepare_config_rnaspades(os.path.join(dst_configs, "rna_mode.info"), log) cfg_fn = os.path.join(dst_configs, "config.info") prepare_config_spades(cfg_fn, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home) @@ -347,11 +371,18 @@ shutil.copyfile(os.path.join(latest, "before_rr.fasta"), result_before_rr_contigs) if options_storage.rna: if os.path.isfile(os.path.join(latest, "transcripts.fasta")): - if not os.path.isfile(cfg.result_transcripts) or not options_storage.continue_mode: - shutil.copyfile(os.path.join(latest, "transcripts.fasta"), cfg.result_transcripts) + if not os.path.isfile(cfg.result_transcripts) or not options_storage.continue_mode: + shutil.copyfile(os.path.join(latest, "transcripts.fasta"), cfg.result_transcripts) if os.path.isfile(os.path.join(latest, "transcripts.paths")): if not os.path.isfile(cfg.result_transcripts_paths) or not options_storage.continue_mode: shutil.copyfile(os.path.join(latest, "transcripts.paths"), cfg.result_transcripts_paths) + for filtering_type in options_storage.filtering_types: + prefix = filtering_type + "_filtered_" + result_filtered_transcripts = os.path.join(cfg.output_dir, prefix + options_storage.transcripts_name) + latest_filtered_transcripts = os.path.join(latest, prefix + "final_paths.fasta") + if os.path.isfile(latest_filtered_transcripts): + if not os.path.isfile(result_filtered_transcripts) or not options_storage.continue_mode: + shutil.copyfile(latest_filtered_transcripts, result_filtered_transcripts) else: if os.path.isfile(os.path.join(latest, "final_contigs.fasta")): if not os.path.isfile(cfg.result_contigs) or not options_storage.continue_mode: @@ -367,9 +398,9 @@ if os.path.isfile(os.path.join(latest, "scaffolds.paths")): if not os.path.isfile(cfg.result_scaffolds_paths) or not options_storage.continue_mode: shutil.copyfile(os.path.join(latest, "scaffolds.paths"), cfg.result_scaffolds_paths) - if os.path.isfile(os.path.join(latest, "assembly_graph.gfa")): + if os.path.isfile(os.path.join(latest, "assembly_graph_with_scaffolds.gfa")): if not os.path.isfile(cfg.result_graph_gfa) or not options_storage.continue_mode: - shutil.copyfile(os.path.join(latest, "assembly_graph.gfa"), cfg.result_graph_gfa) + shutil.copyfile(os.path.join(latest, "assembly_graph_with_scaffolds.gfa"), cfg.result_graph_gfa) if os.path.isfile(os.path.join(latest, "assembly_graph.fastg")): if not os.path.isfile(cfg.result_graph) or not 
options_storage.continue_mode: shutil.copyfile(os.path.join(latest, "assembly_graph.fastg"), cfg.result_graph) diff -Nru spades-3.10.1+dfsg/src/spades_pipeline/support.py spades-3.11.1+dfsg/src/spades_pipeline/support.py --- spades-3.10.1+dfsg/src/spades_pipeline/support.py 2017-02-28 14:55:38.000000000 +0000 +++ spades-3.11.1+dfsg/src/spades_pipeline/support.py 2017-09-28 09:05:14.000000000 +0000 @@ -18,7 +18,10 @@ import shutil import options_storage import itertools +from common import SeqIO +import math from os.path import abspath, expanduser, join +from distutils.version import LooseVersion # constants to print and detect warnings and errors in logs SPADES_PY_ERROR_MESSAGE = "== Error == " @@ -40,10 +43,12 @@ log.info("\n\n" + prefix + " " + err_str) log_warnings(log, with_error=True) log.info("\nIn case you have troubles running " + binary_name + ", you can write to spades.support@cab.spbu.ru") + log.info("or report an issue on our GitHub repository github.com/ablab/spades") log.info("Please provide us with params.txt and " + binary_name.lower() + ".log files from the output directory.") else: sys.stderr.write("\n\n" + prefix + " " + err_str + "\n\n") sys.stderr.write("\nIn case you have troubles running " + binary_name + ", you can write to spades.support@cab.spbu.ru\n") + sys.stderr.write("or report an issue on our GitHub repository github.com/ablab/spades\n") sys.stderr.write("Please provide us with params.txt and " + binary_name.lower() + ".log files from the output directory.\n") sys.stderr.flush() if current_tmp_dir and os.path.isdir(current_tmp_dir): @@ -60,14 +65,35 @@ def check_python_version(): - if sys.version[0:3] not in options_storage.SUPPORTED_PYTHON_VERSIONS: - error("python version " + sys.version[0:3] + " is not supported!\n" + \ - "Supported versions are " + ", ".join(options_storage.SUPPORTED_PYTHON_VERSIONS)) + def __next_version(version): + components = version.split('.') + for i in reversed(range(len(components))): + if components[i].isdigit(): + components[i] = str(int(components[i]) + 1) + break + return '.'.join(components) + + current_version = sys.version.split()[0] + supported_versions_msg = [] + for supported_versions in options_storage.SUPPORTED_PYTHON_VERSIONS: + major = supported_versions[0] + if '-' in supported_versions: # range + min_inc, max_inc = supported_versions.split('-') + elif supported_versions.endswith('+'): # half open range + min_inc, max_inc = supported_versions[:-1], major + else: # exact version + min_inc = max_inc = supported_versions + max_exc = __next_version(max_inc) + supported_versions_msg.append("Python%s: %s" % (major, supported_versions.replace('+', " and higher"))) + if LooseVersion(min_inc) <= LooseVersion(current_version) < LooseVersion(max_exc): + return True + error("Python version " + current_version + " is not supported!\n" + + "Supported versions are " + ", ".join(supported_versions_msg)) def get_spades_binaries_info_message(): return "You can obtain SPAdes binaries in one of two ways:" +\ - "\n1. Download them from http://bioinf.spbau.ru/content/spades-download" +\ + "\n1. Download them from http://cab.spbu.ru/software/spades/" +\ "\n2. 
Build source code with ./spades_compile.sh script" @@ -577,6 +603,36 @@ return abs_paths_dataset_data +def get_reads_length(dataset_data, log, num_checked=10 ** 4, diff_len_allowable=25): + max_reads_lenghts = [get_max_reads_length(reads_file, log, num_checked) for reads_file in get_reads_files(dataset_data)] + + avg_len = sum(max_reads_lenghts) / len(max_reads_lenghts) + for max_len in max_reads_lenghts: + if math.fabs(max_len - avg_len) > diff_len_allowable: + warning('Read lengths differ more than allowable. Length: ' + str(max_len) + '. Avg. length: ' + str(avg_len) + '.', log) + reads_length = min(max_reads_lenghts) + log.info('Reads length: ' + str(reads_length)) + return reads_length + + +def get_reads_files(dataset_data): + for reads_library in dataset_data: + for key, value in reads_library.items(): + if key.endswith('reads'): + for reads_file in value: + yield reads_file + + +def get_max_reads_length(reads_file, log, num_checked): + file_type = SeqIO.get_read_file_type(reads_file) + if not file_type: + error('Incorrect type of reads file: ' + reads_file, log) + + max_reads_length = max([len(rec) for rec in itertools.islice(SeqIO.parse(SeqIO.Open(reads_file, "r"), file_type), num_checked)]) + log.info('Max reads length: ' + str(max_reads_length)) + return max_reads_length + + def check_dataset_reads(dataset_data, only_assembler, log): all_files = [] for id, reads_library in enumerate(dataset_data): Binary files /tmp/tmpPucGcS/oc4uoPsyIX/spades-3.10.1+dfsg/test_dataset/reference_1K.fa.gz and /tmp/tmpPucGcS/gvGHfDl7AK/spades-3.11.1+dfsg/test_dataset/reference_1K.fa.gz differ diff -Nru spades-3.10.1+dfsg/VERSION spades-3.11.1+dfsg/VERSION --- spades-3.10.1+dfsg/VERSION 2017-02-28 14:55:56.000000000 +0000 +++ spades-3.11.1+dfsg/VERSION 2017-09-28 09:05:59.000000000 +0000 @@ -1 +1 @@ -3.10.1 +3.11.1