# input_parameters.yml

####################################################################################################
####################################################################################################
##  This is the input parameter file for the Nanopore consensus pipeline.
##  The file is written in the YAML format.  A nice description of the
##  format can be found at
##  https://docs.ansible.com/ansible/latest/reference_appendices/YAMLSyntax.html
##
##  Unless otherwise mentioned, all fields must be filled.
##
####################################################################################################
####################################################################################################

# Full paths must be provided for every entry below.
# Only one FASTQ file can be analyzed at a time.

# Input reads (FASTQ file).
fastq: '/home/ada/Desktop/16S_alignments/scripts/16S_alignment/reads_clustering_pipeline/mock_datasets/mock5_1000_varying.fastq'
# Presumably the scratch directory for intermediate files -- confirm against the pipeline code.
tmp: '/home/ada/Desktop/16S_alignments/scripts/16S_alignment/reads_clustering_pipeline/main_pipeline/src/tmp'
# Presumably the directory holding the external tools the pipeline depends on -- confirm.
dependencies: '/home/ada/Desktop/16S_alignments/scripts/16S_alignment/reads_clustering_pipeline/main_pipeline/src/dependencies'
# Presumably the directory where result files are written -- confirm.
results: '/home/ada/Desktop/16S_alignments/scripts/16S_alignment/reads_clustering_pipeline/main_pipeline/result_files'
# Location of the BLAST database and its name
# (presumably the containing directory and the database prefix inside it -- confirm).
blast_db: '/media/data/home/blast/ref_prok_rep_genomes_db'
db_name: 'ref_prok_rep_genomes'

# Set to true to include a refinement of the clusters based on the MSA guide tree;
# set to false to leave the refinement out.
refinement: true
 
# Tuning parameters for the clustering and refinement steps.
parameters:
  # Length (bp) of the k-mers used to build the k-mer frequency matrix.
  kmer_len: 5
  # Number of dimensions used for the UMAP step.
  umap_n_components: 10
  # Controls how UMAP balances local versus global structure in the data.
  umap_n_neighbors: 5
  # Minimum number of reads that will form a cluster.
  hdbscan_min_cluster_size: 500
  # How flat clusters are selected from the cluster tree hierarchy: eom or leaf.
  hdbscan_cluster_selection_method: leaf
  # Number of reads that will be analyzed at once.
  reads_per_job: 30000
  # Standard deviation of branch lengths that decides whether a cluster is split or not.
  std_ref: 0.0015
  # Standard deviation of branch lengths that decides how the clusters are split.
  std_t: 0.001


##### Example invocation:
# time python3 /home/ada/Desktop/16S_alignments/scripts/16S_alignment/reads_clustering_pipeline/main_pipeline/main.py --config_file /home/ada/Desktop/16S_alignments/scripts/16S_alignment/reads_clustering_pipeline/main_pipeline/input_parameters.yml

# Example invocation on the cluster:
# python3 /home/amadejska/16s_project/Nanopore_consensus_pipeline/src/main.py --config_file /home/amadejska/16s_project/Nanopore_consensus_pipeline/input_parameters.yml