[ProcessRun]
; Run description: if specified, this custom description is used (it is ignored if a RunId is specified).
;                  Otherwise, "Added sequences for {MapSet Accession No.} from {Sources}." is used.
RunDescription="Loaded sequences for PGSC_DM_v4.03 from http://solanaceae.plantbiology.msu.edu/pgsc_download.shtml"

[MapSet]
;------------------------------------------------------------------------------------------------
; 1. Using existing MapSet
;------------------------------------------------------------------------------------------------
; If either MapSetId or MapSetPath is specified, the sequences are added to that existing MapSet;
; otherwise, a new MapSet should be specified below.
; MapSetId: id of an existing map set
;MapSetId=270000441
; MapSetPath: path of an existing map set.
;MapSetPath="Glycine max/Wm82.a2.v1"
;------------------------------------------------------------------------------------------------
; 2. Adding new MapSet
;------------------------------------------------------------------------------------------------
; Organism ID (required): organism ID should exist.
OrganismId=4113
; Display name (required): a name shown in MapSetTree. Usually an assembly build name.
DisplayName="DM_v4.03"
; Description: by default, organism name + display name.
Description="Solanum tuberosum DM_v4.03 from SpudDb (http://spuddb.uga.edu/pgsc_download.shtml)
<a href=\"https://www.nature.com/articles/nature10158\">Publication</a>"
; AccessionNo: accession of the genome build. See http://ncbi.nlm.nih.gov/genome
AccessionNo="PGSC_DM_v4.03"
; Source ID: database or institution from which the MapSet/sequences originate
SourceId="PGSC"

[MapSetTree]
;------------------------------------------------------------------------------------------------
; 1. Using existing MapSetTree node
;------------------------------------------------------------------------------------------------
; Node ID: if specified, the MapSet with the new sequences will be placed on this node.
;NodeId=12345
;------------------------------------------------------------------------------------------------
; 2. Adding new MapSetTree node to a parent node
;------------------------------------------------------------------------------------------------
; Parent node ID: if specified, the MapSet with the new sequences will be placed under this parent node as a child.
;ParentNodeId=200206082
;------------------------------------------------------------------------------------------------
; 3. Adding new MapSetTree node under a new root node
;------------------------------------------------------------------------------------------------
; Root node name: usually an organism name. Ignored if the root name already exists.
RootNodeName="Solanum tuberosum"
; Root node order number: order of the root node in the MapSetTree. By default, 0.
RootNodeOrderNo=0

[Sequence]
; Sources (required): FASTA file(s) of genomic DNA sequence located locally or remotely accessible via URL.
Sources=$DATA/potato/PGSC_DM_v4.03_pseudomolecules.fasta.zip 
; Commit frequency: commits after reading this many nucleotides. Larger numbers require a larger rollback segment; smaller numbers result in more frequent transactions.
CommitFrequency=10000000

; ExcludedHeadersRegEx: exclude sequences that match this regular expression
;ExcludedHeadersRegEx="chloroplast"
; IncludeHeadersRegEx: include only sequences that match this regular expression. Note: ExcludedHeadersRegEx, if specified, is applied first.
;IncludeHeadersRegEx="complete"

; A FASTA header starts with '>' and provides map information in fields separated by a delimiter, as follows (Delimiter=VerticalBar):
; >{0}|{1}|{2}|{3}|{4}
; Delimiter: specify one among Colon(:), Comma(,), Period(.), Hyphen(-), SemiColon(;), Slash(/), Tab(\t), Space( ) and VerticalBar(|)
; Comment this line out if the FASTA header will not be partitioned
;Delimiter=VerticalBar

; Map name index. Tells which field between the delimiters corresponds to the map name. 0-based. If no delimiter is used, MapNameIndex is ignored and
; the entire FASTA header is saved as the map name.
;MapNameIndex=0
; If the part between delimiters is too long to be stored as the map name, it can be parsed and the map name can be extracted by using this regular expression
; If MapNameIndex is not provided, MapNameRegEx is applied to the whole FASTA header
;MapNameRegEx=".*chromosome (..).*"
; Map accession index. Tells which field between the delimiters corresponds to the map accession. 0-based. If no delimiter is used, MapAccessionIndex is ignored and
; the entire FASTA header is saved as the map accession.
;MapAccessionIndex=0
; If the part between delimiters is too long to be stored as the map accession, it can be parsed and the map accession can be extracted by using this regular expression
; If MapAccessionIndex is not provided, MapAccessionRegEx is applied to the whole FASTA header
;MapAccessionRegEx=".*chromosome (..).*"
; ReplaceSpacesWith: Provide symbols that will be used to replace spaces in the map name. Some third-party tools are confused if the map name contains a space.
;ReplaceSpacesWith=_

; Map description index
;MapDescriptionIndex=0
; MapDescriptionRegEx: regular expression to extract the map description from FASTA header or, if MapDescriptionIndex is provided,
; from the extracted field at that index
;MapDescriptionRegEx=".*"
; Some sequences are called chromosomes, others scaffolds. Usually, the number of chromosomes is small, which makes it possible to show all of them as a representation of the genome.
; ChromosomeCriteriaRegEx - if the fasta header matches the RegEx criteria, the entry will be called a chromosome
;ChromosomeCriteriaRegEx="chromosome"
; ChromosomeCriteriaLength - a formula to separate chromosomes from the rest of the sequences. It can be used as an alternative
; to ChromosomeCriteriaRegEx.
; For example, ChromosomeCriteriaLength=">5000000" will store sequences longer than 5,000,000 bp as chromosomes

; In case the sequence passes the criteria to be called a chromosome, the chromosome name will be read from the field with this index:
;ChromosomeNameIndex=0
; ChromosomeNameRegEx - a regular expression applied to the field with the given index to extract the chromosome name
ChromosomeNameRegEx="ST4.03ch(.+)"

; Expected length index: if specified, the expected length of each sequence will be compared to the actual sequence length.
;ExpectedLengthIndex=4
; MapNameFilterRegEx: Regular expression filter based on map name: include only sequences whose map name matches the given pattern. 
; if not specified, all the sequences in the source will be included.
; The example below would load only sequences whose names start with 'Chr.'
;MapNameFilterRegEx="^Chr\..*"
; Length filter: used to include only sequences whose length satisfies the criteria. Use '>' or '<' to load sequences longer/shorter than the given number (e.g., "<1000000").
;                if not specified, all the sequences in the source will be included.
;LengthFilter=">100000"
; Sequences can be stored in the database (Oracle) or in the file system (MySql-compatible). 
; In case of the file system, please make sure that the storage location is visible from the machine where the loading process is running.
; The path to this location will also be used by the API-server (Cerberus) or, in case of a direct database connection, by the main application
; running on the user's machine.
; When the API-server is used, the storage location should be accessible to the server. The machines used for loading and for Cerberus can be different. In such a case,
; use the path remapping specified in PersephoneShell's configuration file (see the StorageMapping entry).
; StorageId: if present, specifies the storage to add sequences to; otherwise, the default storage will be used. Use the 'add storage' command to specify alternative storage locations.
;StorageId=1

[TranslationCode]
; List map names and their corresponding translation codes. Default code is 1
;Pt=11

; This section controls the creation of low-resolution GC data for the sequences.
[Gc]
; PolyNPercentage: the percentage of 'N' a region must contain to be considered poly-N, in range [1-99]. Default is 25.
;PolyNPercentage=50
; Histogram step (size of a 'cell'), in range [1-127]. The smaller the value, the more space the data will take, but the more precise it will be. Default is 100.
;HistogramStep=100