# nextflow.conf

# Input/output folders, repository location and input file pattern.
input="/PATH_TO_INPUT_FOLDER/" # input folder; keep the trailing slash (/) at the end
output="/PATH_TO_OUTPUT_FOLDER/" # output folder; keep the trailing slash (/) at the end
prefix="/home/username/PATH_TO_REPO" # path to the repository; no trailing slash at the end
filename_wildcard="*.json" # file name pattern of the input files

doc_cascaded=true # Whether document-level labels are enforced onto sentence-level and token-level labels.
sent_cascaded=true # Whether sentence-level labels are enforced onto token-level labels.
source_lang="portuguese" # Language used when tokenizing or sentence splitting.
do_text_extraction=false # If true, the pipeline starts with the extractor; if false, the first component of the pipeline is multi_task.
extractor_script_path=$prefix"/bin/extract/peoples_chaina.py" # Path to the HTML-to-text script; required when do_text_extraction is true.

# NOTE(review): presumably comma-separated GPU device ids assigned to each model — confirm against the pipeline script.
# The multi_task and violent entries list all eight values (0-7); tsc, psc and osc each list a distinct pair.
gpu_multi_task=0,1,2,3,4,5,6,7
gpu_number_tsc=2,3
gpu_number_psc=4,5
gpu_number_osc=6,7
gpu_number_violent=0,1,2,3,4,5,6,7

multi_task_batchsize=500 # Max number of filenames (relative paths), chosen so that "Argument list too long" never occurs. Math -> 130000/255 (about 509).
# NOTE(review): presumably the batch sizes for the document-level and sentence-level steps — confirm.
doc_batchsize=120
sent_batchsize=2500

# Switches selecting which parts of the pipeline run.
RUN_MULTI_TASK=true # Whether to run the multi_task model.
RUN_SENT=true # Whether to run the sentence level. If RUN_MULTI_TASK is false, your input files must contain "sentences", "tokens" and "id".
RUN_DOC=true # Whether to run the document level. If RUN_MULTI_TASK is false, input files must contain "text" and "id".
RUN_POST=false # Whether to run the post-processing step.