Skip to content

Commit

Permalink
update text processing tools (#1385)
Browse files Browse the repository at this point in the history
* add header

* add conditional

* move delimiter to 'select by field' branch

* bump tool version

* fix access to list parameter

* Drop `-b` option

Doesn't make sense given we restrict `input` to `txt` format.

Also refactor Cheetah code.

* add nl

* update dependencies

* add output number test

* add styler R script

* fix test data

* add citation and creators

* Trying to fix grep basic regex

* Update tools/text_processing/text_processing/macros.xml

---------

Co-authored-by: Marie Jossé <[email protected]>
Co-authored-by: Matthias Bernt <[email protected]>
Co-authored-by: Nicola Soranzo <[email protected]>
Co-authored-by: Pavankumar Videm <[email protected]>
  • Loading branch information
5 people authored Feb 29, 2024
1 parent 06dd963 commit 70980e3
Show file tree
Hide file tree
Showing 26 changed files with 254 additions and 125 deletions.
36 changes: 36 additions & 0 deletions .github/styler.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env Rscript

library("argparse")
library("styler")

# Command-line wrapper around styler.
#
# Usage: styler.R [--dry {off,on}] DIR
#
# DIR may be a single file or a directory (directories are processed
# recursively). With `--dry on` (the default) nothing is rewritten and the
# script fails if any file would be changed by styler; with `--dry off` the
# files are restyled and the number of changed files is printed.

parser <- ArgumentParser(description = "Call styler")
parser$add_argument("dir",
    metavar = "DIR", type = "character",
    help = "File or directory to style"
)
parser$add_argument("--dry",
    choices = c("off", "on"), default = "on",
    help = "on: only report files that would change; off: rewrite files"
)
args <- parser$parse_args()

# file.info()$isdir is NA for a path that does not exist, and `if (NA)` aborts
# with "missing value where TRUE/FALSE needed". Reject such paths up front so
# the caller gets an actionable message instead.
if (!file.exists(args$dir)) {
    stop(paste("No such file or directory:", args$dir), call. = FALSE)
}
is_directory <- dir.exists(args$dir)

# The styler calls are wrapped in capture.output() so that whatever they print
# is collected in `captured_output` rather than written to stdout; only the
# summary below is reported. `result` is assigned inside the captured
# expression, which is evaluated in the calling (global) environment.
captured_output <- capture.output({
    result <- if (is_directory) {
        style_dir(args$dir, indent_by = 4, dry = args$dry, recursive = TRUE)
    } else {
        style_file(args$dir, indent_by = 4, dry = args$dry)
    }
})

# `result` has one row per inspected file and a logical `changed` column.
# Count the TRUE entries; na.rm guards against rows without a definite status
# (the original `subset(result, changed == TRUE)` ignored those as well).
n <- sum(result$changed, na.rm = TRUE)
if (n > 0) {
    if (args$dry == "off") {
        print(paste("Changed", n, "files"))
    } else {
        # In dry mode any pending change is treated as a lint failure; stop()
        # makes Rscript exit with a non-zero status.
        stop(paste("Linting failed for", n, "files"), call. = FALSE)
    }
}
12 changes: 6 additions & 6 deletions tools/text_processing/text_processing/awk.xml
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
<tool id="tp_awk_tool" name="Text reformatting" version="@[email protected]">
<tool id="tp_awk_tool" name="Text reformatting" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>with awk</description>
<macros>
<import>macros.xml</import>
</macros>
<expand macro="creator"/>
<requirements>
<requirement type="package" version="4.2.0">gawk</requirement>
<requirement type="package" version="5.3.0">gawk</requirement>
</requirements>
<version_command>awk --version | head -n 1</version_command>
<command>
Expand All @@ -15,9 +16,9 @@
-v FS=' '
-v OFS=' '
--re-interval
-f "$awk_script"
"$infile"
> "$outfile"
-f '$awk_script'
'$infile'
> '$outfile'
]]>
</command>
<configfiles>
Expand Down Expand Up @@ -119,7 +120,6 @@ The select tool searches the data for lines containing or not containing a match
- **$** matches the end of a line or string.
- **\|** Separates alternate possibilities.
@REFERENCES@
]]>
</help>
<expand macro="citations" />
Expand Down
4 changes: 2 additions & 2 deletions tools/text_processing/text_processing/cat.xml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
<tool id="tp_cat" name="Concatenate datasets" version="0.1.1">
<tool id="tp_cat" name="Concatenate datasets" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>tail-to-head (cat)</description>
<macros>
<import>macros.xml</import>
</macros>
<expand macro="creator"/>
<expand macro="requirements" />
<version_command>
<![CDATA[
Expand Down Expand Up @@ -89,7 +90,6 @@ will result in the following::
chr2 100000030 200000955 P 0 +
chr2 100000015 200000999 Q 0 +
@REFERENCES@
]]>
</help>
<expand macro="citations" />
Expand Down
79 changes: 45 additions & 34 deletions tools/text_processing/text_processing/cut.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="tp_cut_tool" name="Advanced Cut" version="@[email protected]">
<tool id="tp_cut_tool" name="Advanced Cut" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>columns from a table (cut)</description>
<macros>
<import>macros.xml</import>
Expand All @@ -8,14 +8,18 @@
<command>
<![CDATA[
cut
#if str($delimiter) != '':
-d"${delimiter}"
$complement
$cut_type_options.cut_element
#if $cut_type_options.cut_element != '-f'
'$cut_type_options.list'
#else
#if str($cut_type_options.delimiter) != '':
-d"$cut_type_options.delimiter"
#end if
'$cut_type_options.colnames_option.list'
#end if
${complement}
${cut_type_options.cut_element}
'${cut_type_options.list}'
'${input}'
> '${output}'
'$input'
> '$output'
]]>
</command>
<inputs>
Expand All @@ -24,23 +28,33 @@
<option value="">Keep</option>
<option value="--complement">Discard</option>
</param>
<param name="delimiter" type="select" label="Delimited by">
<option value="">Tab</option>
<option value=" ">Whitespace</option>
<option value=".">Dot</option>
<option value=",">Comma</option>
<option value="-">Dash</option>
<option value="_">Underscore</option>
<option value="|">Pipe</option>
</param>
<conditional name="cut_type_options">
<param name="cut_element" type="select" label="Cut by">
<option value="-f">fields</option>
<option value="-c">characters</option>
<option value="-b">bytes</option>
</param>
<when value="-f">
<param name="list" type="data_column" data_ref="input" multiple="true" label="List of Fields" help="(-f)" />
<param name="delimiter" type="select" label="Delimited by">
<option value="">Tab</option>
<option value=" ">Whitespace</option>
<option value=".">Dot</option>
<option value=",">Comma</option>
<option value="-">Dash</option>
<option value="_">Underscore</option>
<option value="|">Pipe</option>
</param>
<conditional name="colnames_option">
<param name="header" type="select" label="Is there a header for the data's columns ?">
<option value="Y">Yes</option>
<option value="N" selected="true">No</option>
</param>
<when value="Y">
<param name="list" type="data_column" data_ref="input" use_header_names="true" multiple="true" label="List of Fields" help="(-f)" />
</when>
<when value="N">
<param name="list" type="data_column" data_ref="input" multiple="true" label="List of Fields" help="(-f)" />
</when>
</conditional>
</when>
<when value="-c">
<param name="list" type="text" value="" label="List of characters" help="These will be kept/discarded (depending on 'operation'). &lt;BR /&gt; Examples: 1,3,4 or 2-5">
Expand All @@ -51,15 +65,6 @@
</sanitizer>
</param>
</when>
<when value="-b">
<param name="list" type="text" value="" label="List of Bytes" help="These will be kept/discarded (depending on 'operation'). &lt;BR /&gt; Examples: 1,3,4 or 2-5">
<sanitizer>
<valid initial="string.printable">
<remove value="&apos;"/>
</valid>
</sanitizer>
</param>
</when>
</conditional>
</inputs>
<outputs>
Expand All @@ -70,7 +75,7 @@
<conditional name="cut_type_options.cut_element">
<!-- fields -->
<when value="-f">
<conditional name="delimiter">
<conditional name="cut_type_options.delimiter">
<when value="T">
<conditional name="input">
<when datatype_isinstance="interval">
Expand Down Expand Up @@ -184,20 +189,26 @@
</data>
</outputs>
<tests>
<test>
<test expect_num_outputs="1">
<param name="input" value="cut1.txt"/>
<param name="list" value="1,3,4"/>
<param name="delimiter" value=""/>
<output name="output" file="cut_results1.txt"/>
</test>
<test>
<test expect_num_outputs="1">
<param name="input" value="cut1.txt"/>
<param name="list" value="2" />
<param name="delimiter" value="" />
<conditional name="cut_type_options">
<param name="delimiter" value="" />
<conditional name="colnames_option">
<param name="header" value="Y"/>
<!-- in the test we can (apparently) not select by header name -->
<param name="list" value="2" />
</conditional>
</conditional>
<param name="complement" value="--complement" />
<output name="output" file="cut_results2.txt"/>
</test>
<test>
<test expect_num_outputs="1">
<param name="input" value="cut1.txt"/>
<param name="list" value="-3" />
<param name="delimiter" value="" />
Expand Down
14 changes: 7 additions & 7 deletions tools/text_processing/text_processing/easyjoin.xml
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
<tool id="tp_easyjoin_tool" name="Join" version="@[email protected]">
<tool id="tp_easyjoin_tool" name="Join" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>two files</description>
<macros>
<import>macros.xml</import>
</macros>
<expand macro="creator"/>
<expand macro="requirements">
<requirement type="package" version="5.22.0.1">perl</requirement>
<requirement type="package" version="5.32">perl</requirement>
</expand>
<version_command>join --version | head -n 1</version_command>
<command>
<![CDATA[
cp $__tool_directory__/sort-header ./ &&
cp '$__tool_directory__/sort-header' ./ &&
chmod +x sort-header &&
perl $__tool_directory__/easyjoin
perl '$__tool_directory__/easyjoin'
$jointype
-t $'\t'
$header
Expand All @@ -20,8 +21,8 @@
$ignore_case
-1 '$column1'
-2 '$column2'
"$infile1"
"$infile2"
'$infile1'
'$infile2'
> '$output'
]]>
</command>
Expand Down Expand Up @@ -109,7 +110,6 @@ This tool joins two tabular files based on a common key column.
* The header line (**Fruit Color Price**) was joined and kept as first line.
* Missing values ( Avocado's color, missing from the first file ) are replaced with a period character.
@REFERENCES@
]]>
</help>
<expand macro="citations" />
Expand Down
6 changes: 3 additions & 3 deletions tools/text_processing/text_processing/find_and_replace.xml
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
<tool id="tp_find_and_replace" name="Replace" version="@[email protected]">
<tool id="tp_find_and_replace" name="Replace" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>parts of text</description>
<macros>
<import>macros.xml</import>
</macros>
<expand macro="creator"/>
<requirements>
<requirement type="package" version="5.22.0.1">perl</requirement>
<requirement type="package" version="5.32">perl</requirement>
</requirements>
<command>
<![CDATA[
Expand Down Expand Up @@ -240,7 +241,6 @@ The Find & Replace tool searches the data for lines containing or not containing
- **\\w** matches a single letter or digit or an underscore.
- **\\s** matches a single white-space (space or tabs).
@REFERENCES@
]]>
</help>
<expand macro="citations" />
Expand Down
28 changes: 14 additions & 14 deletions tools/text_processing/text_processing/grep.xml
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
<tool id="tp_grep_tool" name="Search in textfiles" version="@[email protected]">
<tool id="tp_grep_tool" name="Search in textfiles" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>(grep)</description>
<macros>
<import>macros.xml</import>
</macros>
<expand macro="creator"/>
<requirements>
<requirement type="package" version="2.14">grep</requirement>
<requirement type="package" version="4.4">sed</requirement><!-- for ansi2html.sh -->
<requirement type="package" version="3.11">grep</requirement>
<requirement type="package" version="4.8">sed</requirement><!-- for ansi2html.sh -->
</requirements>
<version_command>grep --version | head -n 1</version_command>
<command>
Expand All @@ -19,17 +20,17 @@
-B $lines_before
$invert
$case_sensitive
-- "${url_paste}"
'${infile}' | $__tool_directory__/ansi2html.sh > "${output}"
-- '${url_paste}'
'${infile}' | '$__tool_directory__/ansi2html.sh' > '${output}'
#else:
grep
$regex_type
-A $lines_after
-B $lines_before
$invert
$case_sensitive
-- "${url_paste}"
'${infile}' | grep -v "^--$" > "${output}"
-- '${url_paste}'
'${infile}' | grep -v "^--$" > '${output}'
#end if
]]>
</command>
Expand All @@ -42,9 +43,9 @@
</param>

<param name="regex_type" type="select" label="Type of regex">
<option value="-G">Basic</option>
<option value="-P" selected="true">Perl</option>
<option value="-E">Extended (egrep)</option>
<option value="-G">Basic (-G)</option>
<option value="-P" selected="true">Perl (-P)</option>
<option value="-E">Extended (egrep -E)</option>
</param>

<param name="url_paste" type="text" label="Regular Expression" help="See below for more details">
Expand Down Expand Up @@ -113,16 +114,16 @@
<param name="color" value="NOCOLOR" />
<output name="output" file="egrep_results1.txt" />
</test>
<test><!-- same regex as egrep test, but different outcome with basic regex -->
<test><!-- tests basic regex; + must be backslashed to match -->
<param name="infile" value="egrep1.txt" />
<param name="case_sensitive" value="case sensitive" />
<param name="regex_type" value="-G" />
<param name="invert" value="" />
<param name="url_paste" value="[^ ]+" />
<param name="url_paste" value="[^ ]\+" />
<param name="lines_before" value="0" />
<param name="lines_after" value="0" />
<param name="color" value="NOCOLOR" />
<output name="output" file="egrep_results2.txt" />
<output name="output" file="egrep_results1.txt" />
</test>
</tests>
<help>
Expand Down Expand Up @@ -187,7 +188,6 @@ The select tool searches the data for lines containing or not containing a match
- **$** matches the end of a line or string.
- **\|** Separates alternate possibilities.
@REFERENCES@
]]>
</help>
<expand macro="citations" />
Expand Down
4 changes: 2 additions & 2 deletions tools/text_processing/text_processing/head.xml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
<tool id="tp_head_tool" name="Select first" version="@[email protected]">
<tool id="tp_head_tool" name="Select first" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>lines from a dataset (head)</description>
<macros>
<import>macros.xml</import>
</macros>
<expand macro="creator"/>
<expand macro="requirements" />
<version_command>head --version | head -n 1</version_command>
<command>
Expand Down Expand Up @@ -56,7 +57,6 @@ will produce::
chr7 56632 56652 D17003_CTCF_R6 310 +
chr7 56736 56756 D17003_CTCF_R7 354 +
@REFERENCES@
]]>
</help>
<expand macro="citations" />
Expand Down
Loading

0 comments on commit 70980e3

Please sign in to comment.