Skip to content

Commit

Permalink
Initial implementation of Data.read_many (#11490)
Browse files Browse the repository at this point in the history
- Part of #11311
- Adds ability to read a list of files (Vector, Column, Table) into a Vector.
- Reading into a Table of objects or merged will come in a next PR.
  • Loading branch information
radeusgd authored Nov 8, 2024
1 parent 67db825 commit e76fe90
Show file tree
Hide file tree
Showing 17 changed files with 392 additions and 3 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,14 @@
programmatically.][11255]
- [DB_Table may be saved as a Data Link.][11371]
- [Support for dates before 1900 in Excel and signed AWS requests.][11373]
- [Added `Data.read_many`, which allows reading a list of files in a single
operation.][11490]

[11235]: https://github.com/enso-org/enso/pull/11235
[11255]: https://github.com/enso-org/enso/pull/11255
[11371]: https://github.com/enso-org/enso/pull/11371
[11373]: https://github.com/enso-org/enso/pull/11373
[11490]: https://github.com/enso-org/enso/pull/11490

#### Enso Language & Runtime

Expand Down
57 changes: 56 additions & 1 deletion distribution/lib/Standard/Base/0.0.0-dev/src/Data.enso
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import project.Any.Any
import project.Data.Pair.Pair
import project.Data.Read.Many_Files_List.Many_Files_List
import project.Data.Read.Return_As.Return_As
import project.Data.Text.Encoding.Encoding
import project.Data.Text.Text
import project.Data.Vector.Vector
Expand Down Expand Up @@ -27,7 +29,7 @@ import project.System.File.Generic.Writable_File.Writable_File
from project.Data.Boolean import Boolean, False, True
from project.Meta.Enso_Project import enso_project
from project.Metadata.Choice import Option
from project.Metadata.Widget import Folder_Browse, Text_Input
from project.Metadata.Widget import Folder_Browse, Text_Input, Vector_Editor
from project.System.File_Format import Auto_Detect, File_Format

## ALIAS load, open
Expand Down Expand Up @@ -92,6 +94,59 @@ read path=(Missing_Argument.throw "path") format=Auto_Detect (on_problems : Prob
if file_obj.is_directory then Error.throw (Illegal_Argument.Error "Cannot `read` a directory, use `Data.list`.") else
file_obj.read format on_problems

## ALIAS load, open
GROUP Input
ICON data_input
Reads a list of files into Enso.

Arguments:
- paths: A list of files to load. It can be a Vector, Column or Table of
files, paths or URIs to fetch. If a Table is provided, it must either
contain a single column or a column called `path` (case insensitive).
- format: A `File_Format` object used to read files into memory.
If `Auto_Detect` is specified, each file determines the specific
type and configures it appropriately. If there is no matching type then
a `File_Error.Unsupported_Type` error is returned.
- return: Specifies the shape of the data to return.
- on_problems: Specifies the behavior when a problem occurs during the
function.
By default, if one of the files fails to load, a warning is issued and the
entry for that file becomes `Nothing`, but the operation proceeds.
If set to `Report_Error`, the operation fails with a dataflow error on the
first failing file.
If set to `Ignore`, the operation proceeds without errors or warnings,
replacing files that fail to load with `Nothing`.

! Request Caching

Responses to HTTP data requests are cached, and additional requests for the
same resources will use the cache, saving a round-trip call to the remote
server. Two resources are considered the same if the URIs and request
headers are the same. Header order does not affect sameness.

The cache respects the "max-age" and "Age" response headers; see
`Data.fetch` for more details.

The cached values are retained as long as the project remains open. Closing
a project will clear the cache.

> Example
Read all CSV files from a directory into a Vector of the loaded objects.

from Standard.Table import all
import Standard.Examples

files = Data.list name_filter="*.csv"
example_csv_dir_to_table = Data.read_many files
@paths (Vector_Editor item_editor=Text_Input item_default='""')
@format File_Format.default_widget
read_many : Many_Files_List -> File_Format -> Return_As -> Problem_Behavior -> Any ! File_Error
read_many (paths : Many_Files_List = Missing_Argument.throw "paths") format=Auto_Detect return=..Vector (on_problems : Problem_Behavior = ..Report_Warning) =
    # Resolve the requested return shape first.
    resolved_return = Return_As.resolve return
    # Each entry is read on its own; `on_problems` is passed both to the
    # individual read and to `map`.
    read_single = path-> Data.read path format on_problems
    results = paths.paths_to_load.map on_problems=on_problems read_single
    # The resolved return strategy decides how the results are shaped.
    resolved_return.make_return paths results

## ALIAS load text, open text
GROUP Input
ICON data_input
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import project.Data.Text.Text
import project.Data.Vector.Vector

## An abstraction over the different inputs that can describe a list of files
   to be read.

   Types such as Vector or Column provide a conversion to this type, which is
   what allows them to be passed to `Data.read_many`.
type Many_Files_List
    ## PRIVATE
       Pairs the value the list was created from (`original_value`) with the
       Vector of entries that should be loaded (`paths_to_load`).
    Value original_value paths_to_load:Vector

    ## PRIVATE
       The type name followed by the text form of the original value.
    to_text self -> Text =
        original_text = self.original_value.to_text
        "Many_Files_List " + original_text

    ## PRIVATE
       The type name followed by the display text of the original value.
    to_display_text self -> Text =
        original_text = self.original_value.to_display_text
        "Many_Files_List " + original_text

## PRIVATE
   A Vector is used directly: it is both the original value and the list of
   entries to load.
Many_Files_List.from (that : Vector) =
    Many_Files_List.Value original_value=that paths_to_load=that
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import project.Any.Any
import project.Data.Text.Text
import project.Data.Read.Many_Files_List.Many_Files_List
import project.Data.Vector.Vector
import project.Error.Error
import project.Errors.Common.Type_Error
import project.Errors.Illegal_Argument.Illegal_Argument
import project.Function.Function
import project.Metadata.Display
import project.Metadata.Widget
import project.Nothing.Nothing
import project.Panic.Panic
from project.Data.Boolean import Boolean, False, True
from project.Metadata.Choice import Option
from project.Metadata.Widget import Single_Choice

polyglot java import org.enso.base.read.ReadManyReturnSPI

# Lists the return types registered through `ReadManyReturnSPI`.
# `False` is passed as the `refresh` argument of `get_types`.
private _get_known_return_classes -> Vector =
    registered_types = ReadManyReturnSPI.get_types False
    Vector.from_polyglot_array registered_types

## A common interface that represents ways to return a list of files that have
   been read.
type Return_As
    ## PRIVATE
       Wraps a concrete return strategy (e.g. a `Return_As_Base` value) to
       which all operations are delegated.
    Instance underlying

    ## PRIVATE
       The text form of the wrapped strategy.
    to_text self -> Text = self.underlying.to_text

    ## PRIVATE
       The display text of the wrapped strategy.
    to_display_text self -> Text = self.underlying.to_display_text

    ## PRIVATE
       Builds the final result by delegating to the wrapped strategy.

       Arguments:
       - input: The list of files that was requested.
       - objects: The objects loaded for the entries of `input`.
    make_return self (input : Many_Files_List) (objects : Vector Any) =
        self.underlying.make_return input objects

    ## PRIVATE
       Resolve an unresolved constructor to the actual type.

       A `Function` is treated as an unresolved constructor: every known
       return type is asked, in order, to resolve it, and the first result
       that is not `Nothing` is returned. If no type accepts it, an
       `Illegal_Argument` error is returned. A `Return_As` value is returned
       as is; anything else panics with a `Type_Error`.
    private resolve value = case value of
        _ : Function ->
            types = _get_known_return_classes
            # Walk the known types one by one until one of them resolves the value.
            try_next idx =
                if idx >= types.length then Error.throw (Illegal_Argument.Error "Expected Return_As, but got a function.") else
                    resolved = (types.at idx).resolve value
                    if resolved.is_nothing then @Tail_Call try_next (idx + 1) else resolved
            try_next 0
        _ : Return_As -> value
        _ -> Panic.throw (Type_Error.Error Return_As value "Expected `return` to be a Return_As type, but got {got}.")

    ## PRIVATE
       A dropdown offering the options of every known return type.
    default_widget : Widget
    default_widget =
        # Each type contributes a Vector of options, so the results have to be
        # flattened into a single Vector of options for the dropdown.
        options = _get_known_return_classes.flat_map .get_dropdown_options
        Single_Choice display=Display.Always values=options

## PRIVATE
   The return shapes for `Data.read_many` defined in this module.
type Return_As_Base
    ## Will return a Vector of objects that were loaded.
       The order of the returned Vector is the same as in the input.
    Vector

    ## PRIVATE
       The dropdown entries describing the constructors of this type.
    get_dropdown_options : Vector Option
    get_dropdown_options = [Option "Vector" "..Vector"]

    ## PRIVATE
       Tries to interpret `value` as a `Return_As_Base`.

       Returns `Nothing` when the type ascription panics with a `Type_Error`,
       which lets `Return_As.resolve` move on to the next known type.
    resolve value =
        Panic.catch Type_Error (value:Return_As_Base) _->Nothing

    ## PRIVATE
       Returns the loaded objects unchanged.

       Arguments:
       - input: The list of files that was requested; not used here.
       - objects: The objects that were loaded.
    make_return self (input : Many_Files_List) (objects : Vector Any) =
        # `input` is deliberately unused.
        _ = input
        objects

## PRIVATE
   Wraps a `Return_As_Base` value so that it can be used wherever a
   `Return_As` is expected.
Return_As.from (that : Return_As_Base) =
    Return_As.Instance underlying=that
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from Standard.Base import all
import Standard.Base.Data.Read.Many_Files_List.Many_Files_List
import Standard.Base.Errors.Common.Index_Out_Of_Bounds
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import Standard.Base.Errors.Illegal_State.Illegal_State
Expand Down Expand Up @@ -2168,3 +2169,8 @@ Vector.from (that:DB_Column) =

## PRIVATE
   Allows a DB_Column to be used as `Cleansable_Text`: the wrapped function
   replaces matches of the regex `pattern` with `replace_with` via
   `text_replace` and renames the result to the name of the original column.
Cleansable_Text.from (that:DB_Column) = Cleansable_Text.Value (pattern->replace_with-> (that.text_replace (regex pattern) replace_with).rename that.name)

## PRIVATE
   Database columns are rejected: this conversion always returns an
   `Illegal_Argument` error telling the user to materialize the column first.
Many_Files_List.from (that : DB_Column) =
    # The argument only selects this conversion; its value is not used.
    _ = that
    message = "`read_many` cannot be used with Database columns. Materialize the column into memory using `.read` first."
    Error.throw (Illegal_Argument.Error message)
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from Standard.Base import all
import Standard.Base.Data.Array_Proxy.Array_Proxy
import Standard.Base.Data.Filter_Condition as Filter_Condition_Module
import Standard.Base.Data.Read.Many_Files_List.Many_Files_List
import Standard.Base.Data.Time.Errors.Date_Time_Format_Parse_Error
import Standard.Base.Data.Vector.Builder
import Standard.Base.Errors.Common.Additional_Warnings
Expand Down Expand Up @@ -3120,3 +3121,8 @@ make_literal_table connection column_vectors column_names alias =
connection.dialect.make_cast base_column sql_type infer_type_from_database

DB_Table.Value alias connection internal_columns context

## PRIVATE
   Database tables are rejected: this conversion always returns an
   `Illegal_Argument` error telling the user to materialize the table first.
Many_Files_List.from (that : DB_Table) =
    # The argument only selects this conversion; its value is not used.
    _ = that
    message = "`read_many` cannot be used with Database tables. Materialize the table into memory using `.read` first."
    Error.throw (Illegal_Argument.Error message)
1 change: 0 additions & 1 deletion distribution/lib/Standard/Database/0.0.0-dev/src/Main.enso
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,3 @@ export project.Extensions.Upload_In_Memory_Table.update_rows
export project.SQL_Query.SQL_Query

export project.Update_Action.Update_Action

7 changes: 7 additions & 0 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from Standard.Base import all
import Standard.Base.Data.Array_Proxy.Array_Proxy
import Standard.Base.Data.Read.Many_Files_List.Many_Files_List
import Standard.Base.Data.Vector.No_Wrap
import Standard.Base.Errors.Common.Arithmetic_Error
import Standard.Base.Errors.Common.Incomparable_Values
Expand All @@ -22,6 +23,7 @@ import project.Internal.Column_Ops
import project.Internal.Date_Time_Helpers
import project.Internal.Java_Problems
import project.Internal.Parse_Values_Helper
import project.Internal.Read_Many_Helpers
import project.Internal.Storage
import project.Internal.Value_Type_Helpers
import project.Internal.Widget_Helpers
Expand Down Expand Up @@ -2927,3 +2929,8 @@ apply_unary_map column:Column new_name:Text function expected_result_type:Value_
Java_Problems.with_map_operation_problem_aggregator column.name Problem_Behavior.Report_Warning java_problem_aggregator->
map_column = UnaryOperation.mapFunction column.java_column function nothing_unchanged storage_type new_name java_problem_aggregator
Column.Value map_column

## PRIVATE
   Converts a Column into a list of files, using its elements as the entries
   to load. The column's value type is validated first.
Many_Files_List.from (that : Column) =
    Read_Many_Helpers.ensure_column_type_valid_to_be_files_list that <|
        Many_Files_List.Value original_value=that paths_to_load=that.to_vector
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
private

from Standard.Base import all
import Standard.Base.Data.Read.Many_Files_List.Many_Files_List
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument

import project.Column.Column
import project.Errors.Invalid_Value_Type
import project.Table.Table
import project.Value_Type.Value_Type

## PRIVATE
   Picks the column of a Table that holds the list of files and wraps it into
   a `Many_Files_List`.

   If the table has exactly one column, that column is used. Otherwise a
   column named `path` (matched case-insensitively) is looked up; an
   `Illegal_Argument` error is returned if there is no such column or if more
   than one column matches. The selected column must additionally pass
   `ensure_column_type_valid_to_be_files_list`.
find_files_list_in_table (that : Table) -> Many_Files_List =
    found_column = if that.column_count == 1 then that.at 0 else
        path_columns = that.select_columns "path" case_sensitivity=..Insensitive on_problems=..Report_Error
        # A failed selection and an empty selection both mean there is no usable `path` column.
        not_found = path_columns.is_error || (path_columns.column_count == 0)
        if not_found then Error.throw (Illegal_Argument.Error "To use a Table as file list, it must be a single column or contain a `path` column (case insensitive).") else
            # The message names the `path` column that was actually searched for.
            if path_columns.column_count > 1 then Error.throw (Illegal_Argument.Error "Multiple `path` column candidates found: "+path_columns.column_names.to_display_text+".") else
                path_columns.at 0
    ensure_column_type_valid_to_be_files_list found_column <|
        Many_Files_List.Value that found_column.to_vector

## PRIVATE
   Runs `action` only if the value type of `column` is acceptable for a list
   of files; otherwise returns an `Invalid_Value_Type` error.

   Arguments:
   - column: The column whose value type is checked.
   - action: The suspended computation to run when the check passes.
ensure_column_type_valid_to_be_files_list (column : Column) ~action =
    case column.value_type of
        # Columns containing File objects will be Mixed
        Value_Type.Mixed -> action
        # Columns containing paths as Text will be Char
        Value_Type.Char _ _ -> action
        _ -> Error.throw (Invalid_Value_Type.Column "Text or Mixed" column.value_type column.name)
1 change: 0 additions & 1 deletion distribution/lib/Standard/Table/0.0.0-dev/src/Main.enso
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,3 @@ export project.Table.Table
export project.Value_Type.Auto
export project.Value_Type.Bits
export project.Value_Type.Value_Type

6 changes: 6 additions & 0 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Table.enso
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from Standard.Base import all
import Standard.Base.Data.Array_Proxy.Array_Proxy
import Standard.Base.Data.Filter_Condition as Filter_Condition_Module
import Standard.Base.Data.Read.Many_Files_List.Many_Files_List
import Standard.Base.Data.Time.Errors.Date_Time_Format_Parse_Error
import Standard.Base.Data.Vector.No_Wrap
import Standard.Base.Errors.Common.Additional_Warnings
Expand Down Expand Up @@ -47,6 +48,7 @@ import project.Internal.Lookup_Helpers
import project.Internal.Lookup_Helpers.Lookup_Column
import project.Internal.Parse_Values_Helper
import project.Internal.Problem_Builder.Problem_Builder
import project.Internal.Read_Many_Helpers
import project.Internal.Replace_Helpers
import project.Internal.Split_Tokenize
import project.Internal.Table_Helpers
Expand Down Expand Up @@ -3881,3 +3883,7 @@ make_fill_nothing_default_widget table cache=Nothing =
## PRIVATE
   Helper method for internal use to make a Table from a Java Table.

   Arguments:
   - java_table: The Java table object that gets wrapped with `Table.Value`.
from_java_table java_table = Table.Value java_table

## PRIVATE
   Converts a Table into a list of files by locating the column that holds
   the entries to load; see `Read_Many_Helpers.find_files_list_in_table`.
Many_Files_List.from (that : Table) =
    Read_Many_Helpers.find_files_list_in_table that=that
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package org.enso.base.read;

/**
 * Service provider that registers the {@code Return_As_Base} type from the
 * {@code Standard.Base.Data.Read.Return_As} module with {@link ReadManyReturnSPI}.
 */
@org.openide.util.lookup.ServiceProvider(service = ReadManyReturnSPI.class)
public class BaseReadManyReturnSPI extends ReadManyReturnSPI {
  /** Fully qualified name of the Enso module that defines the type. */
  private static final String MODULE_NAME = "Standard.Base.Data.Read.Return_As";

  /** Name of the Enso type inside {@link #MODULE_NAME}. */
  private static final String TYPE_NAME = "Return_As_Base";

  /** {@inheritDoc} */
  @Override
  protected String getModuleName() {
    return MODULE_NAME;
  }

  /** {@inheritDoc} */
  @Override
  protected String getTypeName() {
    return TYPE_NAME;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package org.enso.base.read;

import java.util.ServiceLoader;
import org.enso.base.polyglot.EnsoMeta;
import org.graalvm.polyglot.Value;

/**
 * Base class for service providers that contribute a return type for reading many files.
 *
 * <p>Each provider names an Enso module and a type within it; the type object is looked up
 * through {@link EnsoMeta#getType}.
 */
public abstract class ReadManyReturnSPI {
  /** Loader for all registered providers, using the class loader of this class. */
  private static final ServiceLoader<ReadManyReturnSPI> SERVICE_LOADER =
      ServiceLoader.load(ReadManyReturnSPI.class, ReadManyReturnSPI.class.getClassLoader());

  /**
   * Returns the Enso type objects of all registered providers.
   *
   * @param refresh if {@code true}, the service loader is reloaded before the lookup
   * @return one type object per registered provider
   */
  public static Value[] get_types(boolean refresh) {
    if (refresh) {
      SERVICE_LOADER.reload();
    }
    return SERVICE_LOADER.stream()
        .map(ServiceLoader.Provider::get)
        .map(ReadManyReturnSPI::getTypeObject)
        .toArray(Value[]::new);
  }

  /**
   * Resolves the Enso type described by {@link #getModuleName()} and {@link #getTypeName()}.
   *
   * @return the type object returned by {@link EnsoMeta#getType}
   */
  public Value getTypeObject() {
    String moduleName = getModuleName();
    String typeName = getTypeName();
    return EnsoMeta.getType(moduleName, typeName);
  }

  /** Returns the name of the Enso module that defines the type. */
  protected abstract String getModuleName();

  /** Returns the name of the Enso type within the module. */
  protected abstract String getTypeName();
}
6 changes: 6 additions & 0 deletions test/Base_Tests/src/Network/Http_Spec.enso
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,12 @@ add_specs suite_builder =
r = Data.read (URI.from url_get)
r.should_be_a JS_Object

group_builder.specify "can use URI or Text URLs in Data.read_many" <|
    # The same endpoint is requested twice: once converted with `URI.from`,
    # once passed as `url_get` directly.
    sources = [URI.from url_get, url_get]
    results = Data.read_many sources
    results.should_be_a Vector
    (results.at 0).should_be_a JS_Object
    (results.at 1).should_be_a JS_Object

group_builder.specify "works if HTTP is uppercase" <| Test.with_retries <|
r = Data.fetch (url_get.replace "http" "HTTP")
r.should_be_a JS_Object
Expand Down
Loading

0 comments on commit e76fe90

Please sign in to comment.