Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implemented DataFrame.lookup #1785

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
68 changes: 68 additions & 0 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10246,6 +10246,74 @@ def from_dict(data, orient="columns", dtype=None, columns=None) -> "DataFrame":
"""
return DataFrame(pd.DataFrame.from_dict(data, orient=orient, dtype=dtype, columns=columns))

def lookup(self, row_labels, col_labels) -> np.ndarray:
    """
    Label-based "fancy indexing" function for DataFrame.

    Given equal-length arrays of row and column labels, return an
    array of the values corresponding to each (row, col) pair.

    `row_labels` and `col_labels` do not support the types `Series` and `Index`,
    to prevent performance degradation.

    Parameters
    ----------
    row_labels : sequence
        The row labels to use for lookup.
    col_labels : sequence
        The column labels to use for lookup.

    Returns
    -------
    numpy.ndarray
        The found values.

    Raises
    ------
    TypeError
        If `row_labels` or `col_labels` is a Koalas `Series` or `Index`.
    ValueError
        If `row_labels` and `col_labels` do not have the same length.

    Examples
    --------
    >>> kdf = ks.DataFrame({'A': [3, 4, 5, 6, 7],
    ...                     'B': [10.0, 20.0, 30.0, 40.0, 50.0],
    ...                     'C': ['a', 'b', 'c', 'd', 'e']})
    >>> kdf
       A     B  C
    0  3  10.0  a
    1  4  20.0  b
    2  5  30.0  c
    3  6  40.0  d
    4  7  50.0  e

    >>> kdf.lookup([0], ["C"])
    array(['a'], dtype=object)

    >>> kdf.lookup([2, 3], ["A", "B"])
    array([ 5., 40.])
    """
    # Imported locally, as in the original change (presumably to avoid a
    # circular import between frame/series/indexes -- TODO confirm).
    from databricks.koalas.series import Series
    from databricks.koalas.indexes import Index, MultiIndex

    if isinstance(row_labels, (Series, Index)):
        raise TypeError(
            "'row_labels' doesn't support type '{}'.".format(type(row_labels).__name__)
        )
    if isinstance(col_labels, (Series, Index)):
        raise TypeError(
            "'col_labels' doesn't support type '{}'.".format(type(col_labels).__name__)
        )

    # Validate the pairing up front for both index kinds, so a mismatch is
    # reported before `.loc` / `.to_pandas()` do any work. The message is the
    # one the MultiIndex branch already raised.
    if len(row_labels) != len(col_labels):
        raise ValueError("Row labels must have same size as column labels")

    if not isinstance(self.index, MultiIndex):
        # Select only the distinct rows/columns that are referenced, bring that
        # subset to pandas, and let pandas resolve the (row, col) pairs.
        # NOTE(review): pandas deprecated `DataFrame.lookup` in 1.2 -- confirm
        # the pandas versions this code path must support.
        return (
            self.loc[list(set(row_labels)), list(set(col_labels))]
            .to_pandas()
            .lookup(row_labels, col_labels)
        )

    # NOTE(review): every (row, col) pair is resolved through its own `.loc`
    # access; a reviewer asked whether this launches as many jobs as there are
    # labels -- confirm the cost before using this with many labels.
    lookups = [
        self.loc[row_label, col_label]
        for row_label, col_label in zip(row_labels, col_labels)
    ]
    return np.asarray(pd.Series(lookups))

def _to_internal_pandas(self):
"""
Return a pandas DataFrame directly from _internal to avoid overhead of copy.
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ class _MissingPandasLikeDataFrame(object):
interpolate = _unsupported_function("interpolate")
itertuples = _unsupported_function("itertuples")
last = _unsupported_function("last")
lookup = _unsupported_function("lookup")
mode = _unsupported_function("mode")
reindex_like = _unsupported_function("reindex_like")
rename_axis = _unsupported_function("rename_axis")
Expand Down
57 changes: 57 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4098,6 +4098,63 @@ def test_from_dict(self):
kdf = ks.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"])
self.assert_eq(pdf, kdf)

def test_lookup(self):
    """Compare ``DataFrame.lookup`` against pandas and check its error cases."""
    pdf = pd.DataFrame(
        {
            "A": [3, 4, 5, 6, 7],
            "B": [10.0, 20.0, 30.0, 40.0, 50.0],
            "C": ["a", "b", "c", "d", "e"],
        }
    )
    kdf = ks.from_pandas(pdf)

    # Each entry: (comparison helper, row labels, column labels).
    # Covers list, tuple and dict label containers on a single-level index.
    single_index_cases = [
        (self.assert_eq, [0], ["C"]),
        (self.assert_list_eq, [0, 3, 4], ["A", "C", "A"]),
        (self.assert_eq, (0,), ("C",)),
        (self.assert_list_eq, (0, 3, 4), ("A", "C", "A")),
        (self.assert_eq, {0: None}, {"C": None}),
        (
            self.assert_list_eq,
            {0: None, 3: None, 4: None},
            {"A": None, "C": None, "B": None},
        ),
    ]
    for compare, rows, cols in single_index_cases:
        compare(pdf.lookup(rows, cols), kdf.lookup(rows, cols))

    # MultiIndex
    pdf.index = pd.MultiIndex.from_tuples(
        [("a", "v"), ("b", "w"), ("c", "x"), ("d", "y"), ("e", "z")]
    )
    kdf = ks.from_pandas(pdf)

    multi_index_cases = [
        (self.assert_eq, [("a", "v")], ["C"]),
        (self.assert_list_eq, [("a", "v"), ("d", "y"), ("e", "z")], ["A", "C", "A"]),
    ]
    for compare, rows, cols in multi_index_cases:
        compare(pdf.lookup(rows, cols), kdf.lookup(rows, cols))

    # Mismatched label lengths are rejected.
    with self.assertRaisesRegex(
        ValueError, "Row labels must have same size as column labels"
    ):
        kdf.lookup([0, 3, 4], ["A", "C"])

    # Koalas Series/Index are rejected for either argument. The arguments are
    # built lazily so they are created inside the assertRaises context.
    unsupported_cases = [
        ("'row_labels' doesn't support type 'Index'.", lambda: (ks.Index([0]), ["C"])),
        ("'row_labels' doesn't support type 'Series'.", lambda: (ks.Series([0]), ["C"])),
        ("'col_labels' doesn't support type 'Index'.", lambda: ([0], ks.Index(["C"]))),
        ("'col_labels' doesn't support type 'Series'.", lambda: ([0], ks.Series(["C"]))),
    ]
    for expected_message, build_args in unsupported_cases:
        with self.assertRaisesRegex(TypeError, expected_message):
            kdf.lookup(*build_args())

def test_pad(self):
pdf = pd.DataFrame(
{
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ Indexing, iteration
DataFrame.items
DataFrame.iteritems
DataFrame.iterrows
DataFrame.lookup
DataFrame.keys
DataFrame.pop
DataFrame.tail
Expand Down