From 31d299af56141be0338186fa3e38113846dc9863 Mon Sep 17 00:00:00 2001 From: Isira Seneviratne Date: Thu, 28 Dec 2023 13:27:27 +0530 Subject: [PATCH 1/6] Add select methods returning element streams --- src/main/java/org/jsoup/nodes/Element.java | 35 +++++++++++++++++++ src/main/java/org/jsoup/select/Collector.java | 25 ++++++++----- src/main/java/org/jsoup/select/Selector.java | 27 ++++++++++++++ 3 files changed, 78 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java index 0a63d73bf8..c161967dad 100644 --- a/src/main/java/org/jsoup/nodes/Element.java +++ b/src/main/java/org/jsoup/nodes/Element.java @@ -485,6 +485,41 @@ public Elements select(Evaluator evaluator) { return Selector.select(evaluator, this); } + /** + * Find elements that match the {@link Selector} CSS query, with this element as the starting context. Matched elements + * may include this element, or any of its children. + *

<p>This method is generally more powerful to use than the DOM-type {@code getElementBy*} methods, because + * multiple filters can be combined, e.g.:</p>

+ * + *

<p>See the query syntax documentation in {@link org.jsoup.select.Selector}.</p>

+ *

<p>Also known as {@code querySelectorAll()} in the Web DOM.</p>

+ * + * @param cssQuery a {@link Selector} CSS-like query + * @return a {@link Stream} containing elements that match the query (empty if none match) + * @see Selector selector query syntax + * @see QueryParser#parse(String) + * @throws Selector.SelectorParseException (unchecked) on an invalid CSS query. + * @since 1.17.2 + */ + public Stream selectAsStream(String cssQuery) { + return Selector.selectAsStream(cssQuery, this); + } + + /** + * Find elements that match the supplied Evaluator. This has the same functionality as {@link #select(String)}, but + * may be useful if you are running the same query many times (on many documents) and want to save the overhead of + * repeatedly parsing the CSS query. + * @param evaluator an element evaluator + * @return a {@link Stream} containing elements that match the query (empty if none match) + * @since 1.17.2 + */ + public Stream selectAsStream(Evaluator evaluator) { + return Selector.selectAsStream(evaluator, this); + } + /** * Find the first Element that matches the {@link Selector} CSS query, with this element as the starting context. *

This is effectively the same as calling {@code element.select(query).first()}, but is more efficient as query diff --git a/src/main/java/org/jsoup/select/Collector.java b/src/main/java/org/jsoup/select/Collector.java index 02b0528384..6638f5ef6b 100644 --- a/src/main/java/org/jsoup/select/Collector.java +++ b/src/main/java/org/jsoup/select/Collector.java @@ -3,8 +3,8 @@ import org.jsoup.nodes.Element; import org.jspecify.annotations.Nullable; -import java.util.Optional; import java.util.stream.Collectors; +import java.util.stream.Stream; /** * Collects a list of elements that match the supplied criteria. @@ -21,12 +21,22 @@ private Collector() {} @param root root of tree to descend @return list of matches; empty if none */ - public static Elements collect (Evaluator eval, Element root) { - eval.reset(); + public static Elements collect(Evaluator eval, Element root) { + return stream(eval, root).collect(Collectors.toCollection(Elements::new)); + } + + /** + * Obtain a stream of elements by visiting root and every descendant of root and testing it + * against the evaluator. 
+ * @param evaluator Evaluator to test elements against + * @param root root of tree to descend + * @return A {@link Stream} of matches + */ + public static Stream stream(Evaluator evaluator, Element root) { + evaluator.reset(); return root.stream() - .filter(eval.asPredicate(root)) - .collect(Collectors.toCollection(Elements::new)); + .filter(evaluator.asPredicate(root)); } /** @@ -37,9 +47,6 @@ public static Elements collect (Evaluator eval, Element root) { @return the first match; {@code null} if none */ public static @Nullable Element findFirst(Evaluator eval, Element root) { - eval.reset(); - - Optional first = root.stream().filter(eval.asPredicate(root)).findFirst(); - return first.orElse(null); + return stream(eval, root).findFirst().orElse(null); } } diff --git a/src/main/java/org/jsoup/select/Selector.java b/src/main/java/org/jsoup/select/Selector.java index 322cfa07bc..6c5bf99140 100644 --- a/src/main/java/org/jsoup/select/Selector.java +++ b/src/main/java/org/jsoup/select/Selector.java @@ -6,6 +6,7 @@ import java.util.Collection; import java.util.IdentityHashMap; +import java.util.stream.Stream; /** * CSS-like element selector, that finds elements matching a query. @@ -114,6 +115,32 @@ public static Elements select(Evaluator evaluator, Element root) { return Collector.collect(evaluator, root); } + /** + * Find elements matching selector. + * + * @param query CSS selector + * @param root root element to descend into + * @return matching elements, empty if none + * @throws Selector.SelectorParseException (unchecked) on an invalid CSS query. + */ + public static Stream selectAsStream(String query, Element root) { + Validate.notEmpty(query); + return selectAsStream(QueryParser.parse(query), root); + } + + /** + * Find elements matching selector. 
+ * + * @param evaluator CSS selector + * @param root root element to descend into + * @return matching elements, empty if none + */ + public static Stream selectAsStream(Evaluator evaluator, Element root) { + Validate.notNull(evaluator); + Validate.notNull(root); + return Collector.stream(evaluator, root); + } + /** * Find elements matching selector. * From da7bb52363590a17fe338491a91079fee285ea71 Mon Sep 17 00:00:00 2001 From: Isira Seneviratne Date: Fri, 29 Dec 2023 06:25:31 +0530 Subject: [PATCH 2/6] Avoid creating list in getElementById --- src/main/java/org/jsoup/nodes/Element.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java index c161967dad..83b04a21e7 100644 --- a/src/main/java/org/jsoup/nodes/Element.java +++ b/src/main/java/org/jsoup/nodes/Element.java @@ -1163,11 +1163,7 @@ public Elements getElementsByTag(String tagName) { public @Nullable Element getElementById(String id) { Validate.notEmpty(id); - Elements elements = Collector.collect(new Evaluator.Id(id), this); - if (elements.size() > 0) - return elements.get(0); - else - return null; + return selectAsStream(new Evaluator.Id(id)).findFirst().orElse(null); } /** From be174208304b5ee817863cae0fad57eaf6b46282 Mon Sep 17 00:00:00 2001 From: Isira Seneviratne Date: Sat, 30 Dec 2023 09:24:32 +0530 Subject: [PATCH 3/6] Address code review comments --- src/main/java/org/jsoup/nodes/Element.java | 18 +++++++++--------- src/main/java/org/jsoup/select/Selector.java | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java index 83b04a21e7..41ba2e9482 100644 --- a/src/main/java/org/jsoup/nodes/Element.java +++ b/src/main/java/org/jsoup/nodes/Element.java @@ -491,8 +491,8 @@ public Elements select(Evaluator evaluator) { *

<p>This method is generally more powerful to use than the DOM-type {@code getElementBy*} methods, because * multiple filters can be combined, e.g.:</p>

 * <ul>
- * <li>{@code el.select("a[href]")} - finds links ({@code a} tags with {@code href} attributes)
- * <li>{@code el.select("a[href*=example.com]")} - finds links pointing to example.com (loosely)
+ * <li>{@code el.selectStream("a[href]")} - finds links ({@code a} tags with {@code href} attributes)
+ * <li>{@code el.selectStream("a[href*=example.com]")} - finds links pointing to example.com (loosely)
 * </ul>
 *

<p>See the query syntax documentation in {@link org.jsoup.select.Selector}.</p>

*

<p>Also known as {@code querySelectorAll()} in the Web DOM.</p>

@@ -502,10 +502,10 @@ public Elements select(Evaluator evaluator) { * @see Selector selector query syntax * @see QueryParser#parse(String) * @throws Selector.SelectorParseException (unchecked) on an invalid CSS query. - * @since 1.17.2 + * @since 1.18.1 */ - public Stream selectAsStream(String cssQuery) { - return Selector.selectAsStream(cssQuery, this); + public Stream selectStream(String cssQuery) { + return Selector.selectStream(cssQuery, this); } /** @@ -514,10 +514,10 @@ public Stream selectAsStream(String cssQuery) { * repeatedly parsing the CSS query. * @param evaluator an element evaluator * @return a {@link Stream} containing elements that match the query (empty if none match) - * @since 1.17.2 + * @since 1.18.1 */ - public Stream selectAsStream(Evaluator evaluator) { - return Selector.selectAsStream(evaluator, this); + public Stream selectStream(Evaluator evaluator) { + return Selector.selectStream(evaluator, this); } /** @@ -1163,7 +1163,7 @@ public Elements getElementsByTag(String tagName) { public @Nullable Element getElementById(String id) { Validate.notEmpty(id); - return selectAsStream(new Evaluator.Id(id)).findFirst().orElse(null); + return selectStream(new Evaluator.Id(id)).findFirst().orElse(null); } /** diff --git a/src/main/java/org/jsoup/select/Selector.java b/src/main/java/org/jsoup/select/Selector.java index 6c5bf99140..c23c47d343 100644 --- a/src/main/java/org/jsoup/select/Selector.java +++ b/src/main/java/org/jsoup/select/Selector.java @@ -123,9 +123,9 @@ public static Elements select(Evaluator evaluator, Element root) { * @return matching elements, empty if none * @throws Selector.SelectorParseException (unchecked) on an invalid CSS query. 
*/ - public static Stream selectAsStream(String query, Element root) { + public static Stream selectStream(String query, Element root) { Validate.notEmpty(query); - return selectAsStream(QueryParser.parse(query), root); + return selectStream(QueryParser.parse(query), root); } /** @@ -135,7 +135,7 @@ public static Stream selectAsStream(String query, Element root) { * @param root root element to descend into * @return matching elements, empty if none */ - public static Stream selectAsStream(Evaluator evaluator, Element root) { + public static Stream selectStream(Evaluator evaluator, Element root) { Validate.notNull(evaluator); Validate.notNull(root); return Collector.stream(evaluator, root); From 5e873b70e186c937030d7a980f47673494292ece Mon Sep 17 00:00:00 2001 From: Isira Seneviratne Date: Sat, 30 Dec 2023 11:45:25 +0530 Subject: [PATCH 4/6] Add selectStream test --- src/test/java/org/jsoup/nodes/ElementTest.java | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/test/java/org/jsoup/nodes/ElementTest.java b/src/test/java/org/jsoup/nodes/ElementTest.java index 71a43b7898..82097398e4 100644 --- a/src/test/java/org/jsoup/nodes/ElementTest.java +++ b/src/test/java/org/jsoup/nodes/ElementTest.java @@ -2924,4 +2924,15 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) { assertEquals("

One

", p.outerHtml()); assertEquals("CLASS=\"YES\"", attr.html()); } + + @Test void testSelectStream() { + Document doc = Jsoup.parse("
<div>Hello world</div>
"); + Element div = doc.select("div").stream().findFirst().orElse(null); + + assertEquals("Hello world", div.text()); + + div = doc.selectStream("div").findFirst().orElse(null); + + assertEquals("Hello world", div.text()); + } } From ade3fc47893bceb1abefe976830394b472b7cbbe Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Sat, 14 Dec 2024 11:32:52 +1100 Subject: [PATCH 5/6] Updated Javadoc, changes, added an order test --- CHANGES.md | 10 ++- src/main/java/org/jsoup/nodes/Element.java | 44 ++++++------- src/main/java/org/jsoup/select/Collector.java | 7 +- src/main/java/org/jsoup/select/Selector.java | 65 ++++++++++--------- .../java/org/jsoup/select/SelectorTest.java | 15 +++++ 5 files changed, 81 insertions(+), 60 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 4c2eed2922..a650af40b9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,8 +5,14 @@ ### Changes * Updated the minimum Android API Level validation from 10 to **21**. As with previous jsoup versions, Android - developers need to enable core library desugaring. The minimum Java version remains Java - 8. [2173](https://github.com/jhy/jsoup/pull/2173) + developers need to enable core library desugaring. The minimum Java version remains Java 8. + [2173](https://github.com/jhy/jsoup/pull/2173) + +### Improvements + +* Added `Element#selectStream(String query)` and `Element#selectStream(Evaluator )` methods, that return a `Stream` of + matching elements. Elements are evaluated and returned as they are found, and the stream can be + terminated early. 
[2092](https://github.com/jhy/jsoup/pull/2092) ### Bug Fixes diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java index 6c96e99459..5643135194 100644 --- a/src/main/java/org/jsoup/nodes/Element.java +++ b/src/main/java/org/jsoup/nodes/Element.java @@ -487,35 +487,33 @@ public Elements select(Evaluator evaluator) { } /** - * Find elements that match the {@link Selector} CSS query, with this element as the starting context. Matched elements - * may include this element, or any of its children. - *

<p>This method is generally more powerful to use than the DOM-type {@code getElementBy*} methods, because - * multiple filters can be combined, e.g.:</p>

- * <ul>
- * <li>{@code el.selectStream("a[href]")} - finds links ({@code a} tags with {@code href} attributes)
- * <li>{@code el.selectStream("a[href*=example.com]")} - finds links pointing to example.com (loosely)
- * </ul>
- *

<p>See the query syntax documentation in {@link org.jsoup.select.Selector}.</p>

- *

<p>Also known as {@code querySelectorAll()} in the Web DOM.</p>

- * - * @param cssQuery a {@link Selector} CSS-like query - * @return a {@link Stream} containing elements that match the query (empty if none match) - * @see Selector selector query syntax - * @see QueryParser#parse(String) - * @throws Selector.SelectorParseException (unchecked) on an invalid CSS query. - * @since 1.18.1 + Selects elements that match the specified {@link Selector} CSS query, with this element as the + starting context, and returns them as a lazy Stream. Matched elements may include this element, or any of its + children. + <p>
+ Unlike {@link #select(String query)}, which returns a complete list of all matching elements, this method returns a + {@link Stream} that processes elements lazily as they are needed. The stream operates in a "pull" model — elements + are fetched from the root as the stream is traversed. You can use standard {@code Stream} operations such as + {@code filter}, {@code map}, or {@code findFirst} to process elements on demand. + </p>

+ + @param cssQuery a {@link Selector} CSS-like query + @return a {@link Stream} containing elements that match the query (empty if none match) + @throws Selector.SelectorParseException (unchecked) on an invalid CSS query. + @see Selector selector query syntax + @see QueryParser#parse(String) + @since 1.19.1 */ public Stream selectStream(String cssQuery) { return Selector.selectStream(cssQuery, this); } /** - * Find elements that match the supplied Evaluator. This has the same functionality as {@link #select(String)}, but - * may be useful if you are running the same query many times (on many documents) and want to save the overhead of - * repeatedly parsing the CSS query. - * @param evaluator an element evaluator - * @return a {@link Stream} containing elements that match the query (empty if none match) - * @since 1.18.1 + Find a Stream of elements that match the supplied Evaluator. + + @param evaluator an element Evaluator + @return a {@link Stream} containing elements that match the query (empty if none match) + @since 1.19.1 */ public Stream selectStream(Evaluator evaluator) { return Selector.selectStream(evaluator, this); diff --git a/src/main/java/org/jsoup/select/Collector.java b/src/main/java/org/jsoup/select/Collector.java index 6638f5ef6b..e4aefcd5bc 100644 --- a/src/main/java/org/jsoup/select/Collector.java +++ b/src/main/java/org/jsoup/select/Collector.java @@ -16,7 +16,7 @@ public class Collector { private Collector() {} /** - Build a list of elements, by visiting root and every descendant of root, and testing it against the evaluator. + Build a list of elements, by visiting the root and every descendant of root, and testing it against the Evaluator. 
@param eval Evaluator to test elements against @param root root of tree to descend @return list of matches; empty if none @@ -26,7 +26,7 @@ public static Elements collect(Evaluator eval, Element root) { } /** - * Obtain a stream of elements by visiting root and every descendant of root and testing it + * Obtain a Stream of elements by visiting the root and every descendant of root and testing it * against the evaluator. * @param evaluator Evaluator to test elements against * @param root root of tree to descend @@ -35,8 +35,7 @@ public static Elements collect(Evaluator eval, Element root) { public static Stream stream(Evaluator evaluator, Element root) { evaluator.reset(); - return root.stream() - .filter(evaluator.asPredicate(root)); + return root.stream().filter(evaluator.asPredicate(root)); } /** diff --git a/src/main/java/org/jsoup/select/Selector.java b/src/main/java/org/jsoup/select/Selector.java index 8c2319a5f8..c0c574fbc4 100644 --- a/src/main/java/org/jsoup/select/Selector.java +++ b/src/main/java/org/jsoup/select/Selector.java @@ -91,12 +91,12 @@ public class Selector { private Selector() {} /** - * Find elements matching selector. - * - * @param query CSS selector - * @param root root element to descend into - * @return matching elements, empty if none - * @throws Selector.SelectorParseException (unchecked) on an invalid CSS query. + Find Elements matching the CSS query. + + @param query CSS selector + @param root root element to descend into + @return matching elements, empty if none + @throws Selector.SelectorParseException (unchecked) on an invalid CSS query. */ public static Elements select(String query, Element root) { Validate.notEmpty(query); @@ -104,11 +104,11 @@ public static Elements select(String query, Element root) { } /** - * Find elements matching selector. - * - * @param evaluator CSS selector - * @param root root element to descend into - * @return matching elements, empty if none + Find Elements matching the Evaluator. 
+ + @param evaluator CSS Evaluator + @param root root (context) element to start from + @return matching elements, empty if none */ public static Elements select(Evaluator evaluator, Element root) { Validate.notNull(evaluator); @@ -117,12 +117,13 @@ public static Elements select(Evaluator evaluator, Element root) { } /** - * Find elements matching selector. - * - * @param query CSS selector - * @param root root element to descend into - * @return matching elements, empty if none - * @throws Selector.SelectorParseException (unchecked) on an invalid CSS query. + Finds a Stream of elements matching the CSS query. + + @param query CSS selector + @param root root element to descend into + @return a Stream of matching elements, empty if none + @throws Selector.SelectorParseException (unchecked) on an invalid CSS query. + @since 1.19.1 */ public static Stream selectStream(String query, Element root) { Validate.notEmpty(query); @@ -130,11 +131,12 @@ public static Stream selectStream(String query, Element root) { } /** - * Find elements matching selector. - * - * @param evaluator CSS selector - * @param root root element to descend into - * @return matching elements, empty if none + Finds a Stream of elements matching the evaluator. + + @param evaluator CSS Evaluator + @param root root element to descend into + @return a Stream of matching elements, empty if none + @since 1.19.1 */ public static Stream selectStream(Evaluator evaluator, Element root) { Validate.notNull(evaluator); @@ -143,11 +145,11 @@ public static Stream selectStream(Evaluator evaluator, Element root) { } /** - * Find elements matching selector. - * - * @param query CSS selector - * @param roots root elements to descend into - * @return matching elements, empty if none + Find elements matching the query. 
+ + @param query CSS selector + @param roots root elements to descend into + @return matching elements, empty if none */ public static Elements select(String query, Iterable roots) { Validate.notEmpty(query); @@ -186,10 +188,11 @@ static Elements filterOut(Collection elements, Collection outs } /** - * Find the first element that matches the query. - * @param cssQuery CSS selector - * @param root root element to descend into - * @return the matching element, or null if none. + Find the first Element that matches the query. + + @param cssQuery CSS selector + @param root root element to descend into + @return the matching element, or null if none. */ public static @Nullable Element selectFirst(String cssQuery, Element root) { Validate.notEmpty(cssQuery); diff --git a/src/test/java/org/jsoup/select/SelectorTest.java b/src/test/java/org/jsoup/select/SelectorTest.java index 0ae4048e3f..78d5bdb6be 100644 --- a/src/test/java/org/jsoup/select/SelectorTest.java +++ b/src/test/java/org/jsoup/select/SelectorTest.java @@ -8,7 +8,9 @@ import org.junit.jupiter.api.Test; import java.util.IdentityHashMap; +import java.util.List; import java.util.Locale; +import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.*; @@ -402,6 +404,19 @@ public void testByAttributeStarting(Locale locale) { assertEquals("span", divChilds.get(2).tagName()); } + @Test public void streamParentChildStar() { + String h = "

<div><p>Hello</p><p>there</p></div><div><span>Hi</span></div>
"; + Document doc = Jsoup.parse(h); + + List divChilds = doc.selectStream("div > *") + .collect(Collectors.toList()); + + assertEquals(3, divChilds.size()); + assertEquals("p", divChilds.get(0).tagName()); + assertEquals("p", divChilds.get(1).tagName()); + assertEquals("span", divChilds.get(2).tagName()); + } + @Test public void multiChildDescent() { String h = ""; Document doc = Jsoup.parse(h); From ead53559f2ba229daea5ad937147244c89e5bb0c Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Sat, 14 Dec 2024 11:40:26 +1100 Subject: [PATCH 6/6] Javadoc and method tweak --- src/main/java/org/jsoup/nodes/Element.java | 3 +-- src/main/java/org/jsoup/select/Collector.java | 12 ++++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java index 5643135194..1a1538edd6 100644 --- a/src/main/java/org/jsoup/nodes/Element.java +++ b/src/main/java/org/jsoup/nodes/Element.java @@ -1158,8 +1158,7 @@ public Elements getElementsByTag(String tagName) { */ public @Nullable Element getElementById(String id) { Validate.notEmpty(id); - - return selectStream(new Evaluator.Id(id)).findFirst().orElse(null); + return Collector.findFirst(new Evaluator.Id(id), this); } /** diff --git a/src/main/java/org/jsoup/select/Collector.java b/src/main/java/org/jsoup/select/Collector.java index e4aefcd5bc..4199401571 100644 --- a/src/main/java/org/jsoup/select/Collector.java +++ b/src/main/java/org/jsoup/select/Collector.java @@ -26,15 +26,15 @@ public static Elements collect(Evaluator eval, Element root) { } /** - * Obtain a Stream of elements by visiting the root and every descendant of root and testing it - * against the evaluator. - * @param evaluator Evaluator to test elements against - * @param root root of tree to descend - * @return A {@link Stream} of matches + Obtain a Stream of elements by visiting the root and every descendant of root and testing it against the evaluator. 
+ + @param evaluator Evaluator to test elements against + @param root root of tree to descend + @return A {@link Stream} of matches + @since 1.19.1 */ public static Stream stream(Evaluator evaluator, Element root) { evaluator.reset(); - return root.stream().filter(evaluator.asPredicate(root)); }