Merge pull request #362 from brianxiadong/feat-brianxiadong-huggingface

feat(document-readers): Add HuggingFace File System Document Reader / 添加 HuggingFace 文件系统文档阅读器 related issue:#283
alibaba · Jan 13, 2025 · 1209a8f · 1209a8f
2 parents dfe1767 + 0273e24
commit 1209a8f
Show file tree

Hide file tree

Showing 5 changed files with 428 additions and 0 deletions.
diff --git a/community/document-readers/huggingface-fs-document-reader/README.md b/community/document-readers/huggingface-fs-document-reader/README.md
@@ -0,0 +1,89 @@
+# HuggingFace File System Document Reader
+
+HuggingFace文件系统文档阅读器是一个专门用于读取和解析HuggingFace数据集文件的组件。
+
+## 功能特点
+
+- 支持读取JSON Lines格式文件
+- 支持GZIP压缩文件的自动解压
+- 自动跳过无效的JSON行
+- 提供文档元数据支持
+- 与Spring AI文档体系无缝集成
+
+## 快速开始
+
+### Maven依赖
+
+```xml
+<dependency>
+    <groupId>com.alibaba.cloud.ai</groupId>
+    <artifactId>huggingface-fs-document-reader</artifactId>
+    <version>${version}</version>
+</dependency>
+```
+
+### 使用示例
+
+```java
+// 创建文档阅读器
+HuggingFaceFSDocumentReader reader = new HuggingFaceFSDocumentReader("/path/to/your/file.jsonl");
+
+// 读取文档
+List<Document> documents = reader.get();
+
+// 处理文档
+for (Document doc : documents) {
+    // 获取文档内容
+    String content = doc.getContent();
+
+    // 获取源文件路径（元数据）
+    String source = (String) doc.getMetadata().get(HuggingFaceFSDocumentReader.SOURCE);
+
+    // 进行其他处理...
+}
+```
+
+### 支持的文件格式
+
+1. 普通JSONL文件：
+```json
+{"text": "文档内容1", "label": "标签1"}
+{"text": "文档内容2", "label": "标签2"}
+```
+
+2. GZIP压缩的JSONL文件：
+- 文件扩展名为`.gz`
+- 包含压缩的JSONL内容
+
+## 异常处理
+
+- 文件不存在或读取失败时，`get()` 会抛出 `RuntimeException`（其 cause 为底层的 `IOException`）
+- 无效的JSON行会被自动跳过
+- 提供了友好的错误信息
+
+## 最佳实践
+
+1. 文件命名：
+   - 普通文件使用`.jsonl`扩展名
+   - 压缩文件使用`.jsonl.gz`扩展名
+
+2. JSON格式：
+   - 每行一个完整的JSON对象
+   - 使用UTF-8编码
+   - 特殊字符（如换行符、双引号、反斜杠）必须按JSON规范转义，不能以原始形式出现在行内
+
+3. 性能考虑：
+   - 对于大文件，建议使用GZIP压缩
+   - 注意内存使用，避免一次加载过大的文件
+
+## 贡献指南
+
+欢迎提交Issue和Pull Request来帮助改进这个组件。在提交代码时，请确保：
+
+1. 添加适当的单元测试
+2. 遵循代码规范
+3. 更新相关文档
+
+## 许可证
+
+本项目采用 Apache License 2.0 许可证。详见 [LICENSE](LICENSE) 文件。 
diff --git a/community/document-readers/huggingface-fs-document-reader/pom.xml b/community/document-readers/huggingface-fs-document-reader/pom.xml
@@ -0,0 +1,117 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  ~ Copyright 2024-2025 the original author or authors.
+  ~
+  ~ Licensed under the Apache License, Version 2.0 (the "License");
+  ~ you may not use this file except in compliance with the License.
+  ~ You may obtain a copy of the License at
+  ~
+  ~ https://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>com.alibaba.cloud.ai</groupId>
+        <artifactId>spring-ai-alibaba</artifactId>
+        <version>${revision}</version>
+        <relativePath>../../../pom.xml</relativePath>
+    </parent>
+
+    <artifactId>huggingface-fs-document-reader</artifactId>
+    <name>huggingface-fs-document-reader</name>
+    <description>huggingface-fs reader for Spring AI Alibaba</description>
+    <packaging>jar</packaging>
+    <url>https://github.com/alibaba/spring-ai-alibaba</url>
+    <scm>
+        <url>https://github.com/alibaba/spring-ai-alibaba</url>
+        <connection>git://github.com/alibaba/spring-ai-alibaba.git</connection>
+        <developerConnection>git@github.com:alibaba/spring-ai-alibaba.git</developerConnection>
+    </scm>
+
+    <properties>
+        <maven.compiler.source>17</maven.compiler.source>
+        <maven.compiler.target>17</maven.compiler.target>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <maven-deploy-plugin.version>3.1.1</maven-deploy-plugin.version>
+    </properties>
+
+    <dependencies>
+
+        <dependency>
+            <groupId>com.alibaba.cloud.ai</groupId>
+            <artifactId>spring-ai-alibaba-core</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.alibaba.cloud.ai</groupId>
+            <artifactId>document-parser-tika</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+
+        <!-- test dependencies -->
+        <dependency>
+            <groupId>org.springframework.ai</groupId>
+            <artifactId>spring-ai-test</artifactId>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.springframework.boot</groupId>
+            <artifactId>spring-boot-starter-test</artifactId>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>io.projectreactor</groupId>
+            <artifactId>reactor-test</artifactId>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>io.micrometer</groupId>
+            <artifactId>micrometer-observation-test</artifactId>
+            <scope>test</scope>
+        </dependency>
+
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.springframework.boot</groupId>
+                <artifactId>spring-boot-maven-plugin</artifactId>
+                <version>${spring-boot.version}</version>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-deploy-plugin</artifactId>
+                <version>${maven-deploy-plugin.version}</version>
+                <configuration>
+                    <skip>true</skip>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+
+    <repositories>
+        <repository>
+            <id>spring-milestones</id>
+            <name>Spring Milestones</name>
+            <url>https://repo.spring.io/milestone</url>
+            <snapshots>
+                <enabled>false</enabled>
+            </snapshots>
+        </repository>
+    </repositories>
+
+</project>
diff --git a/...src/main/java/com/alibaba/cloud/ai/reader/huggingface/fs/HuggingFaceFSDocumentReader.java b/...src/main/java/com/alibaba/cloud/ai/reader/huggingface/fs/HuggingFaceFSDocumentReader.java
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2024-2025 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.alibaba.cloud.ai.reader.huggingface.fs;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.springframework.ai.document.Document;
+import org.springframework.ai.document.DocumentReader;
+import org.springframework.util.Assert;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * Hugging Face File System reader. Uses the Hugging Face Hub client library to read files
+ * from Hugging Face repositories.
+ *
+ * @author brianxiadong
+ **/
+public class HuggingFaceFSDocumentReader implements DocumentReader {
+
+	public static final String SOURCE = "source";
+
+	private final String resourcePath;
+
+	private final ObjectMapper objectMapper;
+
+	/**
+	 * Create a new HuggingFaceFSDocumentReader instance.
+	 * @param resourcePath the path to the resource
+	 */
+	public HuggingFaceFSDocumentReader(String resourcePath) {
+		Assert.notNull(resourcePath, "Resource path must not be null");
+		this.resourcePath = resourcePath;
+		this.objectMapper = new ObjectMapper();
+	}
+
+	@Override
+	public List<Document> get() {
+		try {
+			List<Map<String, Object>> jsonDicts = loadDicts();
+			List<Document> documents = new ArrayList<>();
+
+			for (Map<String, Object> dict : jsonDicts) {
+				Document document = new Document(dict.toString());
+				document.getMetadata().put(SOURCE, resourcePath);
+				documents.add(document);
+			}
+
+			return documents;
+		}
+		catch (IOException e) {
+			throw new RuntimeException("Failed to load documents from HuggingFace: " + e.getMessage(), e);
+		}
+	}
+
+	/**
+	 * Parse file and load as list of dictionaries
+	 *
+	 */
+	public List<Map<String, Object>> loadDicts() throws IOException {
+		Path path = Paths.get(resourcePath);
+		byte[] content = Files.readAllBytes(path);
+		String data;
+
+		// Handle gzip compressed files
+		if (resourcePath.endsWith(".gz")) {
+			try (InputStream inputStream = new ByteArrayInputStream(content);
+					GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream)) {
+				data = new String(gzipInputStream.readAllBytes());
+			}
+		}
+		else {
+			data = new String(content);
+		}
+
+		List<Map<String, Object>> jsonDicts = new ArrayList<>();
+		String[] lines = data.split("\n");
+
+		for (String line : lines) {
+			try {
+				if (!line.trim().isEmpty()) {
+					@SuppressWarnings("unchecked")
+					Map<String, Object> jsonDict = objectMapper.readValue(line, Map.class);
+					jsonDicts.add(jsonDict);
+				}
+			}
+			catch (Exception e) {
+				// Skip invalid JSON lines
+				continue;
+			}
+		}
+
+		return jsonDicts;
+	}
+
+	/**
+	 * Get the resource path.
+	 * @return the resource path
+	 */
+	public String getResourcePath() {
+		return this.resourcePath;
+	}
+
+}