Skip to content

Commit

Permalink
Merge pull request #362 from brianxiadong/feat-brianxiadong-huggingface
Browse files Browse the repository at this point in the history
feat(document-readers): Add HuggingFace File System Document Reader / 添加 HuggingFace 文件系统文档阅读器 related issue:#283
  • Loading branch information
chickenlj authored Jan 13, 2025
2 parents dfe1767 + 0273e24 commit 1209a8f
Show file tree
Hide file tree
Showing 5 changed files with 428 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# HuggingFace File System Document Reader

HuggingFace文件系统文档阅读器是一个专门用于读取和解析HuggingFace数据集文件的组件。

## 功能特点

- 支持读取JSON Lines格式文件
- 支持GZIP压缩文件的自动解压
- 自动跳过无效的JSON行
- 提供文档元数据支持
- 与Spring AI文档体系无缝集成

## 快速开始

### Maven依赖

```xml
<dependency>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>huggingface-fs-document-reader</artifactId>
<version>${version}</version>
</dependency>
```

### 使用示例

```java
// 创建文档阅读器
HuggingFaceFSDocumentReader reader = new HuggingFaceFSDocumentReader("/path/to/your/file.jsonl");

// 读取文档
List<Document> documents = reader.get();

// 处理文档
for (Document doc : documents) {
// 获取文档内容
String content = doc.getContent();

// 获取源文件路径(元数据)
String source = (String) doc.getMetadata().get(HuggingFaceFSDocumentReader.SOURCE);

// 进行其他处理...
}
```

### 支持的文件格式

1. 普通JSONL文件:
```json
{"text": "文档内容1", "label": "标签1"}
{"text": "文档内容2", "label": "标签2"}
```

2. GZIP压缩的JSONL文件:
- 文件扩展名为`.gz`
- 包含压缩的JSONL内容

## 异常处理

- 文件不存在或读取失败时会抛出`RuntimeException`(其cause为底层的`IOException`)
- 无效的JSON行会被自动跳过
- 提供了友好的错误信息

## 最佳实践

1. 文件命名:
- 普通文件使用`.jsonl`扩展名
- 压缩文件使用`.jsonl.gz`扩展名

2. JSON格式:
- 每行一个完整的JSON对象
- 使用UTF-8编码
- 避免使用特殊字符

3. 性能考虑:
- 对于大文件,建议使用GZIP压缩
- 注意内存使用,避免一次加载过大的文件

## 贡献指南

欢迎提交Issue和Pull Request来帮助改进这个组件。在提交代码时,请确保:

1. 添加适当的单元测试
2. 遵循代码规范
3. 更新相关文档

## 许可证

本项目采用 Apache License 2.0 许可证。详见 [LICENSE](LICENSE) 文件。
117 changes: 117 additions & 0 deletions community/document-readers/huggingface-fs-document-reader/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright 2024-2025 the original author or authors.
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ https://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>spring-ai-alibaba</artifactId>
<version>${revision}</version>
<relativePath>../../../pom.xml</relativePath>
</parent>

<artifactId>huggingface-fs-document-reader</artifactId>
<name>huggingface-fs-document-reader</name>
<description>huggingface-fs reader for Spring AI Alibaba</description>
<packaging>jar</packaging>
<url>https://github.com/alibaba/spring-ai-alibaba</url>
<scm>
<url>https://github.com/alibaba/spring-ai-alibaba</url>
<connection>git://github.com/alibaba/spring-ai-alibaba.git</connection>
<developerConnection>git@github.com:alibaba/spring-ai-alibaba.git</developerConnection>
</scm>

<properties>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven-deploy-plugin.version>3.1.1</maven-deploy-plugin.version>
</properties>

<dependencies>

<dependency>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>spring-ai-alibaba-core</artifactId>
<version>${project.parent.version}</version>
</dependency>

<dependency>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>document-parser-tika</artifactId>
<version>${project.version}</version>
</dependency>

<!-- test dependencies -->
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-test</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>io.projectreactor</groupId>
<artifactId>reactor-test</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-observation-test</artifactId>
<scope>test</scope>
</dependency>

</dependencies>

<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<version>${spring-boot.version}</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>${maven-deploy-plugin.version}</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>

<repositories>
<repository>
<id>spring-milestones</id>
<name>Spring Milestones</name>
<url>https://repo.spring.io/milestone</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/*
* Copyright 2024-2025 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.alibaba.cloud.ai.reader.huggingface.fs;

import com.fasterxml.jackson.databind.ObjectMapper;
import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.util.Assert;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;

/**
 * Document reader for Hugging Face dataset files stored on the local file system.
 * <p>
 * The file at the configured path is read as JSON Lines (one JSON object per line).
 * Paths ending in {@code .gz} are GZIP-decompressed first. Every line that parses to a
 * JSON object becomes one {@link Document} whose content is the {@code toString()} form
 * of the parsed map and whose metadata carries the source path under {@link #SOURCE}.
 * Lines that cannot be parsed are skipped.
 *
 * @author brianxiadong
 **/
public class HuggingFaceFSDocumentReader implements DocumentReader {

    /** Metadata key under which the path of the source file is stored. */
    public static final String SOURCE = "source";

    private final String resourcePath;

    private final ObjectMapper objectMapper;

    /**
     * Create a new HuggingFaceFSDocumentReader instance.
     * @param resourcePath the file system path of the JSON Lines file to read; a path
     * ending in {@code .gz} is treated as GZIP-compressed
     * @throws IllegalArgumentException if {@code resourcePath} is {@code null}
     */
    public HuggingFaceFSDocumentReader(String resourcePath) {
        Assert.notNull(resourcePath, "Resource path must not be null");
        this.resourcePath = resourcePath;
        this.objectMapper = new ObjectMapper();
    }

    /**
     * Read the configured file and convert each parsed JSON object into a document.
     * @return one document per successfully parsed line, in file order
     * @throws RuntimeException wrapping the underlying {@link IOException} if the file
     * cannot be read or decompressed
     */
    @Override
    public List<Document> get() {
        try {
            List<Map<String, Object>> jsonDicts = loadDicts();
            List<Document> documents = new ArrayList<>(jsonDicts.size());

            for (Map<String, Object> dict : jsonDicts) {
                Document document = new Document(dict.toString());
                document.getMetadata().put(SOURCE, resourcePath);
                documents.add(document);
            }

            return documents;
        }
        catch (IOException e) {
            throw new RuntimeException("Failed to load documents from HuggingFace: " + e.getMessage(), e);
        }
    }

    /**
     * Parse the file and load it as a list of maps, one per valid JSON Lines entry.
     * <p>
     * Content is always decoded as UTF-8. Blank lines, lines that are not valid JSON,
     * lines whose top-level value is not a JSON object, and lines holding the JSON
     * literal {@code null} are skipped.
     * @return the parsed JSON objects in file order; never contains {@code null}
     * @throws IOException if the file cannot be read or is not valid GZIP data when the
     * path ends in {@code .gz}
     */
    public List<Map<String, Object>> loadDicts() throws IOException {
        String data = readContent(Paths.get(resourcePath));

        List<Map<String, Object>> jsonDicts = new ArrayList<>();
        // Split on '\n' only: a trailing '\r' from CRLF files is plain JSON whitespace.
        for (String line : data.split("\n")) {
            if (line.trim().isEmpty()) {
                continue;
            }
            try {
                @SuppressWarnings("unchecked")
                Map<String, Object> jsonDict = objectMapper.readValue(line, Map.class);
                // A bare JSON "null" is expected to deserialize to a null map; skipping
                // it keeps get() from failing on dict.toString().
                if (jsonDict != null) {
                    jsonDicts.add(jsonDict);
                }
            }
            catch (IOException ignored) {
                // Deliberate best-effort: invalid JSON lines are skipped, not fatal.
            }
        }

        return jsonDicts;
    }

    /**
     * Read the whole file as UTF-8 text, decompressing it when the path ends in ".gz".
     */
    private String readContent(Path path) throws IOException {
        if (resourcePath.endsWith(".gz")) {
            // Both streams are declared as resources so the file handle is released
            // even when the GZIP header check in the constructor fails.
            try (InputStream fileStream = Files.newInputStream(path);
                    GZIPInputStream gzipStream = new GZIPInputStream(fileStream)) {
                return new String(gzipStream.readAllBytes(), StandardCharsets.UTF_8);
            }
        }
        // Explicit charset: the platform default is not guaranteed to be UTF-8.
        return new String(Files.readAllBytes(path), StandardCharsets.UTF_8);
    }

    /**
     * Get the resource path.
     * @return the resource path
     */
    public String getResourcePath() {
        return this.resourcePath;
    }

}
Loading

0 comments on commit 1209a8f

Please sign in to comment.