Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add YoutubeDocumentReader and BilibiliDocumentReader #380

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions community/document-readers/bilibili-document-reader/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright 2024-2025 the original author or authors.
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ https://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>spring-ai-alibaba</artifactId>
<version>${revision}</version>
<relativePath>../../../pom.xml</relativePath>
</parent>

<artifactId>bilibili-document-reader</artifactId>
<name>bilibili-document-reader</name>
<description>bilibili reader for Spring AI Alibaba</description>
<packaging>jar</packaging>
<url>https://github.com/alibaba/spring-ai-alibaba</url>
<scm>
<url>https://github.com/alibaba/spring-ai-alibaba</url>
<connection>git://github.com/alibaba/spring-ai-alibaba.git</connection>
<developerConnection>[email protected]:alibaba/spring-ai-alibaba.git</developerConnection>
</scm>

<properties>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven-deploy-plugin.version>3.1.1</maven-deploy-plugin.version>
</properties>

<dependencies>

<dependency>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>spring-ai-alibaba-core</artifactId>
<version>${project.parent.version}</version>
</dependency>

<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>

</dependencies>

<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<version>${spring-boot.version}</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>${maven-deploy-plugin.version}</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>

<repositories>
<repository>
<id>spring-milestones</id>
<name>Spring Milestones</name>
<url>https://repo.spring.io/milestone</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/*
* Copyright 2024-2025 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.alibaba.cloud.ai.reader.bilibili;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;
import org.springframework.util.Assert;
import org.springframework.web.reactive.function.client.WebClient;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
* A class for reading and parsing video information and subtitles from Bilibili.
* Implements the DocumentReader interface to provide methods for obtaining document
* content.
*/
public class BilibiliDocumentReader implements DocumentReader {

private static final Logger logger = LoggerFactory.getLogger(BilibiliDocumentReader.class);

private static final String API_BASE_URL = "https://api.bilibili.com/x/web-interface/view?bvid=";

private final String resourcePath;

private final ObjectMapper objectMapper;

private static final WebClient WEB_CLIENT = WebClient.builder()
.defaultHeader(HttpHeaders.ACCEPT, MediaType.APPLICATION_JSON_VALUE)
.codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(5 * 1024 * 1024))
.build();

public BilibiliDocumentReader(String resourcePath) {
Assert.hasText(resourcePath, "Query string must not be empty");
this.resourcePath = resourcePath;
this.objectMapper = new ObjectMapper();
}

@Override
public List<Document> get() {
List<Document> documents = new ArrayList<>();
try {
String bvid = extractBvid(resourcePath);
String videoInfoResponse = fetchVideoInfo(bvid);
JsonNode videoData = parseJson(videoInfoResponse).path("data");
String title = videoData.path("title").asText();
String description = videoData.path("desc").asText();
Document infoDoc = new Document("Video information", Map.of("title", title, "description", description));
documents.add(infoDoc);
String documentContent = fetchAndProcessSubtitles(videoData, title, description);
documents.add(new Document(documentContent));
}
catch (IllegalArgumentException e) {
logger.error("Invalid input: {}", e.getMessage());
documents.add(new Document("Error: Invalid input"));
}
catch (IOException e) {
logger.error("Error parsing JSON: {}", e.getMessage(), e);
documents.add(new Document("Error parsing JSON: " + e.getMessage()));
}
catch (Exception e) {
logger.error("Unexpected error: {}", e.getMessage(), e);
documents.add(new Document("Unexpected error: " + e.getMessage()));
}
return documents;
}

private String extractBvid(String resourcePath) {
return resourcePath.replaceAll(".*(BV\\w+).*", "$1");
}

private String fetchVideoInfo(String bvid) {
return WEB_CLIENT.get().uri(API_BASE_URL + bvid).retrieve().bodyToMono(String.class).block();
}

private JsonNode parseJson(String jsonResponse) throws IOException {
return objectMapper.readTree(jsonResponse);
}

private String fetchAndProcessSubtitles(JsonNode videoData, String title, String description) throws IOException {
JsonNode subtitleList = videoData.path("subtitle").path("list");
if (subtitleList.isArray() && subtitleList.size() > 0) {
String subtitleUrl = subtitleList.get(0).path("subtitle_url").asText();
String subtitleResponse = WEB_CLIENT.get().uri(subtitleUrl).retrieve().bodyToMono(String.class).block();

JsonNode subtitleJson = parseJson(subtitleResponse);
StringBuilder rawTranscript = new StringBuilder();
subtitleJson.path("body").forEach(node -> rawTranscript.append(node.path("content").asText()).append(" "));

return String.format("Video Title: %s, Description: %s\nTranscript: %s", title, description,
rawTranscript.toString().trim());
}
else {
return String.format("No subtitles found for video: %s. Returning an empty transcript.", resourcePath);
}
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Copyright 2024-2025 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.alibaba.cloud.ai.reader.bilibili;

import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.document.Document;

import java.util.List;

/**
* @Author: XiaoYunTao
* @Date: 2025/1/18
*/
public class BilibiliDocumentReaderTest {

private static final Logger logger = LoggerFactory.getLogger(BilibiliDocumentReader.class);

@Test
void bilibiliDocumentReaderTest() {
BilibiliDocumentReader bilibiliDocumentReader = new BilibiliDocumentReader(
"https://www.bilibili.com/video/BV1KMwgeKECx/?t=7&vd_source=3069f51b168ac07a9e3c4ba94ae26af5");
List<Document> documents = bilibiliDocumentReader.get();
logger.info("documents: {}", documents);
}

}
105 changes: 105 additions & 0 deletions community/document-readers/youtube-document-reader/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright 2024-2025 the original author or authors.
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ https://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>spring-ai-alibaba</artifactId>
<version>${revision}</version>
<relativePath>../../../pom.xml</relativePath>
</parent>

<artifactId>youtube-document-reader</artifactId>
<name>youtube-document-reader</name>
<description>youtube reader for Spring AI Alibaba</description>
<packaging>jar</packaging>
<url>https://github.com/alibaba/spring-ai-alibaba</url>
<scm>
<url>https://github.com/alibaba/spring-ai-alibaba</url>
<connection>git://github.com/alibaba/spring-ai-alibaba.git</connection>
<developerConnection>[email protected]:alibaba/spring-ai-alibaba.git</developerConnection>
</scm>

<properties>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven-deploy-plugin.version>3.1.1</maven-deploy-plugin.version>
<jsoup.version>1.18.1</jsoup.version>
</properties>

<dependencies>

<dependency>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>spring-ai-alibaba-core</artifactId>
<version>${project.parent.version}</version>
</dependency>

<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>${jsoup.version}</version>
</dependency>

<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.12.0</version>
</dependency>

</dependencies>

<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<version>${spring-boot.version}</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>${maven-deploy-plugin.version}</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>

<repositories>
<repository>
<id>spring-milestones</id>
<name>Spring Milestones</name>
<url>https://repo.spring.io/milestone</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>

</project>
Loading
Loading