-
Notifications
You must be signed in to change notification settings - Fork 185
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #381 from brianxiadong/feat-brianxiadong-email
feat: .eml email document reader (The eml file cannot have a license added.)
- Loading branch information
Showing
11 changed files
with
1,464 additions
and
0 deletions.
There are no files selected for viewing
71 changes: 71 additions & 0 deletions
71
community/document-readers/email-document-reader/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# Email Document Reader | ||
|
||
A Spring AI document reader implementation for parsing email files (EML format). | ||
|
||
## Features | ||
|
||
- Support for EML format email files | ||
- Extracts email metadata (subject, from, to, date, etc.) | ||
- Handles both plain text and HTML content | ||
- Supports various character encodings (UTF-8, etc.) | ||
- Handles Base64 and Quoted-Printable encoded content | ||
- Processes attachments using Apache Tika (supports PDF, DOC, etc.) | ||
- Compliant with Spring AI Document interface specification | ||
|
||
## Dependencies | ||
|
||
```xml | ||
<dependencies> | ||
<!-- Spring AI Alibaba Email Document Reader --> | ||
<dependency> | ||
<groupId>com.alibaba.cloud.ai</groupId> | ||
<artifactId>email-document-reader</artifactId> | ||
</dependency> | ||
</dependencies> | ||
``` | ||
|
||
## Usage | ||
|
||
```java | ||
// Create a reader instance with an EML file and enable attachment processing | ||
EmlEmailDocumentReader reader = new EmlEmailDocumentReader("path/to/email.eml", true); | ||
|
||
// Get documents (email body and attachments if any) | ||
List<Document> documents = reader.get(); | ||
|
||
// Access email metadata | ||
Document emailDoc = documents.get(0); | ||
Map<String, Object> metadata = emailDoc.getMetadata(); | ||
|
||
String subject = (String) metadata.get("subject"); | ||
String from = (String) metadata.get("from"); | ||
String date = (String) metadata.get("date"); | ||
|
||
// Access email content | ||
String content = emailDoc.getText(); | ||
|
||
// Access attachment content (if any) | ||
if (documents.size() > 1) { | ||
Document attachmentDoc = documents.get(1); | ||
String filename = (String) attachmentDoc.getMetadata().get("filename"); | ||
String attachmentContent = attachmentDoc.getText(); | ||
} | ||
``` | ||
|
||
## Metadata Fields | ||
|
||
The following metadata fields are available: | ||
|
||
- `subject`: Email subject line | ||
- `from`: Sender's email address | ||
- `from_name`: Sender's display name (if available) | ||
- `to`: Recipient's email address | ||
- `to_name`: Recipient's display name (if available) | ||
- `date`: Email date in RFC 822 format | ||
- `content_type`: MIME content type of the email | ||
- `filename`: Original filename (for attachments) | ||
- `size`: File size in bytes (for attachments) | ||
|
||
## License | ||
|
||
Licensed under the Apache License, Version 2.0. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<modelVersion>4.0.0</modelVersion> | ||
<parent> | ||
<groupId>com.alibaba.cloud.ai</groupId> | ||
<artifactId>spring-ai-alibaba</artifactId> | ||
<version>${revision}</version> | ||
<relativePath>../../../pom.xml</relativePath> | ||
</parent> | ||
|
||
<artifactId>email-document-reader</artifactId> | ||
<name>Spring AI Alibaba Email Document Reader</name> | ||
<description>Spring AI Alibaba Email Document Reader</description> | ||
|
||
<dependencies> | ||
<!-- Spring AI --> | ||
<dependency> | ||
<groupId>org.springframework.ai</groupId> | ||
<artifactId>spring-ai-core</artifactId> | ||
</dependency> | ||
|
||
<!-- JavaMail API --> | ||
<dependency> | ||
<groupId>javax.mail</groupId> | ||
<artifactId>javax.mail-api</artifactId> | ||
<version>1.6.2</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>com.sun.mail</groupId> | ||
<artifactId>javax.mail</artifactId> | ||
<version>1.6.2</version> | ||
</dependency> | ||
|
||
<!-- BsHtml Document Parser --> | ||
<dependency> | ||
<groupId>com.alibaba.cloud.ai</groupId> | ||
<artifactId>document-parser-bshtml</artifactId> | ||
<version>${project.version}</version> | ||
</dependency> | ||
|
||
<!-- Tika Document Parser --> | ||
<dependency> | ||
<groupId>com.alibaba.cloud.ai</groupId> | ||
<artifactId>document-parser-tika</artifactId> | ||
<version>${project.version}</version> | ||
</dependency> | ||
|
||
<!-- Apache Commons IO --> | ||
<dependency> | ||
<groupId>commons-io</groupId> | ||
<artifactId>commons-io</artifactId> | ||
<version>2.11.0</version> | ||
</dependency> | ||
|
||
<!-- Test dependencies --> | ||
<dependency> | ||
<groupId>org.junit.jupiter</groupId> | ||
<artifactId>junit-jupiter</artifactId> | ||
<scope>test</scope> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.mockito</groupId> | ||
<artifactId>mockito-core</artifactId> | ||
<scope>test</scope> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.assertj</groupId> | ||
<artifactId>assertj-core</artifactId> | ||
<scope>test</scope> | ||
</dependency> | ||
</dependencies> | ||
|
||
</project> |
198 changes: 198 additions & 0 deletions
198
...ail-document-reader/src/main/java/com/alibaba/cloud/ai/reader/email/eml/EmailElement.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,198 @@ | ||
/* | ||
* Copyright 2024-2025 the original author or authors. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* https://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package com.alibaba.cloud.ai.reader.email.eml; | ||
|
||
import java.time.ZonedDateTime; | ||
import java.util.HashMap; | ||
import java.util.Map; | ||
import java.util.regex.Pattern; | ||
import java.io.UnsupportedEncodingException; | ||
import javax.mail.internet.MimeUtility; | ||
|
||
/**
 * Base class for all email elements. Represents different parts of an email such as the
 * subject, sender, recipient, etc.
 *
 * <p>
 * Every element carries a text value and a mutable metadata map. The map is created
 * empty by the constructor.
 *
 * @author brianxiadong
 * @since 2024-01-19
 */
public abstract class EmailElement {

	/**
	 * The text content of the element.
	 */
	protected String text;

	/**
	 * Metadata associated with this element. Created empty by the constructor;
	 * {@link #setMetadata(Map)} never stores {@code null} here.
	 */
	protected Map<String, Object> metadata;

	/**
	 * Creates an element with the given text and an empty metadata map.
	 * @param text The text content (stored as-is, may be {@code null})
	 */
	protected EmailElement(String text) {
		this.text = text;
		this.metadata = new HashMap<>();
	}

	/**
	 * Get the text content.
	 * @return The text content
	 */
	public String getText() {
		return text;
	}

	/**
	 * Set the text content.
	 * @param text The text content
	 */
	public void setText(String text) {
		this.text = text;
	}

	/**
	 * Get the metadata. The returned map is the live internal map, not a copy.
	 * @return The metadata map
	 */
	public Map<String, Object> getMetadata() {
		return metadata;
	}

	/**
	 * Set the metadata. The supplied map is stored by reference. Passing {@code null}
	 * resets the metadata to a fresh empty map, so that {@link #getMetadata()} keeps
	 * returning a usable map as it does right after construction.
	 * @param metadata The metadata map, or {@code null} to clear
	 */
	public void setMetadata(Map<String, Object> metadata) {
		this.metadata = (metadata != null) ? metadata : new HashMap<>();
	}

}
|
||
/** | ||
* Represents the subject of an email | ||
*/ | ||
class Subject extends EmailElement { | ||
|
||
// Pattern for Q-encoded content | ||
private static final Pattern Q_ENCODED_PATTERN = Pattern.compile("=\\?[^?]+\\?[qQbB]\\?[^?]+\\?="); | ||
|
||
public Subject(String text) { | ||
super(decodeSubject(text)); | ||
} | ||
|
||
/** | ||
* Decode the subject text, handling Q-encoded and Base64 encoded content | ||
* @param text The subject text to decode | ||
* @return The decoded subject text | ||
*/ | ||
private static String decodeSubject(String text) { | ||
if (text == null || text.isEmpty()) { | ||
return text; | ||
} | ||
|
||
try { | ||
// 使用 JavaMail 的 MimeUtility 来解码 | ||
return MimeUtility.decodeText(text); | ||
} | ||
catch (UnsupportedEncodingException e) { | ||
// 如果解码失败,返回原始文本 | ||
return text; | ||
} | ||
} | ||
|
||
} | ||
|
||
/** | ||
* Represents a sender of an email | ||
*/ | ||
class Sender extends EmailElement { | ||
|
||
private final String name; | ||
|
||
public Sender(String name, String email) { | ||
super(email); | ||
this.name = name; | ||
} | ||
|
||
public String getName() { | ||
return name; | ||
} | ||
|
||
} | ||
|
||
/** | ||
* Represents a recipient of an email | ||
*/ | ||
class Recipient extends EmailElement { | ||
|
||
private final String name; | ||
|
||
public Recipient(String name, String email) { | ||
super(email); | ||
this.name = name; | ||
} | ||
|
||
public String getName() { | ||
return name; | ||
} | ||
|
||
} | ||
|
||
/** | ||
* Represents metadata information in an email | ||
*/ | ||
class MetaData extends EmailElement { | ||
|
||
private final String name; | ||
|
||
public MetaData(String name, String value) { | ||
super(value); | ||
this.name = name; | ||
} | ||
|
||
public String getName() { | ||
return name; | ||
} | ||
|
||
} | ||
|
||
/** | ||
* Represents received information in an email header | ||
*/ | ||
class ReceivedInfo extends EmailElement { | ||
|
||
private final String name; | ||
|
||
private final ZonedDateTime datestamp; | ||
|
||
public ReceivedInfo(String name, String text, ZonedDateTime datestamp) { | ||
super(text); | ||
this.name = name; | ||
this.datestamp = datestamp; | ||
} | ||
|
||
public String getName() { | ||
return name; | ||
} | ||
|
||
public ZonedDateTime getDatestamp() { | ||
return datestamp; | ||
} | ||
|
||
} |
Oops, something went wrong.