Skip to content

Commit

Permalink
Merge pull request #381 from brianxiadong/feat-brianxiadong-email
Browse files Browse the repository at this point in the history
feat: .eml email document reader (The eml file cannot have a license added.)
  • Loading branch information
chickenlj authored Jan 20, 2025
2 parents c8691c4 + 55defbd commit 4c7d823
Show file tree
Hide file tree
Showing 11 changed files with 1,464 additions and 0 deletions.
71 changes: 71 additions & 0 deletions community/document-readers/email-document-reader/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Email Document Reader

A Spring AI document reader implementation for parsing email files (EML format).

## Features

- Support for EML format email files
- Extracts email metadata (subject, from, to, date, etc.)
- Handles both plain text and HTML content
- Supports various character encodings (UTF-8, etc.)
- Handles Base64 and Quoted-Printable encoded content
- Processes attachments using Apache Tika (supports PDF, DOC, etc.)
- Compliant with Spring AI Document interface specification

## Dependencies

```xml
<dependencies>
<!-- Spring AI Alibaba Email Document Reader -->
<dependency>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>email-document-reader</artifactId>
</dependency>
</dependencies>
```

## Usage

```java
// Create a reader instance with an EML file and enable attachment processing
EmlEmailDocumentReader reader = new EmlEmailDocumentReader("path/to/email.eml", true);

// Get documents (email body and attachments if any)
List<Document> documents = reader.get();

// Access email metadata
Document emailDoc = documents.get(0);
Map<String, Object> metadata = emailDoc.getMetadata();

String subject = (String) metadata.get("subject");
String from = (String) metadata.get("from");
String date = (String) metadata.get("date");

// Access email content
String content = emailDoc.getText();

// Access attachment content (if any)
if (documents.size() > 1) {
Document attachmentDoc = documents.get(1);
String filename = (String) attachmentDoc.getMetadata().get("filename");
String attachmentContent = attachmentDoc.getText();
}
```

## Metadata Fields

The following metadata fields are available:

- `subject`: Email subject line
- `from`: Sender's email address
- `from_name`: Sender's display name (if available)
- `to`: Recipient's email address
- `to_name`: Recipient's display name (if available)
- `date`: Email date in RFC 822 format
- `content_type`: MIME content type of the email
- `filename`: Original filename (for attachments)
- `size`: File size in bytes (for attachments)

## License

Licensed under the Apache License, Version 2.0.
75 changes: 75 additions & 0 deletions community/document-readers/email-document-reader/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Inherits from the spring-ai-alibaba parent POM three directories up -->
<parent>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>spring-ai-alibaba</artifactId>
<version>${revision}</version>
<relativePath>../../../pom.xml</relativePath>
</parent>

<artifactId>email-document-reader</artifactId>
<name>Spring AI Alibaba Email Document Reader</name>
<description>Spring AI Alibaba Email Document Reader</description>

<dependencies>
<!-- Spring AI core (no version declared here, so it must be managed by the parent POM or an imported BOM) -->
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-core</artifactId>
</dependency>

<!-- JavaMail API and implementation, both pinned to 1.6.2; the reader code imports javax.mail.internet.MimeUtility -->
<dependency>
<groupId>javax.mail</groupId>
<artifactId>javax.mail-api</artifactId>
<version>1.6.2</version>
</dependency>
<dependency>
<groupId>com.sun.mail</groupId>
<artifactId>javax.mail</artifactId>
<version>1.6.2</version>
</dependency>

<!-- BsHtml Document Parser (sibling module, same version as this project) -->
<dependency>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>document-parser-bshtml</artifactId>
<version>${project.version}</version>
</dependency>

<!-- Tika Document Parser (sibling module, same version as this project) -->
<dependency>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>document-parser-tika</artifactId>
<version>${project.version}</version>
</dependency>

<!-- Apache Commons IO (version pinned explicitly in this module) -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>

<!-- Test dependencies (test scope only; no versions declared here, so they must be managed elsewhere) -->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<scope>test</scope>
</dependency>
</dependencies>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
/*
* Copyright 2024-2025 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.alibaba.cloud.ai.reader.email.eml;

import java.time.ZonedDateTime;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import java.io.UnsupportedEncodingException;
import javax.mail.internet.MimeUtility;

/**
 * Common base type for the individual parts of an email, such as the subject, the
 * sender or a recipient. Every element carries a text value and a mutable map of
 * metadata entries.
 *
 * @author brianxiadong
 * @since 2024-01-19
 */
public abstract class EmailElement {

    /** Text value held by this element. */
    protected String text;

    /** Metadata entries attached to this element; starts out as an empty map. */
    protected Map<String, Object> metadata = new HashMap<>();

    /**
     * Creates an element that holds the given text and an empty metadata map.
     * @param text the text value to store; it is kept as-is without validation
     */
    protected EmailElement(String text) {
        this.text = text;
    }

    /**
     * Returns the text value of this element.
     * @return the stored text, exactly as last supplied
     */
    public String getText() {
        return this.text;
    }

    /**
     * Replaces the text value of this element.
     * @param text the new text value
     */
    public void setText(String text) {
        this.text = text;
    }

    /**
     * Returns the metadata map of this element. The map itself is returned, not a
     * copy, so changes made through it are visible on this element.
     * @return the metadata map currently held
     */
    public Map<String, Object> getMetadata() {
        return this.metadata;
    }

    /**
     * Replaces the metadata map of this element. The given reference is stored
     * directly, without copying or null checking.
     * @param metadata the map to use from now on
     */
    public void setMetadata(Map<String, Object> metadata) {
        this.metadata = metadata;
    }

}

/**
 * The subject line of an email. The raw header value is MIME-decoded once, at
 * construction time, and the decoded result becomes the element's text.
 */
class Subject extends EmailElement {

    // Matches one MIME encoded word of the form =?charset?Q|B?payload?= (case-insensitive
    // encoding letter). Not referenced anywhere else inside this class.
    private static final Pattern Q_ENCODED_PATTERN = Pattern.compile("=\\?[^?]+\\?[qQbB]\\?[^?]+\\?=");

    /**
     * Creates a subject from a raw header value.
     * @param text the raw subject text, possibly containing MIME encoded words; may be
     * {@code null} or empty, in which case it is stored unchanged
     */
    public Subject(String text) {
        super(decodeSubject(text));
    }

    /**
     * Decodes a subject value, delegating to JavaMail's {@link MimeUtility#decodeText}.
     * @param text the subject text to decode
     * @return the decoded text; the input itself when it is {@code null}, empty, or
     * names a charset that is not supported
     */
    private static String decodeSubject(String text) {
        boolean nothingToDecode = (text == null) || text.isEmpty();
        if (nothingToDecode) {
            return text;
        }

        String decoded;
        try {
            // Let JavaMail's MimeUtility do the decoding.
            decoded = MimeUtility.decodeText(text);
        }
        catch (UnsupportedEncodingException ex) {
            // Decoding failed, so fall back to the original text.
            decoded = text;
        }
        return decoded;
    }

}

/**
 * The sender of an email. The email address is stored as the element's text, and the
 * accompanying name is kept separately.
 */
class Sender extends EmailElement {

    /** Name supplied alongside the address; fixed at construction. */
    private final String displayName;

    /**
     * Creates a sender.
     * @param name the name associated with the sender
     * @param email the sender's email address, stored as this element's text
     */
    public Sender(String name, String email) {
        super(email);
        displayName = name;
    }

    /**
     * Returns the name given at construction.
     * @return the sender's name
     */
    public String getName() {
        return displayName;
    }

}

/**
 * A recipient of an email. The email address is stored as the element's text, and the
 * accompanying name is kept separately.
 */
class Recipient extends EmailElement {

    /** Name supplied alongside the address; fixed at construction. */
    private final String displayName;

    /**
     * Creates a recipient.
     * @param name the name associated with the recipient
     * @param email the recipient's email address, stored as this element's text
     */
    public Recipient(String name, String email) {
        super(email);
        displayName = name;
    }

    /**
     * Returns the name given at construction.
     * @return the recipient's name
     */
    public String getName() {
        return displayName;
    }

}

/**
 * A named piece of metadata information from an email. The value is stored as the
 * element's text, and the name is kept separately.
 */
class MetaData extends EmailElement {

    /** Name of this metadata entry; fixed at construction. */
    private final String entryName;

    /**
     * Creates a metadata entry.
     * @param name the name of the entry
     * @param value the value of the entry, stored as this element's text
     */
    public MetaData(String name, String value) {
        super(value);
        entryName = name;
    }

    /**
     * Returns the name given at construction.
     * @return the entry name
     */
    public String getName() {
        return entryName;
    }

}

/**
 * Received information from an email header. Besides the element's text it carries a
 * name and a date stamp, both fixed at construction.
 */
class ReceivedInfo extends EmailElement {

    /** Name attached to this piece of received information. */
    private final String label;

    /** Date stamp attached to this piece of received information. */
    private final ZonedDateTime timestamp;

    /**
     * Creates a received-information element.
     * @param name the name of this entry
     * @param text the text content, stored as this element's text
     * @param datestamp the date stamp to associate with this entry
     */
    public ReceivedInfo(String name, String text, ZonedDateTime datestamp) {
        super(text);
        label = name;
        timestamp = datestamp;
    }

    /**
     * Returns the name given at construction.
     * @return the entry name
     */
    public String getName() {
        return label;
    }

    /**
     * Returns the date stamp given at construction.
     * @return the date stamp
     */
    public ZonedDateTime getDatestamp() {
        return timestamp;
    }

}
Loading

0 comments on commit 4c7d823

Please sign in to comment.