From 8738d95b2613b7f5a338b188ea68446b33594fe5 Mon Sep 17 00:00:00 2001 From: Xia Dong Date: Thu, 16 Jan 2025 17:25:54 +0800 Subject: [PATCH] feat(document-readers): add mbox implements --- .../ai/reader/mbox/MboxDocumentReader.java | 72 +++++++++------- .../reader/mbox/MboxDocumentReaderTest.java | 86 ++++++++++--------- 2 files changed, 86 insertions(+), 72 deletions(-) diff --git a/community/document-readers/mbox-document-reader/src/main/java/com/alibaba/cloud/ai/reader/mbox/MboxDocumentReader.java b/community/document-readers/mbox-document-reader/src/main/java/com/alibaba/cloud/ai/reader/mbox/MboxDocumentReader.java index 6ff9b50c..4d61cdf2 100644 --- a/community/document-readers/mbox-document-reader/src/main/java/com/alibaba/cloud/ai/reader/mbox/MboxDocumentReader.java +++ b/community/document-readers/mbox-document-reader/src/main/java/com/alibaba/cloud/ai/reader/mbox/MboxDocumentReader.java @@ -48,13 +48,19 @@ public class MboxDocumentReader implements DocumentReader { public static final String DEFAULT_MESSAGE_FORMAT = "Date: %s\nFrom: %s\nTo: %s\nSubject: %s\nContent: %s"; private static final Pattern FROM_LINE_PATTERN = Pattern.compile("^From .*\\d{4}$"); + private static final Pattern HEADER_PATTERN = Pattern.compile("^([^:]+):\\s*(.*)$"); + private static final Pattern BOUNDARY_PATTERN = Pattern.compile("boundary=\"?([^\"]+)\"?"); private final File mboxFile; + private final int maxCount; + private final String messageFormat; + private final BsHtmlDocumentParser htmlParser; + private final SimpleDateFormat dateFormat; /** @@ -85,7 +91,7 @@ public MboxDocumentReader(File mboxFile, int maxCount, String messageFormat) { Assert.notNull(mboxFile, "Mbox file must not be null"); Assert.isTrue(mboxFile.exists(), "Mbox file does not exist: " + mboxFile.getAbsolutePath()); Assert.isTrue(mboxFile.isFile(), "Mbox path is not a file: " + mboxFile.getAbsolutePath()); - + this.mboxFile = mboxFile; this.maxCount = maxCount; this.messageFormat = messageFormat; @@ -112,7 +118,7 @@ private List readMboxFile() throws IOException { try (LineIterator it = FileUtils.lineIterator(mboxFile, StandardCharsets.UTF_8.name())) { while (it.hasNext()) { String line = it.nextLine(); - + // Check if this is a new message if (FROM_LINE_PATTERN.matcher(line).matches()) { // Process previous message if exists @@ -121,7 +127,7 @@ private List readMboxFile() throws IOException { if (doc != null) { documents.add(doc); count++; - + if (maxCount > 0 && count >= maxCount) { break; } @@ -137,7 +143,7 @@ private List readMboxFile() throws IOException { currentMessage.append(line).append("\n"); } } - + // Process the last message if (!currentMessage.isEmpty()) { Document doc = parseMessage(currentMessage.toString()); @@ -155,16 +161,16 @@ private Document parseMessage(String messageContent) { Map headers = new HashMap<>(); StringBuilder content = new StringBuilder(); String[] lines = messageContent.split("\n"); - + boolean inHeaders = true; String boundary = null; boolean inHtmlPart = false; boolean skipCurrentPart = false; StringBuilder currentPart = new StringBuilder(); - + for (int i = 0; i < lines.length; i++) { String line = lines[i]; - + if (inHeaders) { if (line.trim().isEmpty()) { inHeaders = false; @@ -178,7 +184,7 @@ private Document parseMessage(String messageContent) { } continue; } - + Matcher m = HEADER_PATTERN.matcher(line); if (m.matches()) { String name = m.group(1).trim(); @@ -187,7 +193,7 @@ private Document parseMessage(String messageContent) { } continue; } - + // Process message body if (boundary != null) { if (line.contains("--" + boundary)) { @@ -199,7 +205,8 @@ private Document parseMessage(String messageContent) { if (!parsedHtml.isEmpty()) { content = new StringBuilder(parsedHtml); } - } else if (content.isEmpty() && !skipCurrentPart) { + } + else if (content.isEmpty() && !skipCurrentPart) { content = currentPart; } } @@ -208,23 +215,25 @@ private Document parseMessage(String messageContent) { skipCurrentPart = false; continue; } - + // Check content type of the part if (line.startsWith("Content-Type:")) { if (line.contains("text/html")) { inHtmlPart = true; skipCurrentPart = false; - } else if (!line.contains("text/plain")) { + } + else if (!line.contains("text/plain")) { // Skip non-text parts skipCurrentPart = true; } continue; } - + if (!skipCurrentPart) { currentPart.append(line).append("\n"); } - } else { + } + else { // For non-multipart messages String contentType = headers.get("Content-Type"); if (contentType != null && contentType.contains("text/html")) { @@ -237,12 +246,13 @@ private Document parseMessage(String messageContent) { content = new StringBuilder(parsedHtml); } } - } else { + } + else { content.append(line).append("\n"); } } } - + // Extract metadata metadata.put("subject", headers.getOrDefault("Subject", "")); metadata.put("from", headers.getOrDefault("From", "")); @@ -252,32 +262,31 @@ private Document parseMessage(String messageContent) { if (dateStr != null) { metadata.put("date", dateFormat.parse(dateStr)); } - } catch (ParseException e) { + } + catch (ParseException e) { throw new RuntimeException("Failed to parse date: " + e.getMessage(), e); } - + // Check if content is empty String contentStr = content.toString().trim(); if (contentStr.isEmpty()) { - throw new RuntimeException("Empty content found for message: " + headers.getOrDefault("Message-ID", "unknown")); + throw new RuntimeException( + "Empty content found for message: " + headers.getOrDefault("Message-ID", "unknown")); } - + // Format the content - String formattedContent = String.format(messageFormat, - metadata.getOrDefault("date", ""), - metadata.get("from"), - metadata.get("to"), - metadata.get("subject"), - contentStr); - + String formattedContent = String.format(messageFormat, metadata.getOrDefault("date", ""), metadata.get("from"), + metadata.get("to"), metadata.get("subject"), contentStr); + // Check if formatted content is empty if (formattedContent.trim().isEmpty()) { - throw new RuntimeException("Empty formatted content for message: " + headers.getOrDefault("Message-ID", "unknown")); + throw new RuntimeException( + "Empty formatted content for message: " + headers.getOrDefault("Message-ID", "unknown")); } - + // Use Message-ID as document ID String id = headers.getOrDefault("Message-ID", "msg-" + System.currentTimeMillis()); - + return new Document(id, formattedContent, metadata); } @@ -285,7 +294,7 @@ private String parseHtmlContent(String html) { if (html == null || html.trim().isEmpty()) { throw new RuntimeException("HTML content is null or empty"); } - + try (InputStream is = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8))) { List docs = htmlParser.parse(is); if (!docs.isEmpty()) { @@ -303,4 +312,5 @@ private String parseHtmlContent(String html) { throw new RuntimeException("Failed to parse HTML content: " + e.getMessage(), e); } } + } diff --git a/community/document-readers/mbox-document-reader/src/test/java/com/alibaba/cloud/ai/reader/mbox/MboxDocumentReaderTest.java b/community/document-readers/mbox-document-reader/src/test/java/com/alibaba/cloud/ai/reader/mbox/MboxDocumentReaderTest.java index 2c992ccb..b687960b 100644 --- a/community/document-readers/mbox-document-reader/src/test/java/com/alibaba/cloud/ai/reader/mbox/MboxDocumentReaderTest.java +++ b/community/document-readers/mbox-document-reader/src/test/java/com/alibaba/cloud/ai/reader/mbox/MboxDocumentReaderTest.java @@ -35,7 +35,7 @@ /** * Test cases for MboxDocumentReader functionality - * + * * @author brianxiadong */ public class MboxDocumentReaderTest { @@ -44,9 +44,13 @@ public class MboxDocumentReaderTest { Path tempDir; private static final String SAMPLE_MBOX = "sample.mbox"; + private static final String INVALID_MBOX = "invalid.mbox"; + private File sampleMboxFile; + private File invalidMboxFile; + private SimpleDateFormat dateFormat; @BeforeEach @@ -54,18 +58,14 @@ void setUp() throws IOException { // Create temporary test files sampleMboxFile = tempDir.resolve(SAMPLE_MBOX).toFile(); invalidMboxFile = tempDir.resolve(INVALID_MBOX).toFile(); - + // Copy test resource files from classpath to temporary directory - FileCopyUtils.copy( - new ClassPathResource(SAMPLE_MBOX).getInputStream(), - Files.newOutputStream(sampleMboxFile.toPath()) - ); - - FileCopyUtils.copy( - new ClassPathResource(INVALID_MBOX).getInputStream(), - Files.newOutputStream(invalidMboxFile.toPath()) - ); - + FileCopyUtils.copy(new ClassPathResource(SAMPLE_MBOX).getInputStream(), + Files.newOutputStream(sampleMboxFile.toPath())); + + FileCopyUtils.copy(new ClassPathResource(INVALID_MBOX).getInputStream(), + Files.newOutputStream(invalidMboxFile.toPath())); + // Initialize date formatter dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US); } @@ -76,24 +76,25 @@ void setUp() throws IOException { @Test void testPlainTextEmail() { // Create reader instance - MboxDocumentReader reader = new MboxDocumentReader(sampleMboxFile.getAbsolutePath(), 1, MboxDocumentReader.DEFAULT_MESSAGE_FORMAT); - + MboxDocumentReader reader = new MboxDocumentReader(sampleMboxFile.getAbsolutePath(), 1, + MboxDocumentReader.DEFAULT_MESSAGE_FORMAT); + // Get document list List documents = reader.get(); - + // Verify only one email was read assertEquals(1, documents.size(), "Should only read one email"); - + // Get the first email Document doc = documents.get(0); Map metadata = doc.getMetadata(); - + // Verify metadata assertEquals("Plain Text Email", metadata.get("subject")); assertEquals("Test Sender ", metadata.get("from")); assertEquals("recipient@example.com", metadata.get("to")); assertEquals("", doc.getId()); - + // Verify plain text message content String content = doc.getContent(); assertTrue(content.contains("This is a plain text email message")); @@ -106,32 +107,35 @@ void testPlainTextEmail() { @Test void testHtmlEmail() { // Create reader instance and set to read first two emails - MboxDocumentReader reader = new MboxDocumentReader(sampleMboxFile.getAbsolutePath(), 2, MboxDocumentReader.DEFAULT_MESSAGE_FORMAT); + MboxDocumentReader reader = new MboxDocumentReader(sampleMboxFile.getAbsolutePath(), 2, + MboxDocumentReader.DEFAULT_MESSAGE_FORMAT); List documents = reader.get(); - + // Verify two emails were read assertEquals(2, documents.size(), "Should read two emails"); - + // Get the second HTML email Document doc = documents.get(1); Map metadata = doc.getMetadata(); - + // Verify metadata assertEquals("HTML Email", metadata.get("subject")); assertEquals("Test Sender ", metadata.get("from")); assertEquals("recipient@example.com", metadata.get("to")); assertEquals("", doc.getId()); - + // Verify HTML content was correctly parsed to text String content = doc.getContent(); - + // Verify heading was correctly extracted assertTrue(content.contains("HTML Email Test"), "Should contain the h1 heading text"); - + // Verify paragraph content was correctly extracted - assertTrue(content.contains("This is a HTML formatted email message"), "Should contain the first paragraph text"); - assertTrue(content.contains("It contains styled text and multiple paragraphs"), "Should contain the second paragraph text"); - + assertTrue(content.contains("This is a HTML formatted email message"), + "Should contain the first paragraph text"); + assertTrue(content.contains("It contains styled text and multiple paragraphs"), + "Should contain the second paragraph text"); + // Verify HTML tags were correctly removed assertFalse(content.contains(""), "Should not contain html tag"); assertFalse(content.contains(""), "Should not contain head tag"); @@ -140,14 +144,12 @@ void testHtmlEmail() { assertFalse(content.contains("

"), "Should not contain p tag"); assertFalse(content.contains(""), "Should not contain b tag"); assertFalse(content.contains(""), "Should not contain i tag"); - + // Verify formatted content structure - String expectedFormat = String.format(MboxDocumentReader.DEFAULT_MESSAGE_FORMAT, - metadata.get("date"), - metadata.get("from"), - metadata.get("to"), - metadata.get("subject"), - "HTML Email Test This is a HTML formatted email message It contains styled text and multiple paragraphs".trim()); + String expectedFormat = String.format(MboxDocumentReader.DEFAULT_MESSAGE_FORMAT, metadata.get("date"), + metadata.get("from"), metadata.get("to"), metadata.get("subject"), + "HTML Email Test This is a HTML formatted email message It contains styled text and multiple paragraphs" + .trim()); } /** @@ -156,7 +158,8 @@ void testHtmlEmail() { @Test void testMultipartEmail() { // Create reader instance and set to read third email - MboxDocumentReader reader = new MboxDocumentReader(sampleMboxFile.getAbsolutePath(), 3, MboxDocumentReader.DEFAULT_MESSAGE_FORMAT); + MboxDocumentReader reader = new MboxDocumentReader(sampleMboxFile.getAbsolutePath(), 3, + MboxDocumentReader.DEFAULT_MESSAGE_FORMAT); List documents = reader.get(); // Get multipart email @@ -182,13 +185,13 @@ void testMultipartEmail() { void testReadAllEmails() { // Create reader instance with no limit MboxDocumentReader reader = new MboxDocumentReader(sampleMboxFile.getAbsolutePath()); - + // Get all emails List documents = reader.get(); - + // Verify total count assertEquals(3, documents.size(), "Should read all four emails"); - + // Verify email IDs in order assertEquals("", documents.get(0).getId()); assertEquals("", documents.get(1).getId()); @@ -203,11 +206,11 @@ void testCustomMessageFormat() { // Create reader with custom format String customFormat = "Email Details:\nSubject: %4$s\nSender: %2$s\nReceiver: %3$s\nDate: %1$s\n\nMessage:\n%5$s"; MboxDocumentReader reader = new MboxDocumentReader(sampleMboxFile.getAbsolutePath(), 1, customFormat); - + // Read first email Document doc = reader.get().get(0); String content = doc.getContent(); - + // Verify custom format assertTrue(content.startsWith("Email Details:")); assertTrue(content.contains("Subject: Plain Text Email")); @@ -239,4 +242,5 @@ void testNonExistentFile() { new MboxDocumentReader("non_existent.mbox"); }); } + }