Homework pdftranslator #95

Open · wants to merge 7 commits into main
13 changes: 12 additions & 1 deletion .gitignore
@@ -1,3 +1,6 @@
# Ignore files in the tests directory
openai-translator/tests/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -166,4 +169,12 @@ openai_api/data/fine_food_reviews_with_embeddings_1k_2146.csv
*.bin

langchain/openai-translator/flagged/*
langchain/openai-translator/flask_temps/*
langchain/openai-translator/flask_temps/*

# idea
.idea/

uploads/

#ipynb
langchain/jupyter/*
302 changes: 301 additions & 1 deletion langchain/sales_chatbot/real_estate_sales_data.txt

Large diffs are not rendered by default.

16 changes: 9 additions & 7 deletions langchain/sales_chatbot/sales_chatbot.py
@@ -1,19 +1,21 @@
import os

import gradio as gr

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain_community.chat_models import ChatOpenAI


def initialize_sales_bot(vector_store_dir: str="real_estates_sale"):
db = FAISS.load_local(vector_store_dir, OpenAIEmbeddings())
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
db = FAISS.load_local(vector_store_dir, OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY")), allow_dangerous_deserialization=True)
llm = ChatOpenAI(model_name="gpt-4", temperature=0.5, openai_api_key=os.getenv("OPENAI_API_KEY"))

global SALES_BOT
SALES_BOT = RetrievalQA.from_chain_type(llm,
retriever=db.as_retriever(search_type="similarity_score_threshold",
search_kwargs={"score_threshold": 0.8}))
search_kwargs={"score_threshold": 0.6}))
# Return the retrieval results from the vector database
SALES_BOT.return_source_documents = True

@@ -40,7 +42,7 @@ def sales_chat(message, history):
def launch_gradio():
demo = gr.ChatInterface(
fn=sales_chat,
title="房产销售",
title="销售顾问(您可以提交关于房产、少儿编程培训相关的问题)",
# retry_btn=None,
# undo_btn=None,
chatbot=gr.Chatbot(height=600),
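
Note on the FAISS loading change: current langchain-community releases refuse to unpickle a local index unless `allow_dangerous_deserialization=True` is passed, which is why the loader gained that flag. A minimal load-and-retrieve sketch, assuming the `real_estates_sale` index built by `save_QA.py` exists on disk and `OPENAI_API_KEY` is set (the sample query is made up):

```python
import os

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Load the locally saved index; the flag states that the pickled index on
# disk is trusted (here it is the one produced by save_QA.py).
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
db = FAISS.load_local("real_estates_sale", embeddings,
                      allow_dangerous_deserialization=True)

# similarity_score_threshold only returns documents scoring >= 0.6, so an
# off-topic question can come back with zero source documents.
retriever = db.as_retriever(search_type="similarity_score_threshold",
                            search_kwargs={"score_threshold": 0.6})
docs = retriever.get_relevant_documents("这个小区的学区怎么样?")  # made-up query
print(len(docs))
```
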
22 changes: 22 additions & 0 deletions langchain/sales_chatbot/save_QA.py
@@ -0,0 +1,22 @@
import os

from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

with open("real_estate_sales_data.txt") as f:
real_estate_sales = f.read()

text_splitter = CharacterTextSplitter(
separator = r'\d+\.',
chunk_size = 100,
chunk_overlap = 0,
length_function = len,
is_separator_regex = True,
)

docs = text_splitter.create_documents([real_estate_sales])
db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY")))

db.save_local("real_estates_sale")
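
A quick sketch of how the regex separator splits the corpus, using a made-up two-entry sample in the same numbered style (the entries are illustrative, not taken from real_estate_sales_data.txt):

```python
from langchain.text_splitter import CharacterTextSplitter

# Hypothetical sample in the "1. ... 2. ..." numbering the splitter expects.
sample = (
    "1.[客户问题] 这个小区周边有学校吗?"
    "[销售回答] 有的,小区对面就是市重点小学,步行大约十分钟,初中和高中也都在三公里以内。"
    "2.[客户问题] 首付比例是多少?"
    "[销售回答] 首套房首付三成起,二套房首付四成,具体还要看您的贷款资质和银行政策。"
)

splitter = CharacterTextSplitter(
    separator=r"\d+\.",       # the leading "1.", "2.", ... markers act as separators
    chunk_size=100,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=True,
)
docs = splitter.create_documents([sample])
print(len(docs))              # 2: one Q&A pair per document, with the numbering stripped
```
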
6 changes: 4 additions & 2 deletions openai-translator/ai_translator/book/content.py
@@ -1,7 +1,8 @@
import pandas as pd
import re
from enum import Enum, auto
from PIL import Image as PILImage
from utils import LOG
from ai_translator.utils import LOG

class ContentType(Enum):
TEXT = auto()
@@ -48,7 +49,8 @@ def set_translation(self, translation, status):

LOG.debug(translation)
# Convert the string to a list of lists
table_data = [row.strip().split() for row in translation.strip().split('\n')]
translation = re.sub(r'\[|\]', '', translation)
table_data = [row.strip().split(',') for row in translation.strip().split('\n')]
LOG.debug(table_data)
# Create a DataFrame from the table_data
translated_df = pd.DataFrame(table_data[1:], columns=table_data[0])
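
For review, this is how the new comma-based parsing behaves on a hypothetical reply in the bracketed format the updated prompts request (the table values are invented):

```python
import re
import pandas as pd

# Hypothetical model reply following the bracketed table format from the prompt.
translation = "[Fruit, Color, Price (USD)]\n[Apple, Red, 1.20]\n[Banana, Yellow, 0.50]"

# Same steps as the updated set_translation: strip the brackets, then split
# each line on commas.
translation = re.sub(r'\[|\]', '', translation)
table_data = [row.strip().split(',') for row in translation.strip().split('\n')]
translated_df = pd.DataFrame(table_data[1:], columns=table_data[0])

# Result: a 2x3 DataFrame with columns ['Fruit', ' Color', ' Price (USD)'];
# note the leading space that ", " leaves in every cell after the first.
print(translated_df)
```

If those leading spaces matter downstream, an extra `.strip()` per cell would remove them.
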
51 changes: 51 additions & 0 deletions openai-translator/ai_translator/chain/translator_chain.py
@@ -0,0 +1,51 @@
import os
from ai_translator.utils import LOG

# Import the prompt templates that the chat model will use
from langchain.prompts.chat import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
AIMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain


class TranslatorChain:
def __init__(self, model_name: str = "gpt-3.5-turbo", verbose: bool = True):
system_template = (
"""你是一个翻译专家,擅长各国语言。 \n
请将我发送给你的文字内容,翻译成{target_language}。 \n
翻译结果以{lang_style}风格展示 \n
"""
# 注意:\n
# 1、保留原文间距和格式(空格,分隔符,换行符)\n
# 2、如果原文是表格,请按照下面的表格形式返回(文字内容仍然需要翻译):\n
# [Title1, Title2, Title3 ] \n
# [context1, context2, context3] \n

)
system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)

human_template = "{text}"
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt_template = ChatPromptTemplate.from_messages(
[system_message_prompt, human_message_prompt]
)

chat = ChatOpenAI(model_name=model_name, temperature=0, verbose=verbose,
openai_api_key=os.getenv("OPENAI_API_KEY"))
self.chain = LLMChain(llm=chat, prompt=chat_prompt_template)

def run(self, text: str, target_language: str, lang_style: str) -> (str, bool):
result = ""
try:
result = self.chain.run(text=text, target_language=target_language, lang_style=lang_style)
except Exception as e:
LOG.error(f"An error occurred: {e}")
return result, False

return result, True
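
A minimal usage sketch for the new chain; the import path assumes the `chain` package exposes the module as laid out in this PR, `OPENAI_API_KEY` must be set, and the text and style values are placeholders:

```python
from ai_translator.chain.translator_chain import TranslatorChain

# Build the chain once, then reuse it; run() returns (result, success).
chain = TranslatorChain(model_name="gpt-3.5-turbo", verbose=False)
result, success = chain.run(
    text="The quick brown fox jumps over the lazy dog.",
    target_language="中文",
    lang_style="正式",
)
print(success, result)
```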

65 changes: 51 additions & 14 deletions openai-translator/ai_translator/main.py
@@ -3,25 +3,62 @@

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from utils import ArgumentParser, ConfigLoader, LOG
from model import GLMModel, OpenAIModel
from translator import PDFTranslator
from flask import Flask, render_template, request, jsonify, flash

if __name__ == "__main__":
argument_parser = ArgumentParser()
args = argument_parser.parse_arguments()
config_loader = ConfigLoader(args.config)
app = Flask(__name__)
app.secret_key = b'_5#y2L"F4Q8z\n\xec]/'
app.config['UPLOAD_FOLDER'] = 'uploads'

ALLOWED_EXTENSIONS = {'pdf'}

# Function to check if file extension is allowed
def allowed_file(filename):
return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

# Define the home route, which renders the translation page
@app.route('/', methods=['GET'])
def home():
return render_template('index.html')

config = config_loader.load_config()
# Define the translation route
@app.route('/translate', methods=['POST'])
def translate():
# Check if POST request has file part
if 'file' not in request.files:
flash('No file part')
#return render_template('index.html')
return jsonify({"error": "No file part"})

model_name = args.openai_model if args.openai_model else config['OpenAIModel']['model']
api_key = args.openai_api_key if args.openai_api_key else config['OpenAIModel']['api_key']
model = OpenAIModel(model=model_name, api_key=api_key)
file = request.files['file']

# Check if file is uploaded
if file.filename == '':
flash('No selected file')
#return render_template('index.html')
return jsonify({"error": "No selected file"})

pdf_file_path = args.book if args.book else config['common']['book']
file_format = args.file_format if args.file_format else config['common']['file_format']
# Check if file type is allowed
if file and allowed_file(file.filename):
target_lang = request.form.get('target_lang')
target_format = request.form.get('target_format')
lang_style = request.form.get('lang_style')
file_path = os.path.join(app.config['UPLOAD_FOLDER'], file.filename)
file.save(file_path)

# Instantiate the PDFTranslator class and call translate_pdf()
translator = PDFTranslator(model)
translator.translate_pdf(pdf_file_path, file_format)
# model = OpenAIModel(model='gpt-3.5-turbo', api_key=os.getenv("OPENAI_API_KEY"))
pdf_file_path = file_path
file_format = target_format
# Instantiate the PDFTranslator class and call translate_pdf()
translator = PDFTranslator('gpt-3.5-turbo')
output_file_path = translator.translate_pdf(pdf_file_path=pdf_file_path, file_format=file_format, target_language=target_lang, lang_style=lang_style)

result = {
"message": "Translation successful",
"output_file_path": 'File saved as ' + os.path.dirname(os.path.abspath(__file__)) + '/' + output_file_path
}
return jsonify(result)

if __name__ == "__main__":
app.run(host='0.0.0.0', debug=True, port=5000)
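
For a quick smoke test of the new endpoint once the server is running, something along these lines should work; the form field names mirror what `translate()` reads, while the field values and the PDF path are only illustrative:

```python
import requests

# Hypothetical client for the Flask app started by main.py (port 5000).
with open("tests/test.pdf", "rb") as pdf:   # path is illustrative
    resp = requests.post(
        "http://localhost:5000/translate",
        files={"file": ("test.pdf", pdf, "application/pdf")},
        data={
            "target_lang": "中文",          # the accepted values for these fields
            "target_format": "markdown",    # are not defined in this diff
            "lang_style": "正式",
        },
        timeout=600,
    )
print(resp.json())  # {"message": "Translation successful", "output_file_path": ...} on success
```
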
10 changes: 7 additions & 3 deletions openai-translator/ai_translator/model/model.py
@@ -1,11 +1,15 @@
from book import ContentType
from ai_translator.book import ContentType


class Model:
def make_text_prompt(self, text: str, target_language: str) -> str:
return f"翻译为{target_language}:{text}"
return f"翻译为{target_language},保留原文间距(空格,分隔符,换行符):{text}"

def make_table_prompt(self, table: str, target_language: str) -> str:
return f"翻译为{target_language},保持间距(空格,分隔符),以表格形式返回:\n{table}"
return f"表格形式:" + \
"[Fruit, Color, Price (USD)]\n" + \
"[Apple, Red, 1.20]\n" + \
f"请将下列文字翻译为{target_language},保持间距(空格,分隔符),不同行要换行,并以上述表格形式返回:{table}"

def translate_prompt(self, content, target_language: str) -> str:
if content.content_type == ContentType.TEXT:
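
To see what the reworked table prompt actually sends to the model, a small check like this helps (the table string is made up; the import follows the absolute-import style used elsewhere in this PR):

```python
from ai_translator.model import Model

# Made-up table text; the real input comes from the PDF parser.
table = "Name Age City\nAlice 30 Boston\nBob 25 Austin"
prompt = Model().make_table_prompt(table, target_language="中文")
print(prompt)
# The prompt now opens with the bracketed [Fruit, Color, Price (USD)] example,
# matching the comma-separated reply format that content.py's parser expects.
```
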
5 changes: 3 additions & 2 deletions openai-translator/ai_translator/model/openai_model.py
@@ -4,11 +4,12 @@
import os
import openai

from model import Model
from utils import LOG
from ai_translator.model import Model
from ai_translator.utils import LOG
from openai import OpenAI

class OpenAIModel(Model):

def __init__(self, model: str, api_key: str):
self.model = model
self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))