| |
| """ |
| @author:XuMing(xuming624@qq.com) |
@description: ChatPDF — retrieval-augmented question answering over local PDF/DOCX/Markdown/TXT files.
| """ |
from typing import List, Optional

from similarities import Similarity
from textgen import ChatGlmModel, LlamaModel
|
|
# Prompt template for retrieval-augmented QA, filled via
# .format(context_str=..., query_str=...). Kept in Chinese because the target
# generation models and corpus are Chinese. It instructs the model to answer
# concisely and professionally strictly from the provided context, and to
# reply "cannot answer from the known information" when the context is
# insufficient, without fabricating content.
PROMPT_TEMPLATE = """\
基于以下已知信息,简洁和专业的来回答用户的问题。
如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息",不允许在答案中添加编造成分,答案请使用中文。

已知内容:
{context_str}

问题:
{query_str}
"""
|
|
|
|
class ChatPDF:
    """Retrieval-augmented question answering over a local document.

    The document (PDF/DOCX/Markdown/TXT) is split into sentence-like chunks
    and indexed with a sentence-similarity model; :meth:`query` retrieves the
    most similar chunks and asks a generative chat model to answer from them.
    """

    # Characters treated as sentence/clause terminators when re-joining
    # line-wrapped text extracted from PDF pages. frozenset gives O(1)
    # membership and deduplicates the '》' that appeared twice in the
    # original list (semantics unchanged).
    _SENTENCE_END_CHARS = frozenset(
        ['.', '!', '?', '。', '!', '?', '…', ';', ';', ':', ':', '”', '’', ')', '】', '》', '」',
         '』', '〕', '〉', '〗', '〞', '〟', '»', '"', "'", ')', ']', '}']
    )

    def __init__(
            self,
            sim_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
            gen_model_type: str = "chatglm",
            gen_model_name_or_path: str = "THUDM/chatglm-6b-int4",
            lora_model_name_or_path: Optional[str] = None,
    ):
        """Load the similarity (retrieval) model and the generation model.

        :param sim_model_name_or_path: sentence-embedding model used to index
            and retrieve document chunks.
        :param gen_model_type: generation backend, ``"chatglm"`` or ``"llama"``.
        :param gen_model_name_or_path: generation model name or local path.
        :param lora_model_name_or_path: optional LoRA adapter name/path.
        :raises ValueError: if ``gen_model_type`` is not supported.
        """
        self.sim_model = Similarity(model_name_or_path=sim_model_name_or_path)
        if gen_model_type == "chatglm":
            self.gen_model = ChatGlmModel(gen_model_type, gen_model_name_or_path, lora_name=lora_model_name_or_path)
        elif gen_model_type == "llama":
            self.gen_model = LlamaModel(gen_model_type, gen_model_name_or_path, lora_name=lora_model_name_or_path)
        else:
            raise ValueError('gen_model_type must be chatglm or llama.')
        # Chat history carried across query(use_history=True) calls.
        self.history = None
        # Path of the last loaded document; used to derive default index paths.
        self.pdf_path = None

    def load_pdf_file(self, pdf_path: str):
        """Load a document and add its text chunks to the similarity index.

        Despite the name, DOCX, Markdown and plain-text files are also
        accepted; the extractor is chosen by file extension, with TXT as the
        fallback.

        :param pdf_path: path to the document file.
        """
        if pdf_path.endswith('.pdf'):
            corpus = self.extract_text_from_pdf(pdf_path)
        elif pdf_path.endswith('.docx'):
            corpus = self.extract_text_from_docx(pdf_path)
        elif pdf_path.endswith('.md'):
            corpus = self.extract_text_from_markdown(pdf_path)
        else:
            corpus = self.extract_text_from_txt(pdf_path)
        self.sim_model.add_corpus(corpus)
        self.pdf_path = pdf_path

    @staticmethod
    def extract_text_from_pdf(file_path: str) -> List[str]:
        """Extract text content from a PDF file as sentence-like chunks.

        PDF extraction yields hard-wrapped lines; consecutive lines are glued
        together until one ends in a sentence-terminating character, so each
        returned chunk approximates a full sentence/clause.
        """
        import PyPDF2
        contents = []
        with open(file_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            for page in pdf_reader.pages:
                page_text = page.extract_text().strip()
                raw_text = [text.strip() for text in page_text.splitlines() if text.strip()]
                new_text = ''
                for text in raw_text:
                    new_text += text
                    if text[-1] in ChatPDF._SENTENCE_END_CHARS:
                        contents.append(new_text)
                        new_text = ''
                # Flush a trailing fragment that never hit a terminator.
                if new_text:
                    contents.append(new_text)
        return contents

    @staticmethod
    def extract_text_from_txt(file_path: str) -> List[str]:
        """Extract text content from a TXT file, one non-empty line per chunk."""
        with open(file_path, 'r', encoding='utf-8') as f:
            contents = [text.strip() for text in f.readlines() if text.strip()]
        return contents

    @staticmethod
    def extract_text_from_docx(file_path: str) -> List[str]:
        """Extract text content from a DOCX file, one non-empty paragraph per chunk."""
        import docx
        document = docx.Document(file_path)
        contents = [paragraph.text.strip() for paragraph in document.paragraphs if paragraph.text.strip()]
        return contents

    @staticmethod
    def extract_text_from_markdown(file_path: str) -> List[str]:
        """Extract text content from a Markdown file.

        The markdown is rendered to HTML and stripped of tags, so chunks are
        plain-text lines without markup.
        """
        import markdown
        from bs4 import BeautifulSoup
        with open(file_path, 'r', encoding='utf-8') as f:
            markdown_text = f.read()
        html = markdown.markdown(markdown_text)
        soup = BeautifulSoup(html, 'html.parser')
        contents = [text.strip() for text in soup.get_text().splitlines() if text.strip()]
        return contents

    @staticmethod
    def _add_source_numbers(lst: List[str]) -> List[str]:
        """Prefix each reference string with a 1-based citation number."""
        return [f'[{idx + 1}]\t "{item}"' for idx, item in enumerate(lst)]

    def _generate_answer(self, query_str, context_str, history=None, max_length: int = 1024):
        """Fill PROMPT_TEMPLATE and run the generation model.

        :return: ``(response, out_history)`` as produced by the model's chat API.
        """
        prompt = PROMPT_TEMPLATE.format(context_str=context_str, query_str=query_str)
        response, out_history = self.gen_model.chat(prompt, history, max_length=max_length)
        return response, out_history

    def query(
            self,
            query: str,
            topn: int = 5,
            max_length: int = 1024,
            max_input_size: int = 1024,
            use_history: bool = False
    ):
        """Answer a question from the indexed corpus.

        :param query: the user question.
        :param topn: number of most-similar chunks to retrieve.
        :param max_length: max generation length passed to the chat model.
        :param max_input_size: character budget for context + template.
        :param use_history: if True, carry chat history across calls via
            ``self.history``.
        :return: ``(response, history, reference_results)`` 3-tuple.
        """
        sim_contents = self.sim_model.most_similar(query, topn=topn)
        # Flatten {query_id: {corpus_id: score}} into the matched corpus texts.
        reference_results = []
        for id_score_dict in sim_contents.values():
            for corpus_id in id_score_dict:
                reference_results.append(self.sim_model.corpus[corpus_id])
        if not reference_results:
            # Bug fix: the original returned a 2-tuple here but a 3-tuple on
            # success; keep the arity consistent for callers that unpack.
            return '没有提供足够的相关信息', self.history, reference_results
        reference_results = self._add_source_numbers(reference_results)
        # Truncate the joined references so context + template fit the budget.
        context_str = '\n'.join(reference_results)[:(max_input_size - len(PROMPT_TEMPLATE))]
        if use_history:
            response, out_history = self._generate_answer(query, context_str, self.history, max_length=max_length)
            self.history = out_history
        else:
            # Bug fix: max_length was silently dropped on this branch.
            response, out_history = self._generate_answer(query, context_str, max_length=max_length)
        return response, out_history, reference_results

    def save_index(self, index_path: Optional[str] = None):
        """Save the similarity index; defaults to '<document>_index.json'.

        :raises ValueError: if no path is given and no document was loaded.
        """
        if index_path is None:
            if self.pdf_path is None:
                # Clearer than the AttributeError None.split() used to raise.
                raise ValueError('No document loaded; pass index_path explicitly.')
            index_path = '.'.join(self.pdf_path.split('.')[:-1]) + '_index.json'
        self.sim_model.save_index(index_path)

    def load_index(self, index_path: Optional[str] = None):
        """Load a similarity index; defaults to '<document>_index.json'.

        :raises ValueError: if no path is given and no document was loaded.
        """
        if index_path is None:
            if self.pdf_path is None:
                raise ValueError('No document loaded; pass index_path explicitly.')
            index_path = '.'.join(self.pdf_path.split('.')[:-1]) + '_index.json'
        self.sim_model.load_index(index_path)
|
|
|
|
if __name__ == "__main__":
    import sys

    # The model path is argv[1], so one extra argument is enough.
    # Bug fix: the original tested `len(sys.argv) > 2`, which required a
    # spurious second argument before argv[1] was ever used.
    if len(sys.argv) > 1:
        gen_model_name_or_path = sys.argv[1]
    else:
        print('Usage: python chatpdf.py <gen_model_name_or_path>')
        gen_model_name_or_path = "THUDM/chatglm-6b-int4"
    m = ChatPDF(gen_model_name_or_path=gen_model_name_or_path)
    m.load_pdf_file(pdf_path='sample.pdf')
    response = m.query('自然语言中的非平行迁移是指什么?')
    print(response[0])
    response = m.query('本文作者是谁?')
    print(response[0])