| import os |
| import time |
| from langchain.chains import MapReduceDocumentsChain, LLMChain, ReduceDocumentsChain, StuffDocumentsChain |
| from langchain.document_loaders import NewsURLLoader |
| from langchain.llms import CTransformers |
| from langchain.prompts import PromptTemplate |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
|
|
# Prompt applied to every chunk of the article (the "map" step).
# The Mistral-instruct format wraps the user turn in [INST] ... [/INST]; the
# model emits </s> itself when its answer is complete, so the prompt must not
# end with </s>.
_MAP_TEMPLATE = """<s>[INST] The following is a part of an article:
{docs}
Based on this, please identify the main points.
Answer: [/INST]"""

# Prompt applied to the collected per-chunk summaries (the "reduce" step).
_REDUCE_TEMPLATE = """<s>[INST] The following is set of summaries from the article:
{doc_summaries}
Take these and distill it into a final, consolidated summary of the main points.
Construct it as a well organized summary of the main points and should be between 3 and 5 paragraphs.
Answer: [/INST]"""


def _load_mistral_llm():
    """Instantiate the quantized Mistral-7B-Instruct model through CTransformers."""
    config = {'max_new_tokens': 4096, 'temperature': 0.7, 'context_length': 4096}
    # `threads` is a ctransformers generation/config option, so it has to travel
    # inside `config`; os.cpu_count() may return None, in which case the
    # library default is kept.
    # NOTE(review): the LangChain wrapper appears to forward only `config` to
    # ctransformers -- confirm against the installed langchain version.
    cpu_count = os.cpu_count()
    if cpu_count is not None:
        config['threads'] = cpu_count
    return CTransformers(
        model="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
        model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
        config=config,
    )


def _build_map_reduce_chain(llm):
    """Wire the map prompt and the reduce prompt into a MapReduceDocumentsChain."""
    map_chain = LLMChain(llm=llm, prompt=PromptTemplate.from_template(_MAP_TEMPLATE))
    reduce_chain = LLMChain(
        llm=llm, prompt=PromptTemplate.from_template(_REDUCE_TEMPLATE)
    )

    # Stuffs all summaries into the single {doc_summaries} slot of the reduce prompt.
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="doc_summaries"
    )
    reduce_documents_chain = ReduceDocumentsChain(
        # Produces the final summary.
        combine_documents_chain=combine_documents_chain,
        # The same chain is reused to collapse summaries that exceed token_max.
        collapse_documents_chain=combine_documents_chain,
        token_max=4000,
    )
    return MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        # Name of the placeholder in the map prompt that receives each chunk.
        document_variable_name="docs",
    )


def summarize_article(article_url: str):
    """Summarize a news article with a local Mistral-7B map-reduce chain.

    The article is fetched with ``NewsURLLoader``, split into chunks of at
    most 4000 characters, each chunk is condensed to its main points, and the
    partial results are merged into one final summary.

    Args:
        article_url: URL of the article to summarize.

    Returns:
        A ``(summary, seconds)`` tuple: the final summary text and the time
        spent running the chain. Loading the article and the model is not
        included in the timing.

    Raises:
        ValueError: If no text could be extracted from ``article_url``.
    """
    docs = NewsURLLoader([article_url]).load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=0)
    split_docs = text_splitter.split_documents(docs)
    # Fail before the expensive model load: with nothing to summarize the
    # reduce prompt would be sent to the model with an empty summaries slot.
    if not split_docs:
        raise ValueError(
            f"No article content could be loaded from {article_url!r}"
        )

    map_reduce_chain = _build_map_reduce_chain(_load_mistral_llm())

    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()
    result = map_reduce_chain(
        {"input_documents": split_docs}, return_only_outputs=True
    )
    time_taken = time.perf_counter() - start_time
    return result['output_text'], time_taken
|
|