#!/usr/bin/env python
# coding: utf-8

# # GPT4All Langchain Demo
# 
# Example of locally running [`GPT4All`](https://github.com/nomic-ai/gpt4all), a 4GB, *llama.cpp* based large language model (LLM), under [`langchain`](https://github.com/hwchase17/langchain), in a Jupyter notebook running a Python 3.10 kernel.
# 
# *Tested on a mid-2015 16GB MacBook Pro, concurrently running Docker (a single container running a separate Jupyter server) and Chrome with approx. 40 open tabs.*

# ## Model preparation
# 
# - download the `gpt4all` model:

# In[ ]:


#https://the-eye.eu/public/AI/models/nomic-ai/gpt4all/gpt4all-lora-quantized.bin


# - download the `llama.cpp` 7B model:

# In[2]:


#%pip install pyllama
#!python3.10 -m llama.download --model_size 7B --folder llama/


# - convert the `gpt4all` model to the `llama.cpp` format:

# In[ ]:


#%pip install pyllamacpp
#!pyllamacpp-convert-gpt4all ./gpt4all-main/chat/gpt4all-lora-quantized.bin llama/tokenizer.model ./gpt4all-main/chat/gpt4all-lora-q-converted.bin


# In[25]:


GPT4ALL_MODEL_PATH = "./gpt4all-main/chat/gpt4all-lora-q-converted.bin"


# ## `langchain` Demo
# 
# Example of running a prompt using `langchain`.

# In[1]:


#https://python.langchain.com/en/latest/ecosystem/llamacpp.html
#%pip uninstall -y langchain
#%pip install --upgrade git+https://github.com/hwchase17/langchain.git

from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain


# - set up the prompt template:

# In[2]:


template = """
Question: {question}

Answer: Let's think step by step.
"""

prompt = PromptTemplate(template=template, input_variables=["question"])


# - load the model:

# In[3]:


get_ipython().run_cell_magic('time', '', '\nllm = LlamaCpp(model_path=GPT4ALL_MODEL_PATH)\n')


# - create a language chain using the prompt template and the loaded model:

# In[5]:


llm_chain = LLMChain(prompt=prompt, llm=llm)


# - run the prompt:

# In[6]:


get_ipython().run_cell_magic('time', '', 'question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"\n\nllm_chain.run(question)\n')


# Another example...

# In[7]:


template = """
Question: {question}

Answer:
"""

prompt = PromptTemplate(template=template, input_variables=["question"])


# In[8]:


llm_chain = LLMChain(prompt=prompt, llm=llm)


# In[9]:


get_ipython().run_cell_magic('time', '', 'question = "What is a relational database and what is ACID in that context?"\n\nllm_chain.run(question)\n')


# ## Generating Embeddings
# 
# We can also use the model to generate embeddings.

# In[28]:


get_ipython().run_cell_magic('time', '', '#https://abetlen.github.io/llama-cpp-python/\n#%pip uninstall -y llama-cpp-python\n#%pip install --upgrade llama-cpp-python\n\nfrom langchain.embeddings import LlamaCppEmbeddings\n\nllama_embeddings = LlamaCppEmbeddings(model_path=GPT4ALL_MODEL_PATH)\n')


# In[30]:


get_ipython().run_cell_magic('time', '', 'text = "This is a test document."\n\nquery_result = llama_embeddings.embed_query(text)\n')


# In[12]:


get_ipython().run_cell_magic('time', '', 'doc_result = llama_embeddings.embed_documents([text])\n')


# ## Example Query Supported by a Document Based Knowledge Source
# 
# Example document query using the example from the [`langchain` docs](https://python.langchain.com/en/latest/use_cases/question_answering.html).
# 
# The idea is to run the query against a document source to retrieve some relevant context, and use that as part of the prompt context.
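# A rough, hand-rolled sketch of that pattern (illustrative only: `retrieved_context` is a hard-coded stand-in for whatever a document search would return, and `rag_prompt` / `rag_chain` are just example names; the actual retrieval pipeline is built below): once some relevant text has been retrieved, it is simply interpolated into the prompt alongside the question.

# In[ ]:


# Sketch only - a stand-in for text returned by a document search:
retrieved_context = "The president nominated Ketanji Brown Jackson to the Supreme Court."

context_template = """
Use the following context to answer the question.

Context: {context}

Question: {question}

Answer:
"""

rag_prompt = PromptTemplate(template=context_template,
                            input_variables=["context", "question"])
rag_chain = LLMChain(prompt=rag_prompt, llm=llm)

# Both the retrieved context and the question are passed to the chain
rag_chain.run(context=retrieved_context,
              question="What did the president say about Ketanji Brown Jackson?")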
# In[19]:


#https://python.langchain.com/en/latest/use_cases/question_answering.html

template = """
Question: {question}

Answer:
"""

prompt = PromptTemplate(template=template, input_variables=["question"])

llm_chain = LLMChain(prompt=prompt, llm=llm)


# A naive prompt gives an irrelevant answer:

# In[20]:


get_ipython().run_cell_magic('time', '', 'query = "What did the president say about Ketanji Brown Jackson"\nllm_chain.run(query)\n')


# Now let's try with a source document.

# In[21]:


#!wget https://raw.githubusercontent.com/hwchase17/langchainjs/main/examples/state_of_the_union.txt

from langchain.document_loaders import TextLoader

# Ideally....
loader = TextLoader('./state_of_the_union.txt')


# However, creating the embeddings is quite slow, so I'm going to use a fragment of the text:

# In[108]:


#ish via chatgpt...
def search_context(src, phrase, buffer=100):
    # Return a window of `buffer` words either side of the first occurrence
    # of `phrase` (matched as a standalone word) in the file `src`
    with open(src, 'r') as f:
        txt = f.read()

    words = txt.split()
    index = words.index(phrase)
    start_index = max(0, index - buffer)
    end_index = min(len(words), index + buffer + 1)

    return ' '.join(words[start_index:end_index])

fragment = './fragment.txt'

with open(fragment, 'w') as fo:
    _txt = search_context('./state_of_the_union.txt', "Ketanji")
    fo.write(_txt)

get_ipython().system('cat $fragment')


# In[101]:


loader = TextLoader('./fragment.txt')


# In[102]:


#%pip install chromadb

from langchain.indexes import VectorstoreIndexCreator


# Generate an index from the knowledge source text:

# In[103]:


get_ipython().run_line_magic('time', '')

# Time: ~0.5s per token

# NOTE: "You must specify a persist_directory on creation to persist the collection."
# TO DO: How do we load in an already generated and persisted index?
#        (A possible approach is sketched at the end of this notebook.)

index = VectorstoreIndexCreator(embedding=llama_embeddings,
                                vectorstore_kwargs={"persist_directory": "db"}
                                ).from_loaders([loader])


# In[106]:


get_ipython().run_line_magic('time', '')

pass

# The following errors...
#index.query(query, llm=llm)

# With the full SOTU text, I got:
# Error: llama_tokenize: too many tokens;
# Also getting:
# ValueError: Requested tokens exceed context window of 512

# If we do get past that:
# NotEnoughElementsException
# For the latter, we somehow need to set something like search_kwargs={"k": 1}


# It seems the retriever expects four result documents by default. I can't see how to pass in a lower limit (a single response document is acceptable in this case), so we need to roll our own chain...

# In[121]:


get_ipython().run_cell_magic('time', '', '\n# Roll our own....\n\n#https://github.com/hwchase17/langchain/issues/2255\nfrom langchain.vectorstores import Chroma\nfrom langchain.text_splitter import RecursiveCharacterTextSplitter\nfrom langchain.chains import RetrievalQA\n\ndocuments = loader.load()\n\ntext_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)\ntexts = text_splitter.split_documents(documents)\n\n# Again, we should persist the db and figure out how to reuse it\ndocsearch = Chroma.from_documents(texts, llama_embeddings)\n')


# In[122]:


get_ipython().run_cell_magic('time', '', '\n# Just getting a single result document from the knowledge lookup is fine...\nMIN_DOCS = 1\n\nqa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff",\n                                 retriever=docsearch.as_retriever(search_kwargs={"k": MIN_DOCS}))\n')


# What do we get in response to our original query now?
# In[127]:


get_ipython().run_cell_magic('time', '', '\nprint(query)\n\nqa.run(query)\n')


# In[128]:


get_ipython().run_cell_magic('time', '', '\nquery = "Identify three things the president said about Ketanji Brown Jackson"\n\nqa.run(query)\n')


# In[129]:


get_ipython().run_cell_magic('time', '', '\nquery = """\nIdentify three things the president said about Ketanji Brown Jackson. Provide the answer in the form: \n\n- ITEM 1\n- ITEM 2\n- ITEM 3\n"""\n\nqa.run(query)\n')
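# ## Persisting and Reloading the Index
# 
# A possible answer to the earlier TO DO, untested here, so treat it as a sketch rather than a recipe: `Chroma` takes a `persist_directory` argument and has a `persist()` method, and a persisted collection can apparently be reopened by constructing `Chroma` with the same directory and embedding function. The names `persisted_search`, `reloaded_search` and `qa2` below are illustrative only.

# In[ ]:


# Sketch: persist the vector store to disk...
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA

persisted_search = Chroma.from_documents(texts, llama_embeddings,
                                         persist_directory="db")
persisted_search.persist()

# ...then, later (or in a fresh session), reload it without re-embedding the documents
reloaded_search = Chroma(persist_directory="db",
                         embedding_function=llama_embeddings)

qa2 = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff",
                                  retriever=reloaded_search.as_retriever(search_kwargs={"k": MIN_DOCS}))

# The reloaded index should then answer queries as before, e.g.
#qa2.run(query)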