From cdddd00a30a047dca2b5aa13b38f9d2391dc56be Mon Sep 17 00:00:00 2001
From: Evan Reichard
Date: Wed, 8 Nov 2023 18:34:55 -0500
Subject: [PATCH] [add] migrate chromadb to plugin

---
 .gitignore                       |  1 +
 README.md                        | 89 ++++++++++++++++----------
 minyma/__init__.py               |  7 ++-
 minyma/config.py                 |  4 +-
 minyma/normalizer.py             |  1 +
 minyma/oai.py                    | 43 ++-------
 minyma/plugin.py                 |  9 ++--
 minyma/plugins/chroma_db.py      | 41 +++++++++++++++
 minyma/plugins/duckduckgo.py     |  4 +-
 minyma/plugins/home_assistant.py |  6 +--
 minyma/plugins/vehicle_lookup.py |  4 +-
 minyma/vdb.py                    | 25 +++++----
 12 files changed, 123 insertions(+), 111 deletions(-)
 create mode 100644 minyma/plugins/chroma_db.py

diff --git a/.gitignore b/.gitignore
index 01671fc..a040b8d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@ __pycache__
 .DS_Store
 .direnv
 data
+datasets
 venv
 openai_key
 ha_key
diff --git a/README.md b/README.md
index 8e7f4e3..ce89a38 100644
--- a/README.md
+++ b/README.md
@@ -13,12 +13,44 @@

 ---

-AI Chat Bot with Plugins (Home Assistant, Vehicle Lookup, DuckDuckGo Search)
+AI Chat Bot with Plugins (RAG VectorDB - ChromaDB, DuckDuckGo Search, Home Assistant, Vehicle Lookup)

 [![Build Status](https://drone.va.reichard.io/api/badges/evan/minyma/status.svg)](https://drone.va.reichard.io/evan/minyma)

 ## Plugins

+### ChromaDB Embeddings / Vectors
+
+This utilizes a local embeddings DB. This allows you to ask the assistant
+about local information. [Utilizes Retrieval-Augmented Generation (RAG)](https://arxiv.org/abs/2005.11401).
+
+```
+User: What are some common symptoms of COVID-19?
+Assistant: Some common symptoms of COVID-19 mentioned in the context are
+           fatigue, headache, dyspnea (shortness of breath), anosmia (loss of
+           sense of smell), lower respiratory symptoms, cardiac symptoms,
+           concentration or memory issues, tinnitus and earache, and peripheral
+           neuropathy symptoms.
+```
+
+**NOTE:** Instructions on how to load this with your own information are in the
+"Normalizing & Loading Data" section. We include a PubMed data normalizer as an
+example.
+
+### DuckDuckGo
+
+This utilizes DuckDuckGo Search by scraping the top 5 results.
+
+```
+User: Tell me about Evan Reichard
+Assistant: Evan Reichard is a Principal Detection and Response Engineer based
+           in the Washington DC-Baltimore Area. He has been in this role since
+           August 2022. Evan has created a browser extension that helps SOC
+           analysts and saves them over 300 hours per month. Additionally,
+           there are three professionals named Evan Reichard on LinkedIn and
+           there are also profiles of people named Evan Reichard on Facebook.
+```
+
 ### Vehicle Lookup API

 This utilizes Carvana's undocumented API to lookup details on a vehicle.
@@ -41,25 +73,12 @@ User: Turn on the living room lights
 Assistant: The living room lights have been turned on successfully.
 ```

-### DuckDuckGo
-
-This utilizes DuckDuckGo Search by scraping the top 5 results.
-
-```
-User: Tell me about Evan Reichard
-Assistant: Evan Reichard is a Principal Detection and Response Engineer based
-           in the Washington DC-Baltimore Area. He has been in this role since
-           August 2022. Evan has created a browser extension that helps SOC
-           analysts and saves them over 300 hours per month. Additionally,
-           there are three professionals named Evan Reichard on LinkedIn and
-           there are also profiles of people named Evan Reichard on Facebook.
-```
-
 ## Running Server

 ```bash
 # Locally (See "Development" Section)
 export OPENAI_API_KEY=`cat openai_key`
+export CHROMA_DATA_PATH=/data
 export HOME_ASSISTANT_API_KEY=`cat ha_key`
 export HOME_ASSISTANT_URL=https://some-url.com

 minyma server run

 docker run \
   -p 5000:5000 \
   -e OPENAI_API_KEY=`cat openai_key` \
-  -e DATA_PATH=/data \
+  -e CHROMA_DATA_PATH=/data \
   -v ./data:/data \
   gitea.va.reichard.io/evan/minyma:latest
 ```
@@ -87,10 +106,10 @@
 To normalize data, you can use Minyma's `normalize` CLI command:

 ```bash
 minyma normalize \
-  --filename ./pubmed_manuscripts.jsonl \
   --normalizer pubmed \
   --database chroma \
-  --datapath ./chroma
+  --datapath ./data \
+  --filename ./datasets/pubmed_manuscripts.jsonl
 ```

 The above example does the following:
@@ -105,10 +124,12 @@ The above example does the following:

 ## Configuration

-| Environment Variable | Default Value | Description                                                                         |
-| -------------------- | ------------- | ----------------------------------------------------------------------------------- |
-| OPENAI_API_KEY       | NONE          | Required OpenAI API Key for ChatGPT access.                                         |
-| DATA_PATH            | ./data        | The path to the data directory. Chroma will store its data in the `chroma` subdir.  |
+| Environment Variable   | Default Value | Description                         |
+| ---------------------- | ------------- | ----------------------------------- |
+| OPENAI_API_KEY         | NONE          | Required OpenAI API Key for ChatGPT |
+| CHROMA_DATA_PATH       | NONE          | ChromaDB Persistent Data Directory  |
+| HOME_ASSISTANT_API_KEY | NONE          | Home Assistant API Key              |
+| HOME_ASSISTANT_URL     | NONE          | Home Assistant Instance URL         |

 # Development

 ```bash
 python3 -m venv venv

 # Local Development
 pip install -e .

-# Creds
+# Creds & Other Environment Variables
 export OPENAI_API_KEY=`cat openai_key`

 # Docker
 make docker_build_local
 ```
-
-# Notes
-
-This is the first time I'm doing anything LLM related, so it was an adventure.
-Initially I was entertaining OpenAI's Embedding API with plans to load embeddings
-into Pinecone, however initial calculations with `tiktoken` showed that generating
-embeddings would cost roughly $250 USD.
-
-Fortunately I found [Chroma](https://www.trychroma.com/), which basically solved
-both of those issues. It allowed me to load in the normalized data and automatically
-generated embeddings for me.
-
-In order to fit into OpenAI ChatGPT's token limit, I limited each document to roughly
-1000 words. I wanted to make sure I could add the top two matches as context while
-still having enough headroom for the actual question from the user.
-
-A few notes:
-
-- Context is not carried over from previous messages
-- I "stole" the prompt that is used in LangChain (See `oai.py`). I tried some variations without much (subjective) improvement.
-- A generalized normalizer format. This should make it fairly easy to use completely different data. Just add a new normalizer that implements the super class.
-- Basic web front end with TailwindCSS
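Illustrative sketch (not part of the diff): with this change, `minyma normalize` keys each Chroma collection off the normalizer's `name` attribute (the CLI passes `norm.name` to `vdb.load_documents()` in `minyma/__init__.py` below), and the new ChromaDB plugin later queries that same collection. An additional normalizer could look roughly like the following, assuming a hypothetical `wikipedia` JSONL dataset with `title` and `text` fields; the class, dataset, and field names are assumptions, while the `name` attribute and the id/doc/metadata dict shape come from this patch:

    import json
    from io import TextIOWrapper

    from minyma.normalizer import DataNormalizer  # assumed to be the base class defined in minyma/normalizer.py


    class WikipediaNormalizer(DataNormalizer):
        """Hypothetical normalizer: yields the dict shape ChromaDB.load_documents() reads."""

        def __init__(self, file: TextIOWrapper):
            self.name = "wikipedia"             # becomes the Chroma collection name
            self.file = file
            self.length = sum(1 for _ in file)  # used by len(normalizer) for the progress bar
            file.seek(0)

        def __len__(self):
            return self.length

        def __iter__(self):
            for i, line in enumerate(self.file):
                record = json.loads(line)
                yield {
                    "id": str(i),                                    # unique document id
                    "doc": record.get("text", ""),                   # text that gets embedded
                    "metadata": {"title": record.get("title", "")},  # stored alongside the vector
                }

A normalizer like this would still need to be registered in the `normalize` CLI command in `minyma/__init__.py`, which selects normalizers by name and otherwise prints "INVALID NORMALIZER".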
diff --git a/minyma/__init__.py b/minyma/__init__.py
index c6dc393..a9b710c 100644
--- a/minyma/__init__.py
+++ b/minyma/__init__.py
@@ -16,15 +16,14 @@ def signal_handler(sig, frame):


 def create_app():
-    global oai, vdb, plugins
+    global oai, plugins

     from minyma.config import Config
     import minyma.api.common as api_common
     import minyma.api.v1 as api_v1

     app = Flask(__name__)
-    vdb = ChromaDB(path.join(Config.DATA_PATH, "chroma"))
-    oai = OpenAIConnector(Config.OPENAI_API_KEY, vdb)
+    oai = OpenAIConnector(Config.OPENAI_API_KEY)
     plugins = PluginLoader(Config)

     app.register_blueprint(api_common.bp)
@@ -70,7 +69,7 @@ def normalize(filename, normalizer, database, datapath):
         return print("INVALID NORMALIZER:", normalizer)

     # Process Data
-    vdb.load_documents(norm)
+    vdb.load_documents(norm.name, norm)


 signal.signal(signal.SIGINT, signal_handler)
diff --git a/minyma/config.py b/minyma/config.py
index 9fb1594..32c686e 100644
--- a/minyma/config.py
+++ b/minyma/config.py
@@ -19,7 +19,7 @@ class Config:
     OpenAI API Key - Required
     """

-    DATA_PATH: str = get_env("DATA_PATH", default="./data")
-    OPENAI_API_KEY: str = get_env("OPENAI_API_KEY", required=True)
+    CHROMA_DATA_PATH: str = get_env("CHROMA_DATA_PATH", required=False)
     HOME_ASSISTANT_API_KEY: str = get_env("HOME_ASSISTANT_API_KEY", required=False)
     HOME_ASSISTANT_URL: str = get_env("HOME_ASSISTANT_URL", required=False)
+    OPENAI_API_KEY: str = get_env("OPENAI_API_KEY", required=True)
diff --git a/minyma/normalizer.py b/minyma/normalizer.py
index 2d6b2b6..2599a10 100644
--- a/minyma/normalizer.py
+++ b/minyma/normalizer.py
@@ -18,6 +18,7 @@ class PubMedNormalizer(DataNormalizer):
     normalized inside the iterator.
     """
     def __init__(self, file: TextIOWrapper):
+        self.name = "pubmed"
         self.file = file
         self.length = 0
diff --git a/minyma/oai.py b/minyma/oai.py
index 2d2976c..18f5e69 100644
--- a/minyma/oai.py
+++ b/minyma/oai.py
@@ -3,22 +3,8 @@ from textwrap import indent
 from dataclasses import dataclass
 from typing import Any, List
 import openai
-
-from minyma.vdb import VectorDB
 import minyma

-# Stolen LangChain Prompt
-PROMPT_TEMPLATE = """
-Use the following pieces of context to answer the question at the end.
-If you don't know the answer, just say that you don't know, don't try to
-make up an answer.
-
-{context}
-
-Question: {question}
-Helpful Answer:
-"""
-
 INITIAL_PROMPT_TEMPLATE = """
 You are a helpful assistant. You are connected to various external functions that can provide you with more personalized and up-to-date information and have already been granted the permissions to execute these functions at will.
 DO NOT say you don't have access to real time information, instead attempt to call one or more of the listed functions:
@@ -47,8 +33,7 @@ class ChatCompletion:
     usage: dict

 class OpenAIConnector:
-    def __init__(self, api_key: str, vdb: VectorDB):
-        self.vdb = vdb
+    def __init__(self, api_key: str):
         self.model = "gpt-3.5-turbo"
         self.word_cap = 1000
         openai.api_key = api_key
@@ -102,7 +87,8 @@ class OpenAIConnector:
         # Build Response Text
         response_content_arr = []
         for key, val in func_responses.items():
-            response_content_arr.append("- %s\n%s" % (key, val))
+            indented_val = indent(val, ' ' * 2)
+            response_content_arr.append("- %s\n%s" % (key, indented_val))
         response_content = "\n".join(response_content_arr)

         # Create Follow Up Prompt
@@ -137,26 +123,3 @@ class OpenAIConnector:
                 "total_tokens": total_tokens
             }
         }
-
-    def old_query(self, question: str) -> Any:
-        # Get related documents from vector db
-        related = self.vdb.get_related(question)
-
-        # Validate results
-        all_docs = related.get("docs", [])
-        if len(all_docs) == 0:
-            return { "error": "No Context Found" }
-
-        # Join on new line (cap @ word limit), generate main prompt
-        reduced_docs = list(map(lambda x: " ".join(x.split()[:self.word_cap]), all_docs))
-        context = '\n'.join(reduced_docs)
-        prompt = PROMPT_TEMPLATE.format(context = context, question = question)
-
-        # Query OpenAI ChatCompletion
-        response = openai.ChatCompletion.create(
-            model=self.model,
-            messages=[{"role": "user", "content": prompt}]
-        )
-
-        # Return Response
-        return { "llm": response, "vdb": related }
diff --git a/minyma/plugin.py b/minyma/plugin.py
index d331fb6..53452b4 100644
--- a/minyma/plugin.py
+++ b/minyma/plugin.py
@@ -38,7 +38,6 @@ class PluginLoader:

         for func_obj in plugin.functions:
             func_name = func_obj.__name__
-            function_name = "%s_%s" % (plugin_name, func_name)

             signature = inspect.signature(func_obj)
             params = list(
@@ -48,8 +47,12 @@ class PluginLoader:
                 )
             )

-            func_def = "%s(%s)" % (function_name, ", ".join(params))
-            defs[function_name] = { "func": func_obj, "def": func_def }
+            if func_name in defs:
+                print("[PluginLoader] Error: Duplicate Function : (%s) %s" % (plugin_name, func_name))
+                continue
+
+            func_def = "%s(%s)" % (func_name, ", ".join(params))
+            defs[func_name] = { "func": func_obj, "def": func_def }

         return defs
diff --git a/minyma/plugins/chroma_db.py b/minyma/plugins/chroma_db.py
new file mode 100644
index 0000000..98c360c
--- /dev/null
+++ b/minyma/plugins/chroma_db.py
@@ -0,0 +1,41 @@
+from textwrap import indent
+from minyma.plugin import MinymaPlugin
+from minyma.vdb import ChromaDB
+
+
+class ChromaDBPlugin(MinymaPlugin):
+    """Perform Local VectorDB Lookup
+
+    ChromaDB can access multiple "collections". You can add additional functions
+    here that just access a different collection (i.e. different data)
+    """
+
+    def __init__(self, config):
+        self.name = "chroma_db"
+        self.config = config
+
+        if not config.CHROMA_DATA_PATH:
+            self.functions = []
+        else:
+            self.vdb = ChromaDB(config.CHROMA_DATA_PATH)
+            self.functions = [self.lookup_pubmed_data]
+
+
+    def __lookup_data(self, collection_name: str, query: str):
+        # Get Related
+        related = self.vdb.get_related(collection_name, query)
+
+        # Normalize Data
+        return list(
+            map(
+                lambda x: " ".join(x.split()[:self.vdb.word_cap]),
+                related.get("docs", [])
+            )
+        )
+
+
+    def lookup_pubmed_data(self, query: str):
+        COLLECTION_NAME = "pubmed"
+        documents = self.__lookup_data(COLLECTION_NAME, query)
+        context = '\n'.join(documents)
+        return context
diff --git a/minyma/plugins/duckduckgo.py b/minyma/plugins/duckduckgo.py
index 7cf6167..3b5df41 100644
--- a/minyma/plugins/duckduckgo.py
+++ b/minyma/plugins/duckduckgo.py
@@ -14,9 +14,9 @@ class DuckDuckGoPlugin(MinymaPlugin):
     def __init__(self, config):
         self.config = config
         self.name = "duck_duck_go"
-        self.functions = [self.search]
+        self.functions = [self.duck_duck_go_search]

-    def search(self, query: str):
+    def duck_duck_go_search(self, query: str):
         """Search DuckDuckGo"""
         resp = requests.get("https://html.duckduckgo.com/html/?q=%s" % query, headers=HEADERS)
         soup = BeautifulSoup(resp.text, features="html.parser")
diff --git a/minyma/plugins/home_assistant.py b/minyma/plugins/home_assistant.py
index 5ef0b8b..d06191e 100644
--- a/minyma/plugins/home_assistant.py
+++ b/minyma/plugins/home_assistant.py
@@ -9,7 +9,7 @@ class HomeAssistantPlugin(MinymaPlugin):

     def __init__(self, config):
         self.config = config
-        self.name = "home_automation"
+        self.name = "home_assistant"


         if not config.HOME_ASSISTANT_API_KEY or not config.HOME_ASSISTANT_URL:
@@ -20,9 +20,9 @@ class HomeAssistantPlugin(MinymaPlugin):
             self.functions = []
         else:
-            self.functions = [self.command]
+            self.functions = [self.home_automation_command]

-    def command(self, natural_language_command: str):
+    def home_automation_command(self, natural_language_command: str):
         url = urllib.parse.urljoin(self.config.HOME_ASSISTANT_URL, "/api/conversation/process")
         headers = {
             "Authorization": "Bearer %s" % self.config.HOME_ASSISTANT_API_KEY,
diff --git a/minyma/plugins/vehicle_lookup.py b/minyma/plugins/vehicle_lookup.py
index 599cbf8..48b28d1 100644
--- a/minyma/plugins/vehicle_lookup.py
+++ b/minyma/plugins/vehicle_lookup.py
@@ -14,7 +14,7 @@ class VehicleLookupPlugin(MinymaPlugin):
     def __init__(self, config):
         self.config = config
         self.name = "vehicle_state_plate"
-        self.functions = [self.lookup]
+        self.functions = [self.lookup_vehicle_by_state_plate]

     def __query_api(self, url, json=None, headers=None):
         # Perform Request
@@ -39,7 +39,7 @@ class VehicleLookupPlugin(MinymaPlugin):

         return None, text, error

-    def lookup(self, state_abbreviation: str, licence_plate: str):
+    def lookup_vehicle_by_state_plate(self, state_abbreviation: str, licence_plate: str):
         CARVANA_URL = (
             "https://apim.carvana.io/trades/api/v5/vehicleconfiguration/plateLookup/%s/%s"
             % (state_abbreviation, licence_plate)
diff --git a/minyma/vdb.py b/minyma/vdb.py
index ee2c52e..6d06c41 100644
--- a/minyma/vdb.py
+++ b/minyma/vdb.py
@@ -18,11 +18,11 @@ def chunk(iterable, chunk_size: int):
 """
 VectorDB Interface
 """
 class VectorDB:
-    def load_documents(self, normalizer: DataNormalizer):
-        pass
+    def load_documents(self, name: str, normalizer: DataNormalizer, chunk_size: int = 10):
+        raise NotImplementedError("VectorDB must implement load_documents")

-    def get_related(self, question: str) -> Any:
-        pass
+    def get_related(self, name: str, question: str) -> Any:
+        raise NotImplementedError("VectorDB must implement get_related")

 """
 ChromaDV VectorDB Type
 """
 class ChromaDB(VectorDB):
     def __init__(self, path: str):
         self.client: API = chromadb.PersistentClient(path=path)
         self.word_cap = 2500
-        self.collection_name: str = "vdb"
-        self.collection: chromadb.Collection = self.client.create_collection(name=self.collection_name, get_or_create=True)

-    def get_related(self, question: str) -> Any:
+    def get_related(self, name: str, question: str) -> Any:
+        # Get or Create Collection
+        collection: chromadb.Collection = self.client.create_collection(name=name, get_or_create=True)
+
         """Returns line separated related docs"""
-        results = self.collection.query(
+        results = collection.query(
             query_texts=[question.lower()],
             n_results=2
         )
@@ -53,7 +54,11 @@ class ChromaDB(VectorDB):
             "ids": all_ids
         }

-    def load_documents(self, normalizer: DataNormalizer, chunk_size: int = 10):
+    def load_documents(self, name: str, normalizer: DataNormalizer, chunk_size: int = 10):
+        # Get or Create Collection
+        collection: chromadb.Collection = self.client.create_collection(name=name, get_or_create=True)
+
+        # Load Items
         length = len(normalizer) / chunk_size
         for items in tqdm(chunk(normalizer, chunk_size), total=length):
             ids = []
@@ -65,7 +70,7 @@ class ChromaDB(VectorDB):

                 ids.append(item.get("id"))
                 metadatas.append(item.get("metadata", {}))

-            self.collection.add(
+            collection.add(
                 ids=ids,
                 documents=documents,
                 metadatas=metadatas,
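Illustrative sketch (not part of the diff): ChromaDBPlugin exposes one lookup function per Chroma collection, and its docstring invites adding more. A second lookup could look roughly like the following, reusing the private __lookup_data helper added in this patch; the "wikipedia" collection and the function name are assumptions and would have to match the `name` a corresponding normalizer used when the data was loaded with `minyma normalize`:

    # Hypothetical addition inside ChromaDBPlugin (minyma/plugins/chroma_db.py)
    def lookup_wikipedia_data(self, query: str):
        # Assumed collection name; must match the normalizer's `name` at load time
        COLLECTION_NAME = "wikipedia"
        documents = self.__lookup_data(COLLECTION_NAME, query)
        return '\n'.join(documents)

The new method would also need to be registered in ChromaDBPlugin.__init__ (e.g. self.functions = [self.lookup_pubmed_data, self.lookup_wikipedia_data]) so that PluginLoader picks it up; since function definitions are now keyed by bare function name, the name must stay unique across plugins.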