Initial Commit
This commit is contained in:
		
						commit
						f862e1b8bb
					
				
							
								
								
									
										6
									
								
								.dockerignore
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								.dockerignore
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,6 @@ | ||||
| __pycache__ | ||||
| .direnv | ||||
| data | ||||
| venv | ||||
| openai_key | ||||
| minyma.egg-info/ | ||||
							
								
								
									
										6
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @ -0,0 +1,6 @@ | ||||
| __pycache__ | ||||
| .direnv | ||||
| data | ||||
| venv | ||||
| openai_key | ||||
| minyma.egg-info/ | ||||
							
								
								
									
										22
									
								
								.pre-commit-config.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								.pre-commit-config.yaml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,22 @@ | ||||
| repos: | ||||
|   - repo: https://github.com/psf/black | ||||
|     rev: 23.9.1 | ||||
|     hooks: | ||||
|       - id: black | ||||
|         name: black | ||||
|         language_version: python3.10 | ||||
|         files: "^minyma/|^setup.py|^tests/minyma/" | ||||
|   - repo: https://github.com/pycqa/flake8 | ||||
|     rev: 6.1.0 | ||||
|     hooks: | ||||
|       - id: flake8 | ||||
|         name: flake8 | ||||
|         args: ["--config=.flake8"] | ||||
|         files: "^minyma/|^setup.py|^tests/minyma/" | ||||
|   - repo: https://github.com/pycqa/isort | ||||
|     rev: 5.12.0 | ||||
|     hooks: | ||||
|       - id: isort | ||||
|         name: isort | ||||
|         args: ["--profile", "black", "--filter-files"] | ||||
|         files: "^minyma/|^setup.py|^tests/minyma/" | ||||
							
								
								
									
										26
									
								
								Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,26 @@ | ||||
# Build Container
FROM python:3.11-slim

WORKDIR /app

# Install Curl
# update + install share one layer so a cached, stale package index can never
# be paired with a fresh install; the index is removed afterwards because it
# is only needed while installing.
RUN apt-get update -y \
    && apt-get install -y curl \
    && rm -rf /var/lib/apt/lists/*

# Install Chroma Dependencies
# --fail aborts the build on an HTTP error instead of saving the error page
# as onnx.tar.gz; --location follows redirects.
RUN mkdir -p /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/ \
    && curl --fail --location https://chroma-onnx-models.s3.amazonaws.com/all-MiniLM-L6-v2/onnx.tar.gz --output /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz

# Install App
# Copied after the system/model layers so source edits do not invalidate them.
COPY . /app

# Install App & Gunicorn
RUN pip install --no-cache-dir . gunicorn

# Cleanup
RUN rm -rf /app

# Start Application
ENTRYPOINT ["gunicorn"]
EXPOSE 5000
CMD ["minyma:create_app()", "--bind", "0.0.0.0:5000", "--threads=4", "--access-logfile", "-"]
							
								
								
									
										339
									
								
								LICENSE
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										339
									
								
								LICENSE
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,339 @@ | ||||
| GNU GENERAL PUBLIC LICENSE | ||||
|                        Version 2, June 1991 | ||||
| 
 | ||||
|  Copyright (C) 1989, 1991 Free Software Foundation, Inc., | ||||
|  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  Everyone is permitted to copy and distribute verbatim copies | ||||
|  of this license document, but changing it is not allowed. | ||||
| 
 | ||||
|                             Preamble | ||||
| 
 | ||||
|   The licenses for most software are designed to take away your | ||||
| freedom to share and change it.  By contrast, the GNU General Public | ||||
| License is intended to guarantee your freedom to share and change free | ||||
| software--to make sure the software is free for all its users.  This | ||||
| General Public License applies to most of the Free Software | ||||
| Foundation's software and to any other program whose authors commit to | ||||
| using it.  (Some other Free Software Foundation software is covered by | ||||
| the GNU Lesser General Public License instead.)  You can apply it to | ||||
| your programs, too. | ||||
| 
 | ||||
|   When we speak of free software, we are referring to freedom, not | ||||
| price.  Our General Public Licenses are designed to make sure that you | ||||
| have the freedom to distribute copies of free software (and charge for | ||||
| this service if you wish), that you receive source code or can get it | ||||
| if you want it, that you can change the software or use pieces of it | ||||
| in new free programs; and that you know you can do these things. | ||||
| 
 | ||||
|   To protect your rights, we need to make restrictions that forbid | ||||
| anyone to deny you these rights or to ask you to surrender the rights. | ||||
| These restrictions translate to certain responsibilities for you if you | ||||
| distribute copies of the software, or if you modify it. | ||||
| 
 | ||||
|   For example, if you distribute copies of such a program, whether | ||||
| gratis or for a fee, you must give the recipients all the rights that | ||||
| you have.  You must make sure that they, too, receive or can get the | ||||
| source code.  And you must show them these terms so they know their | ||||
| rights. | ||||
| 
 | ||||
|   We protect your rights with two steps: (1) copyright the software, and | ||||
| (2) offer you this license which gives you legal permission to copy, | ||||
| distribute and/or modify the software. | ||||
| 
 | ||||
|   Also, for each author's protection and ours, we want to make certain | ||||
| that everyone understands that there is no warranty for this free | ||||
| software.  If the software is modified by someone else and passed on, we | ||||
| want its recipients to know that what they have is not the original, so | ||||
| that any problems introduced by others will not reflect on the original | ||||
| authors' reputations. | ||||
| 
 | ||||
|   Finally, any free program is threatened constantly by software | ||||
| patents.  We wish to avoid the danger that redistributors of a free | ||||
| program will individually obtain patent licenses, in effect making the | ||||
| program proprietary.  To prevent this, we have made it clear that any | ||||
| patent must be licensed for everyone's free use or not licensed at all. | ||||
| 
 | ||||
|   The precise terms and conditions for copying, distribution and | ||||
| modification follow. | ||||
| 
 | ||||
|                     GNU GENERAL PUBLIC LICENSE | ||||
|    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION | ||||
| 
 | ||||
|   0. This License applies to any program or other work which contains | ||||
| a notice placed by the copyright holder saying it may be distributed | ||||
| under the terms of this General Public License.  The "Program", below, | ||||
| refers to any such program or work, and a "work based on the Program" | ||||
| means either the Program or any derivative work under copyright law: | ||||
| that is to say, a work containing the Program or a portion of it, | ||||
| either verbatim or with modifications and/or translated into another | ||||
| language.  (Hereinafter, translation is included without limitation in | ||||
| the term "modification".)  Each licensee is addressed as "you". | ||||
| 
 | ||||
| Activities other than copying, distribution and modification are not | ||||
| covered by this License; they are outside its scope.  The act of | ||||
| running the Program is not restricted, and the output from the Program | ||||
| is covered only if its contents constitute a work based on the | ||||
| Program (independent of having been made by running the Program). | ||||
| Whether that is true depends on what the Program does. | ||||
| 
 | ||||
|   1. You may copy and distribute verbatim copies of the Program's | ||||
| source code as you receive it, in any medium, provided that you | ||||
| conspicuously and appropriately publish on each copy an appropriate | ||||
| copyright notice and disclaimer of warranty; keep intact all the | ||||
| notices that refer to this License and to the absence of any warranty; | ||||
| and give any other recipients of the Program a copy of this License | ||||
| along with the Program. | ||||
| 
 | ||||
| You may charge a fee for the physical act of transferring a copy, and | ||||
| you may at your option offer warranty protection in exchange for a fee. | ||||
| 
 | ||||
|   2. You may modify your copy or copies of the Program or any portion | ||||
| of it, thus forming a work based on the Program, and copy and | ||||
| distribute such modifications or work under the terms of Section 1 | ||||
| above, provided that you also meet all of these conditions: | ||||
| 
 | ||||
|     a) You must cause the modified files to carry prominent notices | ||||
|     stating that you changed the files and the date of any change. | ||||
| 
 | ||||
|     b) You must cause any work that you distribute or publish, that in | ||||
|     whole or in part contains or is derived from the Program or any | ||||
|     part thereof, to be licensed as a whole at no charge to all third | ||||
|     parties under the terms of this License. | ||||
| 
 | ||||
|     c) If the modified program normally reads commands interactively | ||||
|     when run, you must cause it, when started running for such | ||||
|     interactive use in the most ordinary way, to print or display an | ||||
|     announcement including an appropriate copyright notice and a | ||||
|     notice that there is no warranty (or else, saying that you provide | ||||
|     a warranty) and that users may redistribute the program under | ||||
|     these conditions, and telling the user how to view a copy of this | ||||
|     License.  (Exception: if the Program itself is interactive but | ||||
|     does not normally print such an announcement, your work based on | ||||
|     the Program is not required to print an announcement.) | ||||
| 
 | ||||
| These requirements apply to the modified work as a whole.  If | ||||
| identifiable sections of that work are not derived from the Program, | ||||
| and can be reasonably considered independent and separate works in | ||||
| themselves, then this License, and its terms, do not apply to those | ||||
| sections when you distribute them as separate works.  But when you | ||||
| distribute the same sections as part of a whole which is a work based | ||||
| on the Program, the distribution of the whole must be on the terms of | ||||
| this License, whose permissions for other licensees extend to the | ||||
| entire whole, and thus to each and every part regardless of who wrote it. | ||||
| 
 | ||||
| Thus, it is not the intent of this section to claim rights or contest | ||||
| your rights to work written entirely by you; rather, the intent is to | ||||
| exercise the right to control the distribution of derivative or | ||||
| collective works based on the Program. | ||||
| 
 | ||||
| In addition, mere aggregation of another work not based on the Program | ||||
| with the Program (or with a work based on the Program) on a volume of | ||||
| a storage or distribution medium does not bring the other work under | ||||
| the scope of this License. | ||||
| 
 | ||||
|   3. You may copy and distribute the Program (or a work based on it, | ||||
| under Section 2) in object code or executable form under the terms of | ||||
| Sections 1 and 2 above provided that you also do one of the following: | ||||
| 
 | ||||
|     a) Accompany it with the complete corresponding machine-readable | ||||
|     source code, which must be distributed under the terms of Sections | ||||
|     1 and 2 above on a medium customarily used for software interchange; or, | ||||
| 
 | ||||
|     b) Accompany it with a written offer, valid for at least three | ||||
|     years, to give any third party, for a charge no more than your | ||||
|     cost of physically performing source distribution, a complete | ||||
|     machine-readable copy of the corresponding source code, to be | ||||
|     distributed under the terms of Sections 1 and 2 above on a medium | ||||
|     customarily used for software interchange; or, | ||||
| 
 | ||||
|     c) Accompany it with the information you received as to the offer | ||||
|     to distribute corresponding source code.  (This alternative is | ||||
|     allowed only for noncommercial distribution and only if you | ||||
|     received the program in object code or executable form with such | ||||
|     an offer, in accord with Subsection b above.) | ||||
| 
 | ||||
| The source code for a work means the preferred form of the work for | ||||
| making modifications to it.  For an executable work, complete source | ||||
| code means all the source code for all modules it contains, plus any | ||||
| associated interface definition files, plus the scripts used to | ||||
| control compilation and installation of the executable.  However, as a | ||||
| special exception, the source code distributed need not include | ||||
| anything that is normally distributed (in either source or binary | ||||
| form) with the major components (compiler, kernel, and so on) of the | ||||
| operating system on which the executable runs, unless that component | ||||
| itself accompanies the executable. | ||||
| 
 | ||||
| If distribution of executable or object code is made by offering | ||||
| access to copy from a designated place, then offering equivalent | ||||
| access to copy the source code from the same place counts as | ||||
| distribution of the source code, even though third parties are not | ||||
| compelled to copy the source along with the object code. | ||||
| 
 | ||||
|   4. You may not copy, modify, sublicense, or distribute the Program | ||||
| except as expressly provided under this License.  Any attempt | ||||
| otherwise to copy, modify, sublicense or distribute the Program is | ||||
| void, and will automatically terminate your rights under this License. | ||||
| However, parties who have received copies, or rights, from you under | ||||
| this License will not have their licenses terminated so long as such | ||||
| parties remain in full compliance. | ||||
| 
 | ||||
|   5. You are not required to accept this License, since you have not | ||||
| signed it.  However, nothing else grants you permission to modify or | ||||
| distribute the Program or its derivative works.  These actions are | ||||
| prohibited by law if you do not accept this License.  Therefore, by | ||||
| modifying or distributing the Program (or any work based on the | ||||
| Program), you indicate your acceptance of this License to do so, and | ||||
| all its terms and conditions for copying, distributing or modifying | ||||
| the Program or works based on it. | ||||
| 
 | ||||
|   6. Each time you redistribute the Program (or any work based on the | ||||
| Program), the recipient automatically receives a license from the | ||||
| original licensor to copy, distribute or modify the Program subject to | ||||
| these terms and conditions.  You may not impose any further | ||||
| restrictions on the recipients' exercise of the rights granted herein. | ||||
| You are not responsible for enforcing compliance by third parties to | ||||
| this License. | ||||
| 
 | ||||
|   7. If, as a consequence of a court judgment or allegation of patent | ||||
| infringement or for any other reason (not limited to patent issues), | ||||
| conditions are imposed on you (whether by court order, agreement or | ||||
| otherwise) that contradict the conditions of this License, they do not | ||||
| excuse you from the conditions of this License.  If you cannot | ||||
| distribute so as to satisfy simultaneously your obligations under this | ||||
| License and any other pertinent obligations, then as a consequence you | ||||
| may not distribute the Program at all.  For example, if a patent | ||||
| license would not permit royalty-free redistribution of the Program by | ||||
| all those who receive copies directly or indirectly through you, then | ||||
| the only way you could satisfy both it and this License would be to | ||||
| refrain entirely from distribution of the Program. | ||||
| 
 | ||||
| If any portion of this section is held invalid or unenforceable under | ||||
| any particular circumstance, the balance of the section is intended to | ||||
| apply and the section as a whole is intended to apply in other | ||||
| circumstances. | ||||
| 
 | ||||
| It is not the purpose of this section to induce you to infringe any | ||||
| patents or other property right claims or to contest validity of any | ||||
| such claims; this section has the sole purpose of protecting the | ||||
| integrity of the free software distribution system, which is | ||||
| implemented by public license practices.  Many people have made | ||||
| generous contributions to the wide range of software distributed | ||||
| through that system in reliance on consistent application of that | ||||
| system; it is up to the author/donor to decide if he or she is willing | ||||
| to distribute software through any other system and a licensee cannot | ||||
| impose that choice. | ||||
| 
 | ||||
| This section is intended to make thoroughly clear what is believed to | ||||
| be a consequence of the rest of this License. | ||||
| 
 | ||||
|   8. If the distribution and/or use of the Program is restricted in | ||||
| certain countries either by patents or by copyrighted interfaces, the | ||||
| original copyright holder who places the Program under this License | ||||
| may add an explicit geographical distribution limitation excluding | ||||
| those countries, so that distribution is permitted only in or among | ||||
| countries not thus excluded.  In such case, this License incorporates | ||||
| the limitation as if written in the body of this License. | ||||
| 
 | ||||
|   9. The Free Software Foundation may publish revised and/or new versions | ||||
| of the General Public License from time to time.  Such new versions will | ||||
| be similar in spirit to the present version, but may differ in detail to | ||||
| address new problems or concerns. | ||||
| 
 | ||||
| Each version is given a distinguishing version number.  If the Program | ||||
| specifies a version number of this License which applies to it and "any | ||||
| later version", you have the option of following the terms and conditions | ||||
| either of that version or of any later version published by the Free | ||||
| Software Foundation.  If the Program does not specify a version number of | ||||
| this License, you may choose any version ever published by the Free Software | ||||
| Foundation. | ||||
| 
 | ||||
|   10. If you wish to incorporate parts of the Program into other free | ||||
| programs whose distribution conditions are different, write to the author | ||||
| to ask for permission.  For software which is copyrighted by the Free | ||||
| Software Foundation, write to the Free Software Foundation; we sometimes | ||||
| make exceptions for this.  Our decision will be guided by the two goals | ||||
| of preserving the free status of all derivatives of our free software and | ||||
| of promoting the sharing and reuse of software generally. | ||||
| 
 | ||||
|                             NO WARRANTY | ||||
| 
 | ||||
|   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY | ||||
| FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN | ||||
| OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES | ||||
| PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED | ||||
| OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF | ||||
| MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS | ||||
| TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE | ||||
| PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, | ||||
| REPAIR OR CORRECTION. | ||||
| 
 | ||||
|   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING | ||||
| WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR | ||||
| REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, | ||||
| INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING | ||||
| OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED | ||||
| TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY | ||||
| YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER | ||||
| PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE | ||||
| POSSIBILITY OF SUCH DAMAGES. | ||||
| 
 | ||||
|                      END OF TERMS AND CONDITIONS | ||||
| 
 | ||||
|             How to Apply These Terms to Your New Programs | ||||
| 
 | ||||
|   If you develop a new program, and you want it to be of the greatest | ||||
| possible use to the public, the best way to achieve this is to make it | ||||
| free software which everyone can redistribute and change under these terms. | ||||
| 
 | ||||
|   To do so, attach the following notices to the program.  It is safest | ||||
| to attach them to the start of each source file to most effectively | ||||
| convey the exclusion of warranty; and each file should have at least | ||||
| the "copyright" line and a pointer to where the full notice is found. | ||||
| 
 | ||||
|     {{description}} | ||||
|     Copyright (C) {{year}}  {{fullname}} | ||||
| 
 | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
| 
 | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
| 
 | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
| 
 | ||||
| Also add information on how to contact you by electronic and paper mail. | ||||
| 
 | ||||
| If the program is interactive, make it output a short notice like this | ||||
| when it starts in an interactive mode: | ||||
| 
 | ||||
|     Gnomovision version 69, Copyright (C) year name of author | ||||
|     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. | ||||
|     This is free software, and you are welcome to redistribute it | ||||
|     under certain conditions; type `show c' for details. | ||||
| 
 | ||||
| The hypothetical commands `show w' and `show c' should show the appropriate | ||||
| parts of the General Public License.  Of course, the commands you use may | ||||
| be called something other than `show w' and `show c'; they could even be | ||||
| mouse-clicks or menu items--whatever suits your program. | ||||
| 
 | ||||
| You should also get your employer (if you work as a programmer) or your | ||||
| school, if any, to sign a "copyright disclaimer" for the program, if | ||||
| necessary.  Here is a sample; alter the names: | ||||
| 
 | ||||
|   Yoyodyne, Inc., hereby disclaims all copyright interest in the program | ||||
|   `Gnomovision' (which makes passes at compilers) written by James Hacker. | ||||
| 
 | ||||
|   {signature of Ty Coon}, 1 April 1989 | ||||
|   Ty Coon, President of Vice | ||||
| 
 | ||||
| This General Public License does not permit incorporating your program into | ||||
| proprietary programs.  If your program is a subroutine library, you may | ||||
| consider it more useful to permit linking proprietary applications with the | ||||
| library.  If this is what you want to do, use the GNU Lesser General | ||||
| Public License instead of this License. | ||||
							
								
								
									
										2
									
								
								MANIFEST.in
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								MANIFEST.in
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,2 @@ | ||||
| recursive-include minyma/api *.py | ||||
| recursive-include minyma/templates * | ||||
							
								
								
									
										76
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										76
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,76 @@ | ||||
| # Usage | ||||
| 
 | ||||
| ## Running Server | ||||
| 
 | ||||
| ```bash | ||||
| # Locally | ||||
| minyma server run | ||||
| 
 | ||||
| # Docker Quick Start | ||||
| make docker_build_local | ||||
| docker run \ | ||||
|     -p 5000:5000 \ | ||||
|     -e OPENAI_API_KEY=`cat openai_key` \ | ||||
|     -e DATA_PATH=/data \ | ||||
|     -v ./data:/data \ | ||||
|     minyma:latest | ||||
| ``` | ||||
| 
 | ||||
| The server will now be accessible at `http://localhost:5000` | ||||
| 
 | ||||
| ## Normalizing & Loading Data | ||||
| 
 | ||||
| Minyma is designed to be extensible. You can add normalizers and vector databases | ||||
| using the appropriate interfaces defined in `./minyma/normalizer.py` and | ||||
| `./minyma/vdb.py`. At the moment the only supported database is `chroma` | ||||
| and the only supported normalizer is the `pubmed` normalizer. | ||||
| 
 | ||||
| To normalize data, you can use Minyma's `normalize` CLI command: | ||||
| 
 | ||||
| ```bash | ||||
| minyma normalize --filename ./pubmed_manuscripts.jsonl --normalizer pubmed --database chroma --datapath ./chroma | ||||
| ``` | ||||
| 
 | ||||
| The above example does the following: | ||||
| 
 | ||||
| - Uses the `pubmed` normalizer | ||||
| - Normalizes the `./pubmed_manuscripts.jsonl` raw dataset [0] | ||||
| - Loads the output into a `chroma` database and persists the data to the `./chroma` directory | ||||
| 
 | ||||
| **NOTE:** The above dataset took about an hour to normalize on my MacBook Pro (M2 Max). | ||||
| 
 | ||||
| [0] https://huggingface.co/datasets/TaylorAI/pubmed_author_manuscripts/tree/main | ||||
| 
 | ||||
| # Development | ||||
| 
 | ||||
| ```bash | ||||
| # Initiate | ||||
| python3 -m venv venv | ||||
| . ./venv/bin/activate | ||||
| 
 | ||||
| # Local Development | ||||
| pip install -e . | ||||
| 
 | ||||
| # Creds | ||||
| export OPENAI_API_KEY=`cat openai_key` | ||||
| ``` | ||||
| 
 | ||||
| # Datasets | ||||
| 
 | ||||
| https://huggingface.co/datasets/TaylorAI/pubmed_author_manuscripts/tree/main | ||||
| 
 | ||||
| # Notes | ||||
| 
 | ||||
| - https://docs.pinecone.io/docs/openai | ||||
| - https://docs.pinecone.io/docs/langchain | ||||
| - https://docs.pinecone.io/docs/langchain#creating-embeddings | ||||
| - https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb | ||||
| - https://medium.com/@abhishekranjandev/building-a-speech-recognition-app-with-deepspeech-word2vec-and-pinecone-1e5907d103e2 | ||||
| - https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5 | ||||
| - https://cookbook.openai.com/examples/semantic_text_search_using_embeddings | ||||
| 
 | ||||
| TODO: | ||||
| 
 | ||||
| - Build this with Word2Vec / Doc2Vec: https://docs.pinecone.io/docs/openai | ||||
| - https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py | ||||
| - https://webcache.googleusercontent.com/search?q=cache:https://medium.com/@rubentak/unleashing-the-power-of-intelligent-chatbots-with-gpt-4-and-vector-databases-a-step-by-step-8027e2ce9e78 | ||||
							
								
								
									
										73
									
								
								minyma/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										73
									
								
								minyma/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,73 @@ | ||||
| import click | ||||
| import signal | ||||
| import sys | ||||
| from importlib.metadata import version | ||||
| from minyma.config import Config | ||||
| from minyma.oai import OpenAIConnector | ||||
| from minyma.vdb import ChromaDB | ||||
| from flask import Flask | ||||
| from flask.cli import FlaskGroup | ||||
| 
 | ||||
| __version__ = version("minyma") | ||||
| 
 | ||||
def signal_handler(sig, frame):
    """Terminate the process with exit status 0.

    Has the ``(signum, frame)`` signature that ``signal.signal`` expects of a
    handler; neither argument is used.
    """
    raise SystemExit(0)
| 
 | ||||
| 
 | ||||
def create_app():
    """Application factory: build the Flask app and its shared connectors.

    Rebinds the module-level names ``cdb`` (a ``ChromaDB`` opened at
    ``Config.DATA_PATH``) and ``oai`` (an ``OpenAIConnector`` built from
    ``Config.OPENAI_API_KEY`` and that database), then registers the
    ``common`` and ``v1`` blueprints.

    Returns:
        The configured ``Flask`` application.
    """
    global oai, cdb

    # Imported inside the factory rather than at module top.
    # NOTE(review): presumably to avoid a circular import, since the API
    # modules import this package -- confirm.
    import minyma.api.common as api_common
    import minyma.api.v1 as api_v1

    cdb = ChromaDB(Config.DATA_PATH)
    oai = OpenAIConnector(Config.OPENAI_API_KEY, cdb)

    flask_app = Flask(__name__)
    for blueprint in (api_common.bp, api_v1.bp):
        flask_app.register_blueprint(blueprint)

    return flask_app
| 
 | ||||
| 
 | ||||
# Root command group. The docstring doubles as the text click prints for
# --help, so it is kept short and unchanged.
@click.group()
def cli():
    """Minyma CLI"""
    pass
| 
 | ||||
| 
 | ||||
# Sub-group of `cli` backed by FlaskGroup, which receives `create_app` as its
# application factory. The docstring is the group's --help text.
@cli.group(cls=FlaskGroup, create_app=create_app)
def server():
    """Minyma flask server"""
    pass
| 
 | ||||
| 
 | ||||
@cli.command()
@click.option('--filename', type=click.File('r'), required=True)
@click.option('--normalizer', help="pubmed", required=True)
@click.option('--database', help="chroma", required=True)
@click.option('--datapath', type=click.Path(), help="database datapath", required=False)
def normalize(filename, normalizer, database, datapath):
    """Minyma data normalizer & loader"""
    # NOTE: the docstring above is what click prints for --help, so the
    # details live in comments instead.
    #
    # filename   -- file object opened for reading by click.File('r')
    # normalizer -- normalizer name, matched case-insensitively ("pubmed")
    # database   -- vector database name, matched case-insensitively ("chroma")
    # datapath   -- persistence path handed to ChromaDB; required for "chroma"
    #
    # Invalid options raise click exceptions, so the message goes to stderr
    # and the process exits non-zero, rather than printing to stdout and
    # exiting 0.

    database = database.lower()
    normalizer = normalizer.lower()

    # Validate everything first so a bad --normalizer is rejected before
    # ChromaDB(datapath) is constructed.
    if database != "chroma":
        raise click.BadParameter(
            f"INVALID DATABASE: {database}", param_hint="--database"
        )
    if datapath is None:
        raise click.UsageError("INVALID DATAPATH")
    if normalizer != "pubmed":
        raise click.BadParameter(
            f"INVALID NORMALIZER: {normalizer}", param_hint="--normalizer"
        )

    # Imported lazily, as in the original, so it loads only when selected.
    from minyma.normalizer import PubMedNormalizer

    vdb = ChromaDB(datapath)
    norm = PubMedNormalizer(filename)

    # Process Data
    vdb.load_documents(norm)
| 
 | ||||
| 
 | ||||
| signal.signal(signal.SIGINT, signal_handler) | ||||
							
								
								
									
										0
									
								
								minyma/api/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								minyma/api/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										7
									
								
								minyma/api/common.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								minyma/api/common.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,7 @@ | ||||
| from flask import make_response, render_template, send_from_directory | ||||
| from flask import Blueprint | ||||
| bp = Blueprint("common", __name__) | ||||
| 
 | ||||
@bp.route("/", methods=["GET"])
def main_entry():
    """Serve the rendered ``index.html`` template for ``GET /``."""
    return make_response(render_template("index.html"))
							
								
								
									
										38
									
								
								minyma/api/v1.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								minyma/api/v1.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,38 @@ | ||||
| import minyma | ||||
| 
 | ||||
| from flask import Blueprint, request | ||||
| bp = Blueprint("v1", __name__, url_prefix="/api/v1") | ||||
| 
 | ||||
| """ | ||||
| Return OpenAI LLM final response with vector db embedding | ||||
| context | ||||
| """ | ||||
@bp.route("/query", methods=["POST"])
def get_response():
    """Answer a question via the OpenAI connector.

    Expects a JSON object body with a ``message`` field. On invalid input a
    JSON ``{"error": ...}`` body is returned with HTTP status 400; otherwise
    the result of ``minyma.oai.query`` is returned unchanged.
    """
    data = request.get_json()
    if not data or not isinstance(data, dict):
        return {"error": "Missing Message"}, 400

    # A missing or null message must be rejected here: stringifying it would
    # produce the literal "None" and send that to the model as a question.
    raw_message = data.get("message")
    if raw_message is None:
        return {"error": "Missing Message"}, 400

    message = str(raw_message)
    if not message.strip():
        return {"error": "Empty Message"}, 400

    return minyma.oai.query(message)
| 
 | ||||
| 
 | ||||
| """ | ||||
| Return the raw vector db related response | ||||
| """ | ||||
@bp.route("/related", methods=["POST"])
def get_related():
    """Return the raw vector db results related to a message.

    Expects a JSON object body with a ``message`` field. On invalid input a
    JSON ``{"error": ...}`` body is returned with HTTP status 400; otherwise
    the result of ``minyma.cdb.get_related`` is returned unchanged.
    """
    data = request.get_json()
    if not data or not isinstance(data, dict):
        return {"error": "Missing Message"}, 400

    # A missing or null message must be rejected here: stringifying it would
    # produce the literal "None" and run the search for that text.
    raw_message = data.get("message")
    if raw_message is None:
        return {"error": "Missing Message"}, 400

    message = str(raw_message)
    if not message.strip():
        return {"error": "Empty Message"}, 400

    return minyma.cdb.get_related(message)
							
								
								
									
										22
									
								
								minyma/config.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								minyma/config.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,22 @@ | ||||
| import os | ||||
| 
 | ||||
| 
 | ||||
def get_env(key, default=None, required=False) -> str:
    """Wrapper for gathering env vars.

    Parameters
    ----------
    key : str
        Name of the environment variable to read.
    default
        Value used when the variable is not set. It is converted with
        ``str()``, so an unset variable with no default yields the string
        ``"None"``.
    required : bool
        When true, the variable must be present in the environment; the
        default is not consulted to satisfy this check.

    Raises
    ------
    RuntimeError
        If ``required`` is true and the variable is not set.
    """
    # Explicit raise instead of `assert`: assertions are stripped when Python
    # runs with -O, which would silently disable the check.
    if required and key not in os.environ:
        raise RuntimeError(f"Missing Environment Variable: {key}")
    return str(os.environ.get(key, default))
| 
 | ||||
| 
 | ||||
class Config:
    """Wrap application configurations

    Values are read from the environment once, when the class body is
    evaluated.

    Attributes
    ----------
    DATA_PATH : str
        The path where to store any resources (default: ./data)
    CHROMA_DATA_PATH : str
        Value of the CHROMA_DATA_PATH environment variable
        (default: ./data/chroma)
    OPENAI_API_KEY : str
        Value of the OPENAI_API_KEY environment variable (required)
    """

    DATA_PATH: str = get_env("DATA_PATH", default="./data")
    CHROMA_DATA_PATH: str = get_env("CHROMA_DATA_PATH", default="./data/chroma")
    OPENAI_API_KEY: str = get_env("OPENAI_API_KEY", required=True)
							
								
								
									
										46
									
								
								minyma/normalizer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										46
									
								
								minyma/normalizer.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,46 @@ | ||||
| from io import TextIOWrapper | ||||
| import json | ||||
| 
 | ||||
class DataNormalizer:
    """Base interface for normalizers: built from an open text file and iterated.

    Both methods are placeholders here; PubMedNormalizer in this module
    overrides them.
    """

    def __init__(self, file: TextIOWrapper):
        # Placeholder: the base class does not keep a reference to `file`.
        pass

    def __iter__(self):
        # Placeholder: returns None, so the base class itself is not iterable.
        pass
| 
 | ||||
| # Iterator class that takes a file and iterates over each line. | ||||
| # Data is normalized inside the iterator | ||||
class PubMedNormalizer(DataNormalizer):
    """Iterate a file of one-JSON-object-per-line records, normalizing each.

    Every record's ``text`` field is lower-cased and reduced to what follows
    the second occurrence of "text mining", minus the remainder of the line
    that occurrence sits on. Records with fewer than two occurrences (or no
    ``text`` field) produce an empty document.
    """

    def __init__(self, file: TextIOWrapper):
        self.file = file

    def __iter__(self):
        """Yield ``{"doc": <normalized text>, "id": <0-based line number>}``."""
        for line_number, line in enumerate(self.file):
            # Blank lines (e.g. a trailing newline at EOF) carry no record and
            # would otherwise make json.loads raise.
            if not line.strip():
                continue

            # Load JSON
            record = json.loads(line, strict=False)

            # A missing or null "text" is treated as empty text rather than
            # crashing on None.lower().
            text = (record.get("text") or "").lower()

            # Using the second occurrence of "text mining" as a break
            # point. We only capture what follows. Plain string splitting
            # was chosen over regular expressions because it is
            # significantly faster.
            parts = text.split("text mining", 2)
            remainder = parts[2] if len(parts) == 3 else ""

            # Drop the rest of the line that contained the marker.
            remainder = remainder.partition("\n")[2]

            # ID = Line Number
            yield {"doc": remainder, "id": str(line_number)}
							
								
								
									
										44
									
								
								minyma/oai.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										44
									
								
								minyma/oai.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,44 @@ | ||||
| from typing import Any | ||||
| import openai | ||||
| 
 | ||||
| from minyma.vdb import VectorDB | ||||
| 
 | ||||
| # Stolen LangChain Prompt | ||||
| PROMPT_TEMPLATE = """ | ||||
| Use the following pieces of context to answer the question at the end.  | ||||
| If you don't know the answer, just say that you don't know, don't try to  | ||||
| make up an answer. | ||||
| 
 | ||||
| {context} | ||||
| 
 | ||||
| Question: {question} | ||||
| Helpful Answer: | ||||
| """ | ||||
| 
 | ||||
class OpenAIConnector:
    """Answers questions with OpenAI chat completions, using vector db context."""

    def __init__(self, api_key: str, vdb: VectorDB):
        # The key is set on the openai module itself, not on this instance.
        openai.api_key = api_key
        self.model = "gpt-3.5-turbo"
        self.vdb = vdb

    def query(self, question: str) -> Any:
        """Look up documents related to `question` and ask the model about them.

        Returns ``{"error": "No Context Found"}`` when the vector db yields no
        documents; otherwise returns the ChatCompletion response as-is.
        """
        # Related documents from the vector db form the prompt context
        documents = self.vdb.get_related(question).get("docs", [])
        if len(documents) == 0:
            return {"error": "No Context Found"}

        # One document per line inside the prompt template
        prompt = PROMPT_TEMPLATE.format(
            context="\n".join(documents),
            question=question,
        )
        messages = [{"role": "user", "content": prompt}]

        return openai.ChatCompletion.create(model=self.model, messages=messages)
							
								
								
									
										184
									
								
								minyma/templates/index.html
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										184
									
								
								minyma/templates/index.html
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,184 @@ | ||||
| <!DOCTYPE html> | ||||
| <html lang="en"> | ||||
|   <head> | ||||
|     <meta charset="utf-8" /> | ||||
|     <title>Minyma - Chat</title> | ||||
|     <script src="https://cdn.tailwindcss.com"></script> | ||||
|   </head> | ||||
|   <body class="bg-slate-900 h-screen p-5 flex flex-col justify-between"> | ||||
|     <header class="w-full"> | ||||
|       <svg | ||||
|         preserveAspectRatio="xMidYMid meet" | ||||
|         color-interpolation-filters="sRGB" | ||||
|         style="margin: auto" | ||||
|         height="80" | ||||
|         width="200" | ||||
|         viewBox="70 90 200 90" | ||||
|       > | ||||
|         <g | ||||
|           fill="#ebb919" | ||||
|           transform="translate(69.05000305175781,91.03400039672852)" | ||||
|         > | ||||
|           <g transform="translate(0,0)"> | ||||
|             <g transform="scale(1)"> | ||||
|               <g> | ||||
|                 <path | ||||
|                   d="M33.96-30.84L33.96-30.84Q36.48-30.84 38.37-29.88 40.26-28.92 41.46-27.24 42.66-25.56 43.26-23.34 43.86-21.12 43.86-18.54L43.86-18.54 43.86 0 36.66 0 36.66-18.54Q36.66-20.64 35.16-22.14L35.16-22.14Q33.72-23.64 31.56-23.64L31.56-23.64Q29.4-23.64 27.96-22.14L27.96-22.14Q26.46-20.64 26.46-18.54L26.46-18.54 26.46 0 19.26 0 19.26-18.54Q19.26-20.64 17.76-22.14L17.76-22.14Q17.04-22.92 16.11-23.28 15.18-23.64 14.16-23.64L14.16-23.64Q11.94-23.64 10.5-22.14L10.5-22.14Q9-20.64 9-18.54L9-18.54 9 0 1.8 0 1.8-30 9-30 9-27.36Q10.74-28.86 12.66-29.85 14.58-30.84 16.56-30.84L16.56-30.84Q19.26-30.84 21-29.76 22.74-28.68 24.12-26.76L24.12-26.76Q25.74-28.5 28.32-29.67 30.9-30.84 33.96-30.84ZM54.96 0L47.76 0 47.76-30 54.96-30 54.96 0ZM47.76-34.8L47.76-42 54.96-42 54.96-34.8 47.76-34.8ZM74.28-30.84L74.28-30.84Q77.22-30.84 79.62-29.73 82.02-28.62 83.73-26.67 85.44-24.72 86.37-22.14 87.3-19.56 87.3-16.62L87.3-16.62 87.3 0 80.1 0 80.1-16.62Q80.1-19.62 78-21.6L78-21.6Q75.96-23.64 73.08-23.64L73.08-23.64Q70.14-23.64 68.1-21.6L68.1-21.6Q66.06-19.56 66.06-16.62L66.06-16.62 66.06 0 58.86 0 58.86-30 66.06-30 66.06-27.72Q67.68-29.1 69.72-29.97 71.76-30.84 74.28-30.84ZM116.94-30L124.86-30 110.94 0 109.08 4.08Q107.4 7.74 104.04 9.9 100.68 12.06 96.6 12.06L96.6 12.06 93.42 12.06 95.22 4.86 96.96 4.86Q98.7 4.86 100.2 3.9 101.7 2.94 102.42 1.32L102.42 1.32 103.02 0 89.1-30 97.02-30 106.98-8.52 116.94-30ZM159.12-30.84L159.12-30.84Q161.64-30.84 163.53-29.88 165.42-28.92 166.62-27.24 167.82-25.56 168.42-23.34 169.02-21.12 169.02-18.54L169.02-18.54 169.02 0 161.82 0 161.82-18.54Q161.82-20.64 160.32-22.14L160.32-22.14Q158.88-23.64 156.72-23.64L156.72-23.64Q154.56-23.64 153.12-22.14L153.12-22.14Q151.62-20.64 151.62-18.54L151.62-18.54 151.62 0 144.42 0 144.42-18.54Q144.42-20.64 142.92-22.14L142.92-22.14Q142.2-22.92 141.27-23.28 140.34-23.64 139.32-23.64L139.32-23.64Q137.1-23.64 135.66-22.14L135.66-22.14Q134.16-20.64 134.16-18.54L134.16-18.54 134.16 0 126.96 0 126.96-30 134.16-30 
134.16-27.36Q135.9-28.86 137.82-29.85 139.74-30.84 141.72-30.84L141.72-30.84Q144.42-30.84 146.16-29.76 147.9-28.68 149.28-26.76L149.28-26.76Q150.9-28.5 153.48-29.67 156.06-30.84 159.12-30.84ZM196.5-30.06L203.7-30.06 203.7 0 196.5 0 196.5-15Q196.5-18.6 193.98-21.12L193.98-21.12Q191.46-23.64 187.86-23.64L187.86-23.64Q186.12-23.64 184.53-22.98 182.94-22.32 181.74-21.12L181.74-21.12Q179.22-18.6 179.22-15L179.22-15Q179.22-11.46 181.74-8.94L181.74-8.94Q182.94-7.68 184.53-7.05 186.12-6.42 187.86-6.42L187.86-6.42Q189.66-6.42 191.1-7.02L191.1-7.02 193.68-0.6Q190.92 0.78 187.26 0.78L187.26 0.78Q183.96 0.78 181.17-0.45 178.38-1.68 176.34-3.84 174.3-6 173.16-8.88 172.02-11.76 172.02-15L172.02-15Q172.02-18.3 173.16-21.18 174.3-24.06 176.34-26.22 178.38-28.38 181.17-29.61 183.96-30.84 187.26-30.84L187.26-30.84Q190.2-30.84 192.48-29.94 194.76-29.04 196.5-27.66L196.5-27.66 196.5-30.06Z" | ||||
|                   transform="translate(-1.7999999523162842, 42)" | ||||
|                 ></path> | ||||
|               </g> | ||||
|             </g> | ||||
|           </g> | ||||
|           <g fill="#ebb919" transform="translate(5,60.060001373291016)"> | ||||
|             <rect | ||||
|               x="0" | ||||
|               height="1" | ||||
|               y="3.434999942779541" | ||||
|               width="88.66999673843384" | ||||
|             ></rect> | ||||
|             <rect | ||||
|               height="1" | ||||
|               y="3.434999942779541" | ||||
|               width="88.66999673843384" | ||||
|               x="103.22999715805054" | ||||
|             ></rect> | ||||
|             <g transform="translate(91.66999673843384,0)"> | ||||
|               <g transform="scale(1)"> | ||||
|                 <path | ||||
|                   d="M4.43-3.20L2.06-3.20L2.44-4.40C2.58-4.84 2.72-5.28 2.84-5.72C2.97-6.15 3.10-6.60 3.22-7.06L3.26-7.06C3.39-6.60 3.52-6.15 3.65-5.72C3.78-5.28 3.91-4.84 4.06-4.40ZM4.68-2.40L5.42 0L6.49 0L3.83-7.87L2.70-7.87L0.04 0L1.06 0L1.81-2.40ZM7.61-7.87L7.61 0L8.60 0L8.60-7.87Z" | ||||
|                   transform="translate(-0.036000000000000004, 7.872)" | ||||
|                 ></path> | ||||
|               </g> | ||||
|             </g> | ||||
|           </g> | ||||
|         </g> | ||||
|       </svg> | ||||
|     </header> | ||||
|     <main | ||||
|       class="flex flex-col justify-between w-11/12 mx-auto bg-slate-700 text-gray-300 rounded p-2 gap-4 h-full" | ||||
|     > | ||||
|       <div | ||||
|         id="messages" | ||||
|         class="flex flex-col-reverse gap-2 p-2 h-full overflow-scroll" | ||||
|       ></div> | ||||
|       <div | ||||
|         contenteditable | ||||
|         class="w-full border-2 rounded p-1 border-slate-800 outline-none" | ||||
|       /> | ||||
|     </main> | ||||
|     <script> | ||||
|       const LOADING_SVG = `<svg | ||||
| 	width="24" | ||||
| 	height="24" | ||||
| 	viewBox="0 0 24 24" | ||||
| 	xmlns="http://www.w3.org/2000/svg" | ||||
| 	fill="currentColor" | ||||
|       > | ||||
| 	<style> | ||||
| 	  .spinner_qM83 { | ||||
| 	    animation: spinner_8HQG 1.05s infinite; | ||||
| 	  } | ||||
| 	  .spinner_oXPr { | ||||
| 	    animation-delay: 0.1s; | ||||
| 	  } | ||||
| 	  .spinner_ZTLf { | ||||
| 	    animation-delay: 0.2s; | ||||
| 	  } | ||||
| 	  @keyframes spinner_8HQG { | ||||
| 	    0%, | ||||
| 	    57.14% { | ||||
| 	      animation-timing-function: cubic-bezier(0.33, 0.66, 0.66, 1); | ||||
| 	      transform: translate(0); | ||||
| 	    } | ||||
| 	    28.57% { | ||||
| 	      animation-timing-function: cubic-bezier(0.33, 0, 0.66, 0.33); | ||||
| 	      transform: translateY(-6px); | ||||
| 	    } | ||||
| 	    100% { | ||||
| 	      transform: translate(0); | ||||
| 	    } | ||||
| 	  } | ||||
| 	</style> | ||||
| 	<circle class="spinner_qM83" cx="4" cy="12" r="3"></circle> | ||||
| 	<circle class="spinner_qM83 spinner_oXPr" cx="12" cy="12" r="3"></circle> | ||||
| 	<circle class="spinner_qM83 spinner_ZTLf" cx="20" cy="12" r="3"></circle> | ||||
|       </svg>`; | ||||
| 
 | ||||
|       /** | ||||
|        * Wrapper API Call | ||||
|        **/ | ||||
|       function apiCall(data) { | ||||
|         return fetch(data.url, { | ||||
|           method: data.method || "GET", | ||||
|           headers: { | ||||
|             "Content-Type": "application/json", | ||||
|           }, | ||||
|           body: JSON.stringify(data.data || {}), | ||||
|         }).then((resp) => resp.json()); | ||||
|       } | ||||
| 
 | ||||
|       function appendMessageElement(name, content) { | ||||
|         // Wrapping Element | ||||
|         let wrapEl = document.createElement("div"); | ||||
|         wrapEl.innerHTML = `<div class="flex"> | ||||
| 	   <span class="font-bold w-24 grow-0 shrink-0"></span> | ||||
| 	   <span class="whitespace-break-spaces w-full"></span> | ||||
| 	 </div>`; | ||||
| 
 | ||||
|         // Get Elements | ||||
|         let nameEl = wrapEl.querySelector("span"); | ||||
|         let contentEl = nameEl.nextElementSibling; | ||||
| 
 | ||||
|         // Prevent XSS | ||||
|         nameEl.innerText = name + ":"; | ||||
|         contentEl.innerText = content; | ||||
| 
 | ||||
|         // Add to DOM | ||||
|         let newEl = wrapEl.querySelector("div"); | ||||
|         document.querySelector("#messages").prepend(newEl); | ||||
| 
 | ||||
|         // Return References (Used in sendMessage) | ||||
|         return { name: nameEl, content: contentEl }; | ||||
|       } | ||||
| 
 | ||||
|       function sendMessage(message) { | ||||
|         // Set Loading | ||||
|         let { name, content } = appendMessageElement("Assistant", ""); | ||||
|         content.innerHTML = LOADING_SVG; | ||||
| 
 | ||||
|         // Request API | ||||
|         apiCall({ | ||||
|           url: "./api/v1/query", | ||||
|           method: "POST", | ||||
|           data: { message }, | ||||
|         }) | ||||
|           .then((data) => { | ||||
|             console.log("SUCCESS:", data); | ||||
|             content.innerText = data.choices[0].message.content; | ||||
|           }) | ||||
|           .catch((e) => { | ||||
|             console.log("ERROR:", e); | ||||
|             content.innerText = "[API ERROR]"; | ||||
|           }); | ||||
|       } | ||||
| 
 | ||||
|       function initListeners() { | ||||
|         let messageBox = document.querySelector("[contenteditable]"); | ||||
|         messageBox.addEventListener("keydown", (evt) => { | ||||
|           if (evt.keyCode != 13) return; | ||||
| 
 | ||||
|           // Send Message & Add to DOM | ||||
|           let textContent = evt.target.innerText; | ||||
|           appendMessageElement("User", textContent); | ||||
|           sendMessage(textContent); | ||||
| 
 | ||||
|           // Reset | ||||
|           evt.target.innerHTML = ""; | ||||
|           evt.preventDefault(); | ||||
|         }); | ||||
|       } | ||||
| 
 | ||||
|       initListeners(); | ||||
|     </script> | ||||
|   </body> | ||||
| </html> | ||||
							
								
								
									
										75
									
								
								minyma/vdb.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										75
									
								
								minyma/vdb.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,75 @@ | ||||
| from chromadb.api import API | ||||
| from itertools import islice | ||||
| from os import path | ||||
| from tqdm.auto import tqdm | ||||
| from typing import Any, cast | ||||
| import chromadb | ||||
| 
 | ||||
| from minyma.normalizer import DataNormalizer | ||||
| 
 | ||||
| """ | ||||
| Given an iterable, chunk it by `chunk_size` | ||||
| """ | ||||
def chunk(iterable, chunk_size: int):
    """Yield successive lists of at most `chunk_size` items from `iterable`.

    Iteration stops at the first empty batch, i.e. once the underlying
    iterator is exhausted.
    """
    source = iter(iterable)
    while True:
        batch = list(islice(source, chunk_size))
        if not batch:
            return
        yield batch
| 
 | ||||
| """ | ||||
| VectorDB Interface | ||||
| """ | ||||
class VectorDB:
    """Interface for vector database backends; ChromaDB below implements it."""

    def load_documents(self, normalizer: DataNormalizer):
        """Placeholder: load the documents produced by `normalizer`."""
        pass

    def get_related(self, question: str) -> Any:
        """Placeholder: look up stored documents related to `question`.

        The base implementation returns None.
        """
        pass
| 
 | ||||
| """ | ||||
| ChromaDB VectorDB Type | ||||
| """ | ||||
class ChromaDB(VectorDB):
    """VectorDB implementation backed by a persistent Chroma collection."""

    def __init__(self, base_path: str):
        self.word_limit = 1000
        self.collection_name: str = "vdb"

        # The store lives in a "chroma" directory under `base_path`
        self.client: API = chromadb.PersistentClient(
            path=path.join(base_path, "chroma")
        )
        self.collection: chromadb.Collection = self.client.create_collection(
            name=self.collection_name,
            get_or_create=True,
        )

    def get_related(self, question) -> Any:
        """Query the collection for the 2 closest results to `question`.

        Returns a dict with "distances", "docs" and "ids", each holding the
        list belonging to the single query text.
        """
        results = self.collection.query(query_texts=[question], n_results=2)

        def first_query(key: str) -> list:
            # Results are nested per query text; only one query was sent
            return cast(list, results.get(key, [[]]))[0]

        docs = first_query("documents")
        distances = first_query("distances")
        ids = first_query("ids")

        return {"distances": distances, "docs": docs, "ids": ids}

    def load_documents(self, normalizer: DataNormalizer):
        """Add every item produced by `normalizer` to the collection."""
        # Items are added in batches of 50
        for batch in tqdm(chunk(normalizer, 50)):
            # Limit words per document to accommodate context token limits
            documents = [
                " ".join(item.get("doc").split()[:self.word_limit])
                for item in batch
            ]
            ids = [item.get("id") for item in batch]

            # Ideally metadata would be parsed out of each document and
            # passed via the metadata kwarg. However, each document appears
            # to have a slightly different format, so it is difficult to
            # parse out.
            self.collection.add(documents=documents, ids=ids)
							
								
								
									
										25
									
								
								pyproject.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								pyproject.toml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,25 @@ | ||||
| [project] | ||||
| name = "minyma" | ||||
| version = "0.0.1" | ||||
| description = "AI Chat Bot with Vector DB Context" | ||||
| authors = [ | ||||
|   { name = "Evan Reichard", email = "evan@reichard.io" }, | ||||
| ] | ||||
| license = { file = "LICENSE" } | ||||
| readme = "README.md" | ||||
| requires-python = ">=3.11" | ||||
| dependencies = [ | ||||
|   "Flask>=3.0", | ||||
|   "openai==0.28.1", | ||||
|   "openai[datalib]==0.28.1", | ||||
|   "tqdm", | ||||
|   "chromadb", | ||||
|   "sqlite-utils", | ||||
|   "click" | ||||
| ] | ||||
| 
 | ||||
| [project.scripts] | ||||
| minyma = "minyma:cli" | ||||
| 
 | ||||
| [tool.setuptools.packages] | ||||
| find = {} | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user