Initial Commit
This commit is contained in:
commit
f862e1b8bb
6
.dockerignore
Normal file
6
.dockerignore
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
__pycache__
|
||||||
|
.direnv
|
||||||
|
data
|
||||||
|
venv
|
||||||
|
openai_key
|
||||||
|
minyma.egg-info/
|
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
__pycache__
|
||||||
|
.direnv
|
||||||
|
data
|
||||||
|
venv
|
||||||
|
openai_key
|
||||||
|
minyma.egg-info/
|
22
.pre-commit-config.yaml
Normal file
22
.pre-commit-config.yaml
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
repos:
|
||||||
|
- repo: https://github.com/psf/black
|
||||||
|
rev: 23.9.1
|
||||||
|
hooks:
|
||||||
|
- id: black
|
||||||
|
name: black
|
||||||
|
language_version: python3.10
|
||||||
|
files: "^minyma/|^setup.py|^tests/minyma/"
|
||||||
|
- repo: https://github.com/pycqa/flake8
|
||||||
|
rev: 6.1.0
|
||||||
|
hooks:
|
||||||
|
- id: flake8
|
||||||
|
name: flake8
|
||||||
|
args: ["--config=.flake8"]
|
||||||
|
files: "^minyma/|^setup.py|^tests/minyma/"
|
||||||
|
- repo: https://github.com/pycqa/isort
|
||||||
|
rev: 5.12.0
|
||||||
|
hooks:
|
||||||
|
- id: isort
|
||||||
|
name: isort
|
||||||
|
args: ["--profile", "black", "--filter-files"]
|
||||||
|
files: "^minyma/|^setup.py|^tests/minyma/"
|
26
Dockerfile
Normal file
26
Dockerfile
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
# Build Container
FROM python:3.11-slim

# Install App
WORKDIR /app
COPY . /app

# Install Curl
# "update" and "install" run in a single layer so a cached package index is
# never paired with a later install; the index is removed in the same layer
# because it is not needed once curl is installed.
RUN apt-get update -y \
    && apt-get install -y curl \
    && rm -rf /var/lib/apt/lists/*

# Install Chroma Dependencies
RUN mkdir -p /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/
# --fail makes curl exit non-zero on an HTTP error so the build stops instead
# of saving the error response body as onnx.tar.gz.
RUN curl --fail https://chroma-onnx-models.s3.amazonaws.com/all-MiniLM-L6-v2/onnx.tar.gz --output /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz

# Install App & Gunicorn
RUN pip install .
RUN pip3 install gunicorn

# Cleanup
RUN rm -rf /app

# Start Application
ENTRYPOINT ["gunicorn"]
EXPOSE 5000
CMD ["minyma:create_app()", "--bind", "0.0.0.0:5000", "--threads=4", "--access-logfile", "-"]
|
339
LICENSE
Normal file
339
LICENSE
Normal file
@ -0,0 +1,339 @@
|
|||||||
|
GNU GENERAL PUBLIC LICENSE
|
||||||
|
Version 2, June 1991
|
||||||
|
|
||||||
|
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
Everyone is permitted to copy and distribute verbatim copies
|
||||||
|
of this license document, but changing it is not allowed.
|
||||||
|
|
||||||
|
Preamble
|
||||||
|
|
||||||
|
The licenses for most software are designed to take away your
|
||||||
|
freedom to share and change it. By contrast, the GNU General Public
|
||||||
|
License is intended to guarantee your freedom to share and change free
|
||||||
|
software--to make sure the software is free for all its users. This
|
||||||
|
General Public License applies to most of the Free Software
|
||||||
|
Foundation's software and to any other program whose authors commit to
|
||||||
|
using it. (Some other Free Software Foundation software is covered by
|
||||||
|
the GNU Lesser General Public License instead.) You can apply it to
|
||||||
|
your programs, too.
|
||||||
|
|
||||||
|
When we speak of free software, we are referring to freedom, not
|
||||||
|
price. Our General Public Licenses are designed to make sure that you
|
||||||
|
have the freedom to distribute copies of free software (and charge for
|
||||||
|
this service if you wish), that you receive source code or can get it
|
||||||
|
if you want it, that you can change the software or use pieces of it
|
||||||
|
in new free programs; and that you know you can do these things.
|
||||||
|
|
||||||
|
To protect your rights, we need to make restrictions that forbid
|
||||||
|
anyone to deny you these rights or to ask you to surrender the rights.
|
||||||
|
These restrictions translate to certain responsibilities for you if you
|
||||||
|
distribute copies of the software, or if you modify it.
|
||||||
|
|
||||||
|
For example, if you distribute copies of such a program, whether
|
||||||
|
gratis or for a fee, you must give the recipients all the rights that
|
||||||
|
you have. You must make sure that they, too, receive or can get the
|
||||||
|
source code. And you must show them these terms so they know their
|
||||||
|
rights.
|
||||||
|
|
||||||
|
We protect your rights with two steps: (1) copyright the software, and
|
||||||
|
(2) offer you this license which gives you legal permission to copy,
|
||||||
|
distribute and/or modify the software.
|
||||||
|
|
||||||
|
Also, for each author's protection and ours, we want to make certain
|
||||||
|
that everyone understands that there is no warranty for this free
|
||||||
|
software. If the software is modified by someone else and passed on, we
|
||||||
|
want its recipients to know that what they have is not the original, so
|
||||||
|
that any problems introduced by others will not reflect on the original
|
||||||
|
authors' reputations.
|
||||||
|
|
||||||
|
Finally, any free program is threatened constantly by software
|
||||||
|
patents. We wish to avoid the danger that redistributors of a free
|
||||||
|
program will individually obtain patent licenses, in effect making the
|
||||||
|
program proprietary. To prevent this, we have made it clear that any
|
||||||
|
patent must be licensed for everyone's free use or not licensed at all.
|
||||||
|
|
||||||
|
The precise terms and conditions for copying, distribution and
|
||||||
|
modification follow.
|
||||||
|
|
||||||
|
GNU GENERAL PUBLIC LICENSE
|
||||||
|
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||||
|
|
||||||
|
0. This License applies to any program or other work which contains
|
||||||
|
a notice placed by the copyright holder saying it may be distributed
|
||||||
|
under the terms of this General Public License. The "Program", below,
|
||||||
|
refers to any such program or work, and a "work based on the Program"
|
||||||
|
means either the Program or any derivative work under copyright law:
|
||||||
|
that is to say, a work containing the Program or a portion of it,
|
||||||
|
either verbatim or with modifications and/or translated into another
|
||||||
|
language. (Hereinafter, translation is included without limitation in
|
||||||
|
the term "modification".) Each licensee is addressed as "you".
|
||||||
|
|
||||||
|
Activities other than copying, distribution and modification are not
|
||||||
|
covered by this License; they are outside its scope. The act of
|
||||||
|
running the Program is not restricted, and the output from the Program
|
||||||
|
is covered only if its contents constitute a work based on the
|
||||||
|
Program (independent of having been made by running the Program).
|
||||||
|
Whether that is true depends on what the Program does.
|
||||||
|
|
||||||
|
1. You may copy and distribute verbatim copies of the Program's
|
||||||
|
source code as you receive it, in any medium, provided that you
|
||||||
|
conspicuously and appropriately publish on each copy an appropriate
|
||||||
|
copyright notice and disclaimer of warranty; keep intact all the
|
||||||
|
notices that refer to this License and to the absence of any warranty;
|
||||||
|
and give any other recipients of the Program a copy of this License
|
||||||
|
along with the Program.
|
||||||
|
|
||||||
|
You may charge a fee for the physical act of transferring a copy, and
|
||||||
|
you may at your option offer warranty protection in exchange for a fee.
|
||||||
|
|
||||||
|
2. You may modify your copy or copies of the Program or any portion
|
||||||
|
of it, thus forming a work based on the Program, and copy and
|
||||||
|
distribute such modifications or work under the terms of Section 1
|
||||||
|
above, provided that you also meet all of these conditions:
|
||||||
|
|
||||||
|
a) You must cause the modified files to carry prominent notices
|
||||||
|
stating that you changed the files and the date of any change.
|
||||||
|
|
||||||
|
b) You must cause any work that you distribute or publish, that in
|
||||||
|
whole or in part contains or is derived from the Program or any
|
||||||
|
part thereof, to be licensed as a whole at no charge to all third
|
||||||
|
parties under the terms of this License.
|
||||||
|
|
||||||
|
c) If the modified program normally reads commands interactively
|
||||||
|
when run, you must cause it, when started running for such
|
||||||
|
interactive use in the most ordinary way, to print or display an
|
||||||
|
announcement including an appropriate copyright notice and a
|
||||||
|
notice that there is no warranty (or else, saying that you provide
|
||||||
|
a warranty) and that users may redistribute the program under
|
||||||
|
these conditions, and telling the user how to view a copy of this
|
||||||
|
License. (Exception: if the Program itself is interactive but
|
||||||
|
does not normally print such an announcement, your work based on
|
||||||
|
the Program is not required to print an announcement.)
|
||||||
|
|
||||||
|
These requirements apply to the modified work as a whole. If
|
||||||
|
identifiable sections of that work are not derived from the Program,
|
||||||
|
and can be reasonably considered independent and separate works in
|
||||||
|
themselves, then this License, and its terms, do not apply to those
|
||||||
|
sections when you distribute them as separate works. But when you
|
||||||
|
distribute the same sections as part of a whole which is a work based
|
||||||
|
on the Program, the distribution of the whole must be on the terms of
|
||||||
|
this License, whose permissions for other licensees extend to the
|
||||||
|
entire whole, and thus to each and every part regardless of who wrote it.
|
||||||
|
|
||||||
|
Thus, it is not the intent of this section to claim rights or contest
|
||||||
|
your rights to work written entirely by you; rather, the intent is to
|
||||||
|
exercise the right to control the distribution of derivative or
|
||||||
|
collective works based on the Program.
|
||||||
|
|
||||||
|
In addition, mere aggregation of another work not based on the Program
|
||||||
|
with the Program (or with a work based on the Program) on a volume of
|
||||||
|
a storage or distribution medium does not bring the other work under
|
||||||
|
the scope of this License.
|
||||||
|
|
||||||
|
3. You may copy and distribute the Program (or a work based on it,
|
||||||
|
under Section 2) in object code or executable form under the terms of
|
||||||
|
Sections 1 and 2 above provided that you also do one of the following:
|
||||||
|
|
||||||
|
a) Accompany it with the complete corresponding machine-readable
|
||||||
|
source code, which must be distributed under the terms of Sections
|
||||||
|
1 and 2 above on a medium customarily used for software interchange; or,
|
||||||
|
|
||||||
|
b) Accompany it with a written offer, valid for at least three
|
||||||
|
years, to give any third party, for a charge no more than your
|
||||||
|
cost of physically performing source distribution, a complete
|
||||||
|
machine-readable copy of the corresponding source code, to be
|
||||||
|
distributed under the terms of Sections 1 and 2 above on a medium
|
||||||
|
customarily used for software interchange; or,
|
||||||
|
|
||||||
|
c) Accompany it with the information you received as to the offer
|
||||||
|
to distribute corresponding source code. (This alternative is
|
||||||
|
allowed only for noncommercial distribution and only if you
|
||||||
|
received the program in object code or executable form with such
|
||||||
|
an offer, in accord with Subsection b above.)
|
||||||
|
|
||||||
|
The source code for a work means the preferred form of the work for
|
||||||
|
making modifications to it. For an executable work, complete source
|
||||||
|
code means all the source code for all modules it contains, plus any
|
||||||
|
associated interface definition files, plus the scripts used to
|
||||||
|
control compilation and installation of the executable. However, as a
|
||||||
|
special exception, the source code distributed need not include
|
||||||
|
anything that is normally distributed (in either source or binary
|
||||||
|
form) with the major components (compiler, kernel, and so on) of the
|
||||||
|
operating system on which the executable runs, unless that component
|
||||||
|
itself accompanies the executable.
|
||||||
|
|
||||||
|
If distribution of executable or object code is made by offering
|
||||||
|
access to copy from a designated place, then offering equivalent
|
||||||
|
access to copy the source code from the same place counts as
|
||||||
|
distribution of the source code, even though third parties are not
|
||||||
|
compelled to copy the source along with the object code.
|
||||||
|
|
||||||
|
4. You may not copy, modify, sublicense, or distribute the Program
|
||||||
|
except as expressly provided under this License. Any attempt
|
||||||
|
otherwise to copy, modify, sublicense or distribute the Program is
|
||||||
|
void, and will automatically terminate your rights under this License.
|
||||||
|
However, parties who have received copies, or rights, from you under
|
||||||
|
this License will not have their licenses terminated so long as such
|
||||||
|
parties remain in full compliance.
|
||||||
|
|
||||||
|
5. You are not required to accept this License, since you have not
|
||||||
|
signed it. However, nothing else grants you permission to modify or
|
||||||
|
distribute the Program or its derivative works. These actions are
|
||||||
|
prohibited by law if you do not accept this License. Therefore, by
|
||||||
|
modifying or distributing the Program (or any work based on the
|
||||||
|
Program), you indicate your acceptance of this License to do so, and
|
||||||
|
all its terms and conditions for copying, distributing or modifying
|
||||||
|
the Program or works based on it.
|
||||||
|
|
||||||
|
6. Each time you redistribute the Program (or any work based on the
|
||||||
|
Program), the recipient automatically receives a license from the
|
||||||
|
original licensor to copy, distribute or modify the Program subject to
|
||||||
|
these terms and conditions. You may not impose any further
|
||||||
|
restrictions on the recipients' exercise of the rights granted herein.
|
||||||
|
You are not responsible for enforcing compliance by third parties to
|
||||||
|
this License.
|
||||||
|
|
||||||
|
7. If, as a consequence of a court judgment or allegation of patent
|
||||||
|
infringement or for any other reason (not limited to patent issues),
|
||||||
|
conditions are imposed on you (whether by court order, agreement or
|
||||||
|
otherwise) that contradict the conditions of this License, they do not
|
||||||
|
excuse you from the conditions of this License. If you cannot
|
||||||
|
distribute so as to satisfy simultaneously your obligations under this
|
||||||
|
License and any other pertinent obligations, then as a consequence you
|
||||||
|
may not distribute the Program at all. For example, if a patent
|
||||||
|
license would not permit royalty-free redistribution of the Program by
|
||||||
|
all those who receive copies directly or indirectly through you, then
|
||||||
|
the only way you could satisfy both it and this License would be to
|
||||||
|
refrain entirely from distribution of the Program.
|
||||||
|
|
||||||
|
If any portion of this section is held invalid or unenforceable under
|
||||||
|
any particular circumstance, the balance of the section is intended to
|
||||||
|
apply and the section as a whole is intended to apply in other
|
||||||
|
circumstances.
|
||||||
|
|
||||||
|
It is not the purpose of this section to induce you to infringe any
|
||||||
|
patents or other property right claims or to contest validity of any
|
||||||
|
such claims; this section has the sole purpose of protecting the
|
||||||
|
integrity of the free software distribution system, which is
|
||||||
|
implemented by public license practices. Many people have made
|
||||||
|
generous contributions to the wide range of software distributed
|
||||||
|
through that system in reliance on consistent application of that
|
||||||
|
system; it is up to the author/donor to decide if he or she is willing
|
||||||
|
to distribute software through any other system and a licensee cannot
|
||||||
|
impose that choice.
|
||||||
|
|
||||||
|
This section is intended to make thoroughly clear what is believed to
|
||||||
|
be a consequence of the rest of this License.
|
||||||
|
|
||||||
|
8. If the distribution and/or use of the Program is restricted in
|
||||||
|
certain countries either by patents or by copyrighted interfaces, the
|
||||||
|
original copyright holder who places the Program under this License
|
||||||
|
may add an explicit geographical distribution limitation excluding
|
||||||
|
those countries, so that distribution is permitted only in or among
|
||||||
|
countries not thus excluded. In such case, this License incorporates
|
||||||
|
the limitation as if written in the body of this License.
|
||||||
|
|
||||||
|
9. The Free Software Foundation may publish revised and/or new versions
|
||||||
|
of the General Public License from time to time. Such new versions will
|
||||||
|
be similar in spirit to the present version, but may differ in detail to
|
||||||
|
address new problems or concerns.
|
||||||
|
|
||||||
|
Each version is given a distinguishing version number. If the Program
|
||||||
|
specifies a version number of this License which applies to it and "any
|
||||||
|
later version", you have the option of following the terms and conditions
|
||||||
|
either of that version or of any later version published by the Free
|
||||||
|
Software Foundation. If the Program does not specify a version number of
|
||||||
|
this License, you may choose any version ever published by the Free Software
|
||||||
|
Foundation.
|
||||||
|
|
||||||
|
10. If you wish to incorporate parts of the Program into other free
|
||||||
|
programs whose distribution conditions are different, write to the author
|
||||||
|
to ask for permission. For software which is copyrighted by the Free
|
||||||
|
Software Foundation, write to the Free Software Foundation; we sometimes
|
||||||
|
make exceptions for this. Our decision will be guided by the two goals
|
||||||
|
of preserving the free status of all derivatives of our free software and
|
||||||
|
of promoting the sharing and reuse of software generally.
|
||||||
|
|
||||||
|
NO WARRANTY
|
||||||
|
|
||||||
|
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
||||||
|
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
||||||
|
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
|
||||||
|
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
|
||||||
|
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||||
|
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
|
||||||
|
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
|
||||||
|
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
|
||||||
|
REPAIR OR CORRECTION.
|
||||||
|
|
||||||
|
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||||
|
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
|
||||||
|
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
|
||||||
|
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
|
||||||
|
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
|
||||||
|
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
|
||||||
|
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
|
||||||
|
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
|
||||||
|
POSSIBILITY OF SUCH DAMAGES.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
How to Apply These Terms to Your New Programs
|
||||||
|
|
||||||
|
If you develop a new program, and you want it to be of the greatest
|
||||||
|
possible use to the public, the best way to achieve this is to make it
|
||||||
|
free software which everyone can redistribute and change under these terms.
|
||||||
|
|
||||||
|
To do so, attach the following notices to the program. It is safest
|
||||||
|
to attach them to the start of each source file to most effectively
|
||||||
|
convey the exclusion of warranty; and each file should have at least
|
||||||
|
the "copyright" line and a pointer to where the full notice is found.
|
||||||
|
|
||||||
|
{{description}}
|
||||||
|
Copyright (C) {{year}} {{fullname}}
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
Also add information on how to contact you by electronic and paper mail.
|
||||||
|
|
||||||
|
If the program is interactive, make it output a short notice like this
|
||||||
|
when it starts in an interactive mode:
|
||||||
|
|
||||||
|
Gnomovision version 69, Copyright (C) year name of author
|
||||||
|
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||||
|
This is free software, and you are welcome to redistribute it
|
||||||
|
under certain conditions; type `show c' for details.
|
||||||
|
|
||||||
|
The hypothetical commands `show w' and `show c' should show the appropriate
|
||||||
|
parts of the General Public License. Of course, the commands you use may
|
||||||
|
be called something other than `show w' and `show c'; they could even be
|
||||||
|
mouse-clicks or menu items--whatever suits your program.
|
||||||
|
|
||||||
|
You should also get your employer (if you work as a programmer) or your
|
||||||
|
school, if any, to sign a "copyright disclaimer" for the program, if
|
||||||
|
necessary. Here is a sample; alter the names:
|
||||||
|
|
||||||
|
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
|
||||||
|
`Gnomovision' (which makes passes at compilers) written by James Hacker.
|
||||||
|
|
||||||
|
{signature of Ty Coon}, 1 April 1989
|
||||||
|
Ty Coon, President of Vice
|
||||||
|
|
||||||
|
This General Public License does not permit incorporating your program into
|
||||||
|
proprietary programs. If your program is a subroutine library, you may
|
||||||
|
consider it more useful to permit linking proprietary applications with the
|
||||||
|
library. If this is what you want to do, use the GNU Lesser General
|
||||||
|
Public License instead of this License.
|
2
MANIFEST.in
Normal file
2
MANIFEST.in
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
recursive-include minyma/api *.py
|
||||||
|
recursive-include minyma/templates *
|
76
README.md
Normal file
76
README.md
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
# Usage
|
||||||
|
|
||||||
|
## Running Server
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Locally
|
||||||
|
minyma server run
|
||||||
|
|
||||||
|
# Docker Quick Start
|
||||||
|
make docker_build_local
|
||||||
|
docker run \
|
||||||
|
-p 5000:5000 \
|
||||||
|
-e OPENAI_API_KEY=`cat openai_key` \
|
||||||
|
-e DATA_PATH=/data \
|
||||||
|
-v ./data:/data \
|
||||||
|
minyma:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
The server will now be accessible at `http://localhost:5000`
|
||||||
|
|
||||||
|
## Normalizing & Loading Data
|
||||||
|
|
||||||
|
Minyma is designed to be extensible. You can add normalizers and vector databases
|
||||||
|
using the appropriate interfaces defined in `./minyma/normalizer.py` and
|
||||||
|
`./minyma/vdb.py`. At the moment the only supported database is `chroma`
|
||||||
|
and the only supported normalizer is the `pubmed` normalizer.
|
||||||
|
|
||||||
|
To normalize data, you can use Minyma's `normalize` CLI command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
minyma normalize --filename ./pubmed_manuscripts.jsonl --normalizer pubmed --database chroma --datapath ./chroma
|
||||||
|
```
|
||||||
|
|
||||||
|
The above example does the following:
|
||||||
|
|
||||||
|
- Uses the `pubmed` normalizer
|
||||||
|
- Normalizes the `./pubmed_manuscripts.jsonl` raw dataset [0]
|
||||||
|
- Loads the output into a `chroma` database and persists the data to the `./chroma` directory
|
||||||
|
|
||||||
|
**NOTE:** The above dataset took about an hour to normalize on my MacBook Pro (M2 Max)
|
||||||
|
|
||||||
|
[0] https://huggingface.co/datasets/TaylorAI/pubmed_author_manuscripts/tree/main
|
||||||
|
|
||||||
|
# Development
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Initiate
|
||||||
|
python3 -m venv venv
|
||||||
|
. ./venv/bin/activate
|
||||||
|
|
||||||
|
# Local Development
|
||||||
|
pip install -e .
|
||||||
|
|
||||||
|
# Creds
|
||||||
|
export OPENAI_API_KEY=`cat openai_key`
|
||||||
|
```
|
||||||
|
|
||||||
|
# Datasets
|
||||||
|
|
||||||
|
https://huggingface.co/datasets/TaylorAI/pubmed_author_manuscripts/tree/main
|
||||||
|
|
||||||
|
# Notes
|
||||||
|
|
||||||
|
- https://docs.pinecone.io/docs/openai
|
||||||
|
- https://docs.pinecone.io/docs/langchain
|
||||||
|
- https://docs.pinecone.io/docs/langchain#creating-embeddings
|
||||||
|
- https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
|
||||||
|
- https://medium.com/@abhishekranjandev/building-a-speech-recognition-app-with-deepspeech-word2vec-and-pinecone-1e5907d103e2
|
||||||
|
- https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5
|
||||||
|
- https://cookbook.openai.com/examples/semantic_text_search_using_embeddings
|
||||||
|
|
||||||
|
TODO:
|
||||||
|
|
||||||
|
- Build this with Word2Vec / Doc2Vec: https://docs.pinecone.io/docs/openai
|
||||||
|
- https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py
|
||||||
|
- https://webcache.googleusercontent.com/search?q=cache:https://medium.com/@rubentak/unleashing-the-power-of-intelligent-chatbots-with-gpt-4-and-vector-databases-a-step-by-step-8027e2ce9e78
|
73
minyma/__init__.py
Normal file
73
minyma/__init__.py
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
import click
|
||||||
|
import signal
|
||||||
|
import sys
|
||||||
|
from importlib.metadata import version
|
||||||
|
from minyma.config import Config
|
||||||
|
from minyma.oai import OpenAIConnector
|
||||||
|
from minyma.vdb import ChromaDB
|
||||||
|
from flask import Flask
|
||||||
|
from flask.cli import FlaskGroup
|
||||||
|
|
||||||
|
# Package version, looked up from the installed distribution metadata for
# "minyma" via importlib.metadata.version.
__version__ = version("minyma")
|
||||||
|
|
||||||
|
def signal_handler(sig, frame):
    """Terminate the process with exit status 0.

    Takes the two positional arguments a signal handler is called with
    (signal number and current stack frame); neither is inspected.
    """
    raise SystemExit(0)
|
||||||
|
|
||||||
|
|
||||||
|
def create_app():
    """Build the Flask application and the module-level connectors.

    Side effects: rebinds the module globals ``cdb`` (a ``ChromaDB``
    constructed with ``Config.DATA_PATH``) and ``oai`` (an
    ``OpenAIConnector`` constructed with ``Config.OPENAI_API_KEY`` and that
    database).

    Returns the Flask app with the ``common`` and ``v1`` API blueprints
    registered, in that order.
    """
    global oai, cdb

    # Imported inside the function rather than at module level.
    # NOTE(review): presumably to avoid a circular import, since
    # minyma.api.v1 itself imports this package -- confirm.
    import minyma.api.common as api_common
    import minyma.api.v1 as api_v1

    app = Flask(__name__)
    cdb = ChromaDB(Config.DATA_PATH)
    oai = OpenAIConnector(Config.OPENAI_API_KEY, cdb)

    for blueprint in (api_common.bp, api_v1.bp):
        app.register_blueprint(blueprint)

    return app
|
||||||
|
|
||||||
|
|
||||||
|
# Root click command group. The "server" group and the "normalize" command
# attach to it through @cli.group / @cli.command. The docstring is kept
# short on purpose: click displays it as the command's help text.
@click.group()
def cli():
    """Minyma CLI"""
|
||||||
|
|
||||||
|
|
||||||
|
# "server" sub-group of the root CLI. It is a FlaskGroup bound to create_app,
# so Flask's own commands operate on the application that factory returns.
@cli.group(cls=FlaskGroup, create_app=create_app)
def server():
    """Minyma flask server"""
|
||||||
|
|
||||||
|
|
||||||
|
# CLI command: normalize a raw dataset file and load it into a vector database.
#   --filename    input file, opened for reading by click
#   --normalizer  normalizer name (case-insensitive); only "pubmed" is accepted
#   --database    database name (case-insensitive); only "chroma" is accepted
#   --datapath    where the database lives; mandatory for "chroma"
# Invalid input raises click.ClickException, so the message is reported as an
# error and the process exits non-zero instead of printing and exiting 0.
# The docstring is left short because click displays it as the help text.
@cli.command()
@click.option("--filename", type=click.File("r"), required=True)
@click.option("--normalizer", help="pubmed", required=True)
@click.option("--database", help="chroma", required=True)
@click.option(
    "--datapath", type=click.Path(), help="database datapath", required=False
)
def normalize(filename, normalizer, database, datapath):
    """Minyma data normalizer & loader"""

    database = database.lower()
    normalizer = normalizer.lower()

    # Validate Database
    if database != "chroma":
        raise click.ClickException(f"INVALID DATABASE: {database}")
    if datapath is None:
        raise click.ClickException("INVALID DATAPATH")

    # Validate Normalizer. Done before the database object is constructed so
    # nothing is created for a request that cannot be completed.
    if normalizer != "pubmed":
        raise click.ClickException(f"INVALID NORMALIZER: {normalizer}")

    # Imported lazily, only once the normalizer is known to be needed.
    from minyma.normalizer import PubMedNormalizer

    vdb = ChromaDB(datapath)
    norm = PubMedNormalizer(filename)

    # Process Data
    vdb.load_documents(norm)
|
||||||
|
|
||||||
|
|
||||||
|
# Installed at import time: SIGINT (Ctrl-C) is routed to signal_handler,
# which exits the process with status 0.
signal.signal(signal.SIGINT, signal_handler)
|
0
minyma/api/__init__.py
Normal file
0
minyma/api/__init__.py
Normal file
7
minyma/api/common.py
Normal file
7
minyma/api/common.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
from flask import make_response, render_template, send_from_directory
|
||||||
|
from flask import Blueprint
|
||||||
|
# Blueprint named "common"; it has no URL prefix, so its routes are mounted
# at the application root.
bp = Blueprint("common", __name__)
|
||||||
|
|
||||||
|
@bp.route("/", methods=["GET"])
def main_entry():
    """Serve the rendered ``index.html`` template for GET requests to ``/``."""
    page = render_template("index.html")
    return make_response(page)
|
38
minyma/api/v1.py
Normal file
38
minyma/api/v1.py
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
import minyma
|
||||||
|
|
||||||
|
from flask import Blueprint, request
|
||||||
|
# Blueprint named "v1"; every route registered on it is served under /api/v1.
bp = Blueprint("v1", __name__, url_prefix="/api/v1")
|
||||||
|
|
||||||
|
"""
|
||||||
|
Return OpenAI LLM final response with vector db embedding
|
||||||
|
context
|
||||||
|
"""
|
||||||
|
@bp.route("/query", methods=["POST"])
def get_response():
    """Answer a question through the OpenAI connector.

    Expects a JSON object body with a ``message`` field.

    Returns ``{"error": "Missing Message"}`` when the body is empty, is not
    a JSON object, or has no ``message``; ``{"error": "Empty Message"}``
    when the message is an empty string; otherwise whatever
    ``minyma.oai.query`` returns for the message.
    """
    data = request.get_json()
    if not isinstance(data, dict) or not data:
        return {"error": "Missing Message"}

    # Test for None before converting: str(None) is the non-empty string
    # "None", which would pass the empty check and be sent as the query.
    message = data.get("message")
    if message is None:
        return {"error": "Missing Message"}

    message = str(message)
    if message == "":
        return {"error": "Empty Message"}

    return minyma.oai.query(message)
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Return the raw vector db related response
|
||||||
|
"""
|
||||||
|
@bp.route("/related", methods=["POST"])
def get_related():
    """Return the raw vector db results related to a message.

    Expects a JSON object body with a ``message`` field.

    Returns ``{"error": "Missing Message"}`` when the body is empty, is not
    a JSON object, or has no ``message``; ``{"error": "Empty Message"}``
    when the message is an empty string; otherwise whatever
    ``minyma.cdb.get_related`` returns for the message.
    """
    data = request.get_json()
    if not isinstance(data, dict) or not data:
        return {"error": "Missing Message"}

    # Test for None before converting: str(None) is the non-empty string
    # "None", which would pass the empty check and be used as the search text.
    message = data.get("message")
    if message is None:
        return {"error": "Missing Message"}

    message = str(message)
    if message == "":
        return {"error": "Empty Message"}

    return minyma.cdb.get_related(message)
|
22
minyma/config.py
Normal file
22
minyma/config.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
def get_env(key, default=None, required=False) -> str:
    """Wrapper for gathering env vars.

    Parameters
    ----------
    key
        Name of the environment variable to read.
    default
        Value used when the variable is not set.
    required
        When true, a missing variable raises instead of using ``default``.

    Returns
    -------
    str
        The value passed through ``str()``. Note that an unset variable with
        ``default=None`` therefore yields the literal string ``"None"``.

    Raises
    ------
    AssertionError
        If ``required`` is true and ``key`` is not in the environment.
    """
    if required and key not in os.environ:
        # Raised explicitly rather than with an `assert` statement: asserts
        # are removed when Python runs with -O, which would silently skip
        # the check. AssertionError is kept so the exception type callers
        # see is unchanged.
        raise AssertionError("Missing Environment Variable: %s" % key)
    return str(os.environ.get(key, default))
|
||||||
|
|
||||||
|
|
||||||
|
class Config:
    """Wrap application configurations

    All values are read from the environment when this class body executes,
    i.e. when the module is first imported.

    Attributes
    ----------
    DATA_PATH : str
        The path where to store any resources (default: ./data)
    CHROMA_DATA_PATH : str
        Path read from the CHROMA_DATA_PATH variable (default: ./data/chroma)
    OPENAI_API_KEY : str
        Read with required=True, so the variable must be set; there is no
        default.
    """

    DATA_PATH: str = get_env("DATA_PATH", default="./data")
    CHROMA_DATA_PATH: str = get_env("CHROMA_DATA_PATH", default="./data/chroma")
    OPENAI_API_KEY: str = get_env("OPENAI_API_KEY", required=True)
|
46
minyma/normalizer.py
Normal file
46
minyma/normalizer.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
from io import TextIOWrapper
|
||||||
|
import json
|
||||||
|
|
||||||
|
class DataNormalizer:
    """Base type for normalizers that wrap a text file.

    Neither method does any work here; subclasses supply the behaviour.
    """

    def __init__(self, file: TextIOWrapper):
        """Accept the file handle; this base implementation ignores it."""
        return None

    def __iter__(self):
        """Placeholder hook; this base implementation returns None."""
        return None
|
||||||
|
|
||||||
|
# Iterator class that takes a file and iterates over each line.
# Data is normalized inside the iterator
class PubMedNormalizer(DataNormalizer):
    """Normalize a file holding one JSON object per line.

    Iterating yields one ``{"doc": ..., "id": ...}`` dict per non-blank line:
    "doc" is the lower-cased "text" field with everything up to the second
    occurrence of "text mining" (and the rest of that line) removed, and "id"
    is the zero-based line number as a string.
    """

    # Marker whose second occurrence separates the preamble from the body.
    _BREAK_POINT = "text mining"

    def __init__(self, file: TextIOWrapper):
        """Keep a reference to the open file that will be iterated."""
        self.file = file

    def __iter__(self):
        """Yield the normalized document for each non-blank line of the file."""
        for line_number, line in enumerate(self.file):
            # A blank line is not a JSON document; skip it instead of
            # failing inside json.loads.
            if not line.strip():
                continue

            # Load JSON
            record = json.loads(line, strict=False)

            # A record without "text" is treated as empty text rather than
            # crashing on None.lower().
            norm_text = (record.get("text") or "").lower()

            # Using the second occurrence of "text mining" as a break
            # point. We only capture what follows. Initially tried
            # using regular expressions, but this is significantly
            # faster. Splitting at most twice leaves the remainder intact
            # as the third part; with fewer than two occurrences the
            # document is empty.
            parts = norm_text.split(self._BREAK_POINT, 2)
            norm_text = parts[2] if len(parts) == 3 else ""

            # Drop whatever is left of the line the marker sat on
            norm_text = norm_text.partition("\n")[2]

            # ID = Line Number
            yield {"doc": norm_text, "id": str(line_number)}
|
44
minyma/oai.py
Normal file
44
minyma/oai.py
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
from typing import Any
|
||||||
|
import openai
|
||||||
|
|
||||||
|
from minyma.vdb import VectorDB
|
||||||
|
|
||||||
|
# Stolen LangChain Prompt
|
||||||
|
PROMPT_TEMPLATE = """
|
||||||
|
Use the following pieces of context to answer the question at the end.
|
||||||
|
If you don't know the answer, just say that you don't know, don't try to
|
||||||
|
make up an answer.
|
||||||
|
|
||||||
|
{context}
|
||||||
|
|
||||||
|
Question: {question}
|
||||||
|
Helpful Answer:
|
||||||
|
"""
|
||||||
|
|
||||||
|
class OpenAIConnector:
    """Answer questions with an OpenAI chat model, using vector db context."""

    def __init__(self, api_key: str, vdb: VectorDB):
        """Set the module-level OpenAI key, the model name and the vector db."""
        openai.api_key = api_key
        self.model = "gpt-3.5-turbo"
        self.vdb = vdb

    def query(self, question: str) -> Any:
        """Ask the chat model ``question`` with related documents as context.

        Returns ``{"error": "No Context Found"}`` when the vector db yields no
        documents, otherwise the response object of the chat completion call.
        """
        # Related documents come from the vector db
        documents = self.vdb.get_related(question).get("docs", [])

        # Without context there is nothing to put in the prompt
        if len(documents) == 0:
            return {"error": "No Context Found"}

        # One document per line, substituted into the prompt template
        prompt = PROMPT_TEMPLATE.format(
            context="\n".join(documents),
            question=question,
        )

        # Single user message; the response is handed back untouched
        return openai.ChatCompletion.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
        )
|
184
minyma/templates/index.html
Normal file
184
minyma/templates/index.html
Normal file
@ -0,0 +1,184 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<title>Minyma - Chat</title>
|
||||||
|
<script src="https://cdn.tailwindcss.com"></script>
|
||||||
|
</head>
|
||||||
|
<body class="bg-slate-900 h-screen p-5 flex flex-col justify-between">
|
||||||
|
<header class="w-full">
|
||||||
|
<svg
|
||||||
|
preserveAspectRatio="xMidYMid meet"
|
||||||
|
color-interpolation-filters="sRGB"
|
||||||
|
style="margin: auto"
|
||||||
|
height="80"
|
||||||
|
width="200"
|
||||||
|
viewBox="70 90 200 90"
|
||||||
|
>
|
||||||
|
<g
|
||||||
|
fill="#ebb919"
|
||||||
|
transform="translate(69.05000305175781,91.03400039672852)"
|
||||||
|
>
|
||||||
|
<g transform="translate(0,0)">
|
||||||
|
<g transform="scale(1)">
|
||||||
|
<g>
|
||||||
|
<path
|
||||||
|
d="M33.96-30.84L33.96-30.84Q36.48-30.84 38.37-29.88 40.26-28.92 41.46-27.24 42.66-25.56 43.26-23.34 43.86-21.12 43.86-18.54L43.86-18.54 43.86 0 36.66 0 36.66-18.54Q36.66-20.64 35.16-22.14L35.16-22.14Q33.72-23.64 31.56-23.64L31.56-23.64Q29.4-23.64 27.96-22.14L27.96-22.14Q26.46-20.64 26.46-18.54L26.46-18.54 26.46 0 19.26 0 19.26-18.54Q19.26-20.64 17.76-22.14L17.76-22.14Q17.04-22.92 16.11-23.28 15.18-23.64 14.16-23.64L14.16-23.64Q11.94-23.64 10.5-22.14L10.5-22.14Q9-20.64 9-18.54L9-18.54 9 0 1.8 0 1.8-30 9-30 9-27.36Q10.74-28.86 12.66-29.85 14.58-30.84 16.56-30.84L16.56-30.84Q19.26-30.84 21-29.76 22.74-28.68 24.12-26.76L24.12-26.76Q25.74-28.5 28.32-29.67 30.9-30.84 33.96-30.84ZM54.96 0L47.76 0 47.76-30 54.96-30 54.96 0ZM47.76-34.8L47.76-42 54.96-42 54.96-34.8 47.76-34.8ZM74.28-30.84L74.28-30.84Q77.22-30.84 79.62-29.73 82.02-28.62 83.73-26.67 85.44-24.72 86.37-22.14 87.3-19.56 87.3-16.62L87.3-16.62 87.3 0 80.1 0 80.1-16.62Q80.1-19.62 78-21.6L78-21.6Q75.96-23.64 73.08-23.64L73.08-23.64Q70.14-23.64 68.1-21.6L68.1-21.6Q66.06-19.56 66.06-16.62L66.06-16.62 66.06 0 58.86 0 58.86-30 66.06-30 66.06-27.72Q67.68-29.1 69.72-29.97 71.76-30.84 74.28-30.84ZM116.94-30L124.86-30 110.94 0 109.08 4.08Q107.4 7.74 104.04 9.9 100.68 12.06 96.6 12.06L96.6 12.06 93.42 12.06 95.22 4.86 96.96 4.86Q98.7 4.86 100.2 3.9 101.7 2.94 102.42 1.32L102.42 1.32 103.02 0 89.1-30 97.02-30 106.98-8.52 116.94-30ZM159.12-30.84L159.12-30.84Q161.64-30.84 163.53-29.88 165.42-28.92 166.62-27.24 167.82-25.56 168.42-23.34 169.02-21.12 169.02-18.54L169.02-18.54 169.02 0 161.82 0 161.82-18.54Q161.82-20.64 160.32-22.14L160.32-22.14Q158.88-23.64 156.72-23.64L156.72-23.64Q154.56-23.64 153.12-22.14L153.12-22.14Q151.62-20.64 151.62-18.54L151.62-18.54 151.62 0 144.42 0 144.42-18.54Q144.42-20.64 142.92-22.14L142.92-22.14Q142.2-22.92 141.27-23.28 140.34-23.64 139.32-23.64L139.32-23.64Q137.1-23.64 135.66-22.14L135.66-22.14Q134.16-20.64 134.16-18.54L134.16-18.54 134.16 0 126.96 0 126.96-30 134.16-30 134.16-27.36Q135.9-28.86 
137.82-29.85 139.74-30.84 141.72-30.84L141.72-30.84Q144.42-30.84 146.16-29.76 147.9-28.68 149.28-26.76L149.28-26.76Q150.9-28.5 153.48-29.67 156.06-30.84 159.12-30.84ZM196.5-30.06L203.7-30.06 203.7 0 196.5 0 196.5-15Q196.5-18.6 193.98-21.12L193.98-21.12Q191.46-23.64 187.86-23.64L187.86-23.64Q186.12-23.64 184.53-22.98 182.94-22.32 181.74-21.12L181.74-21.12Q179.22-18.6 179.22-15L179.22-15Q179.22-11.46 181.74-8.94L181.74-8.94Q182.94-7.68 184.53-7.05 186.12-6.42 187.86-6.42L187.86-6.42Q189.66-6.42 191.1-7.02L191.1-7.02 193.68-0.6Q190.92 0.78 187.26 0.78L187.26 0.78Q183.96 0.78 181.17-0.45 178.38-1.68 176.34-3.84 174.3-6 173.16-8.88 172.02-11.76 172.02-15L172.02-15Q172.02-18.3 173.16-21.18 174.3-24.06 176.34-26.22 178.38-28.38 181.17-29.61 183.96-30.84 187.26-30.84L187.26-30.84Q190.2-30.84 192.48-29.94 194.76-29.04 196.5-27.66L196.5-27.66 196.5-30.06Z"
|
||||||
|
transform="translate(-1.7999999523162842, 42)"
|
||||||
|
></path>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
<g fill="#ebb919" transform="translate(5,60.060001373291016)">
|
||||||
|
<rect
|
||||||
|
x="0"
|
||||||
|
height="1"
|
||||||
|
y="3.434999942779541"
|
||||||
|
width="88.66999673843384"
|
||||||
|
></rect>
|
||||||
|
<rect
|
||||||
|
height="1"
|
||||||
|
y="3.434999942779541"
|
||||||
|
width="88.66999673843384"
|
||||||
|
x="103.22999715805054"
|
||||||
|
></rect>
|
||||||
|
<g transform="translate(91.66999673843384,0)">
|
||||||
|
<g transform="scale(1)">
|
||||||
|
<path
|
||||||
|
d="M4.43-3.20L2.06-3.20L2.44-4.40C2.58-4.84 2.72-5.28 2.84-5.72C2.97-6.15 3.10-6.60 3.22-7.06L3.26-7.06C3.39-6.60 3.52-6.15 3.65-5.72C3.78-5.28 3.91-4.84 4.06-4.40ZM4.68-2.40L5.42 0L6.49 0L3.83-7.87L2.70-7.87L0.04 0L1.06 0L1.81-2.40ZM7.61-7.87L7.61 0L8.60 0L8.60-7.87Z"
|
||||||
|
transform="translate(-0.036000000000000004, 7.872)"
|
||||||
|
></path>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
</svg>
|
||||||
|
</header>
|
||||||
|
<main
|
||||||
|
class="flex flex-col justify-between w-11/12 mx-auto bg-slate-700 text-gray-300 rounded p-2 gap-4 h-full"
|
||||||
|
>
|
||||||
|
<div
|
||||||
|
id="messages"
|
||||||
|
class="flex flex-col-reverse gap-2 p-2 h-full overflow-scroll"
|
||||||
|
></div>
|
||||||
|
<div
|
||||||
|
contenteditable
|
||||||
|
class="w-full border-2 rounded p-1 border-slate-800 outline-none"
|
||||||
|
/>
|
||||||
|
</main>
|
||||||
|
<script>
|
||||||
|
const LOADING_SVG = `<svg
|
||||||
|
width="24"
|
||||||
|
height="24"
|
||||||
|
viewBox="0 0 24 24"
|
||||||
|
xmlns="http://www.w3.org/2000/svg"
|
||||||
|
fill="currentColor"
|
||||||
|
>
|
||||||
|
<style>
|
||||||
|
.spinner_qM83 {
|
||||||
|
animation: spinner_8HQG 1.05s infinite;
|
||||||
|
}
|
||||||
|
.spinner_oXPr {
|
||||||
|
animation-delay: 0.1s;
|
||||||
|
}
|
||||||
|
.spinner_ZTLf {
|
||||||
|
animation-delay: 0.2s;
|
||||||
|
}
|
||||||
|
@keyframes spinner_8HQG {
|
||||||
|
0%,
|
||||||
|
57.14% {
|
||||||
|
animation-timing-function: cubic-bezier(0.33, 0.66, 0.66, 1);
|
||||||
|
transform: translate(0);
|
||||||
|
}
|
||||||
|
28.57% {
|
||||||
|
animation-timing-function: cubic-bezier(0.33, 0, 0.66, 0.33);
|
||||||
|
transform: translateY(-6px);
|
||||||
|
}
|
||||||
|
100% {
|
||||||
|
transform: translate(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
<circle class="spinner_qM83" cx="4" cy="12" r="3"></circle>
|
||||||
|
<circle class="spinner_qM83 spinner_oXPr" cx="12" cy="12" r="3"></circle>
|
||||||
|
<circle class="spinner_qM83 spinner_ZTLf" cx="20" cy="12" r="3"></circle>
|
||||||
|
</svg>`;
|
||||||
|
|
||||||
|
/**
 * Wrapper API Call
 *
 * Sends a JSON request with fetch and resolves with the parsed JSON body.
 *
 * @param {Object} data
 * @param {string} data.url - Request URL.
 * @param {string} [data.method="GET"] - HTTP method.
 * @param {Object} [data.data={}] - Payload, JSON-encoded into the body.
 * @returns {Promise<any>} Parsed JSON response.
 */
function apiCall(data) {
  const method = (data.method || "GET").toUpperCase();
  const options = {
    method,
    headers: {
      "Content-Type": "application/json",
    },
  };

  // fetch() throws a TypeError when a GET or HEAD request carries a body,
  // so the payload is attached for every other method only.
  if (method !== "GET" && method !== "HEAD") {
    options.body = JSON.stringify(data.data || {});
  }

  return fetch(data.url, options).then((resp) => resp.json());
}
|
||||||
|
|
||||||
|
/**
 * Add a "name: content" row to the top of the #messages element.
 *
 * @param {string} name - Speaker label; a colon is appended.
 * @param {string} content - Message text.
 * @returns {{name: Element, content: Element}} The two spans of the new row
 *   (used in sendMessage).
 */
function appendMessageElement(name, content) {
  // Scratch container, used only to parse the row markup
  const scratch = document.createElement("div");
  scratch.innerHTML = `<div class="flex">
    <span class="font-bold w-24 grow-0 shrink-0"></span>
    <span class="whitespace-break-spaces w-full"></span>
  </div>`;

  // The row and its two spans: label first, text second
  const rowEl = scratch.firstElementChild;
  const [nameEl, contentEl] = rowEl.querySelectorAll("span");

  // Assigned as text, never as markup: Prevent XSS
  nameEl.innerText = name + ":";
  contentEl.innerText = content;

  // Add to DOM
  document.querySelector("#messages").prepend(rowEl);

  return { name: nameEl, content: contentEl };
}
|
||||||
|
|
||||||
|
/**
 * Post a message to the query API and show the assistant's reply.
 *
 * An "Assistant" row with a loading spinner is added right away and its
 * content is replaced once the request settles.
 *
 * @param {string} message - Text to send.
 */
function sendMessage(message) {
  // Set Loading
  const { content } = appendMessageElement("Assistant", "");
  content.innerHTML = LOADING_SVG;

  // Request API
  apiCall({
    url: "./api/v1/query",
    method: "POST",
    data: { message },
  })
    .then((data) => {
      console.log("SUCCESS:", data);

      // Failures arrive as `{ error: "..." }` in an ordinary JSON body;
      // show the reason instead of tripping over the missing `choices`.
      if (data && data.error) {
        content.innerText = "[API ERROR] " + data.error;
        return;
      }

      content.innerText = data.choices[0].message.content;
    })
    .catch((e) => {
      console.log("ERROR:", e);
      content.innerText = "[API ERROR]";
    });
}
|
||||||
|
|
||||||
|
/**
 * Wire the contenteditable message box: pressing Enter submits its text.
 */
function initListeners() {
  const messageBox = document.querySelector("[contenteditable]");
  messageBox.addEventListener("keydown", (evt) => {
    // `key` replaces the deprecated `keyCode`. An Enter that only confirms
    // an IME composition must not submit.
    if (evt.key !== "Enter" || evt.isComposing) return;

    // Keep Enter from inserting a line break into the box
    evt.preventDefault();

    // Read, then reset
    const textContent = evt.target.innerText;
    evt.target.innerHTML = "";

    // Nothing to send; avoids a pointless request and an empty "User" row
    if (textContent.trim() === "") return;

    // Send Message & Add to DOM
    appendMessageElement("User", textContent);
    sendMessage(textContent);
  });
}

initListeners();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
75
minyma/vdb.py
Normal file
75
minyma/vdb.py
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
from chromadb.api import API
|
||||||
|
from itertools import islice
|
||||||
|
from os import path
|
||||||
|
from tqdm.auto import tqdm
|
||||||
|
from typing import Any, cast
|
||||||
|
import chromadb
|
||||||
|
|
||||||
|
from minyma.normalizer import DataNormalizer
|
||||||
|
|
||||||
|
"""
|
||||||
|
Given an iterable, chunk it by `chunk_size`
|
||||||
|
"""
|
||||||
|
def chunk(iterable, chunk_size: int):
    """Yield successive lists of up to `chunk_size` items from `iterable`.

    The last list may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    while True:
        batch = list(islice(source, chunk_size))
        # An empty batch means the source is exhausted
        if not batch:
            return
        yield batch
|
||||||
|
|
||||||
|
"""
|
||||||
|
VectorDB Interface
|
||||||
|
"""
|
||||||
|
class VectorDB:
    """Interface shared by the vector database backends.

    Both methods are placeholders that return None; subclasses override them.
    """

    def load_documents(self, normalizer: DataNormalizer):
        """Load the documents produced by `normalizer` (no-op here)."""
        return None

    def get_related(self, question: str) -> Any:
        """Look up documents related to `question` (no-op here)."""
        return None
|
||||||
|
|
||||||
|
"""
|
||||||
|
ChromaDB VectorDB Type
|
||||||
|
"""
|
||||||
|
class ChromaDB(VectorDB):
    """VectorDB backed by a persistent Chroma collection named "vdb"."""

    def __init__(self, base_path: str):
        """Open the client under `base_path`/chroma and get or create "vdb"."""
        self.client: API = chromadb.PersistentClient(
            path=path.join(base_path, "chroma")
        )
        self.word_limit = 1000
        self.collection_name: str = "vdb"
        self.collection: chromadb.Collection = self.client.create_collection(
            name=self.collection_name,
            get_or_create=True,
        )

    def get_related(self, question) -> Any:
        """Query the collection for the two results closest to `question`.

        Returns a dict with the keys "distances", "docs" and "ids", each
        holding the first row of the matching field of the query result.
        """
        results = self.collection.query(n_results=2, query_texts=[question])

        def first_row(field: str) -> list:
            # Only one query text is sent, so only the first row is used
            return cast(list, results.get(field, [[]]))[0]

        documents = first_row("documents")
        distances = first_row("distances")
        ids = first_row("ids")

        return {"distances": distances, "docs": documents, "ids": ids}

    def load_documents(self, normalizer: DataNormalizer):
        """Add every document yielded by `normalizer` to the collection."""
        # 50 Item Chunking
        for batch in tqdm(chunk(normalizer, 50)):
            # Limit words per document to accommodate context token limits
            documents = [
                " ".join(entry.get("doc").split()[: self.word_limit])
                for entry in batch
            ]
            ids = [entry.get("id") for entry in batch]

            # Ideally we parse out metadata from each document
            # and pass to the metadata kwarg. However, each
            # document appears to have a slightly different format,
            # so it's difficult to parse out.
            self.collection.add(ids=ids, documents=documents)
|
25
pyproject.toml
Normal file
25
pyproject.toml
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
[project]
|
||||||
|
name = "minyma"
|
||||||
|
version = "0.0.1"
|
||||||
|
description = "AI Chat Bot with Vector DB Context"
|
||||||
|
authors = [
|
||||||
|
{ name = "Evan Reichard", email = "evan@reichard.io" },
|
||||||
|
]
|
||||||
|
license = { file = "LICENSE" }
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.11"
|
||||||
|
dependencies = [
|
||||||
|
"Flask>=3.0",
|
||||||
|
"openai==0.28.1",
|
||||||
|
"openai[datalib]==0.28.1",
|
||||||
|
"tqdm",
|
||||||
|
"chromadb",
|
||||||
|
"sqlite-utils",
|
||||||
|
"click"
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
minyma = "minyma:cli"
|
||||||
|
|
||||||
|
[tool.setuptools.packages]
|
||||||
|
find = {}
|
Loading…
Reference in New Issue
Block a user