Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updates for deployment #69

Merged
merged 3 commits into from
Jun 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ embeddings:
cd embeddings && \
docker build --network=host -t ghcr.io/defenseunicorns/leapfrogai/embeddings:0.0.1 .

whisper:
cd models/whisper && \
docker build --network=host -t ghcr.io/defenseunicorns/leapfrogai/whisper:0.0.1 .

whisper-push:
docker push ghcr.io/defenseunicorns/leapfrogai/whisper:0.0.1

# This thing is massive, so directly pushing to the zarf registry is quicker/easier
zarf-push-api:
Expand Down
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
3. [Features](#features)
4. [Getting Started](#getting-started)
5. [Usage](#usage)
6. [Contributing](#contributing)
7. [License](LICENSE)

## Project Goal <a name="project-goal"></a>
Expand Down
6 changes: 5 additions & 1 deletion chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ domain: bigbang.dev

istio:
enabled: false
gateway: public
gateway: tenant


monitoring:
enabled: false

models:
stablelm3b:
Expand Down
122 changes: 72 additions & 50 deletions notebooks/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,70 +12,70 @@
openai.api_key = 'Free the models'

# Point to leapfrogai
openai.api_base = "https://leapfrogai.leapfrogai.bigbang.dev"
openai.api_base = "https://leapfrogai.dd.bigbang.dev"
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(openai_api_key="foobar",
openai_api_base="https://leapfrogai.leapfrogai.bigbang.dev",
openai_api_base="https://leapfrogai.dd.bigbang.dev",
model="text-embedding-ada-002")

print(print(openai.Model.list()))
print(openai.Model.list())


from langchain.vectorstores import Weaviate
import weaviate

client = weaviate.Client(url="https://weaviate.leapfrogai.bigbang.dev",
client = weaviate.Client(url="https://weaviate.dd.bigbang.dev",
additional_headers={
'X-OpenAI-Api-Key': "foobar"
})
# client.schema.get()
# client.get_meta()

# schema = {
# "classes": [
# {
# "class": "Paragraph",
# "description": "A written paragraph",
# "vectorizer": "text2vec-transformers",
# "moduleConfig": {
# "text2vec-openai": {
# "model": "ada",
# "modelVersion": "002",
# "type": "text"
# }
# },
# "properties": [
# {
# "dataType": ["text"],
# "description": "The content of the paragraph",
# "moduleConfig": {
# "text2vec-transformers": {
# "skip": False,
# "vectorizePropertyName": False
# }
# },
# "name": "content",
# },
# {
# "dataType": ["text"],
# "description": "The source of the paragraph",
# "moduleConfig": {
# "text2vec-transformers": {
# "skip": False,
# "vectorizePropertyName": False
# }
# },
# "name": "source",
# },
# ],
# },
# ]
# }

# client.schema.create(schema)

vectordb = Weaviate(client, "Paragraph", "content", embedding=embeddings)
schema = {
"classes": [
{
"class": "Company",
"description": "A written paragraph",
"vectorizer": "text2vec-transformers",
"moduleConfig": {
"text2vec-openai": {
"model": "ada",
"modelVersion": "002",
"type": "text"
}
},
"properties": [
{
"dataType": ["text"],
"description": "The content of the paragraph",
"moduleConfig": {
"text2vec-transformers": {
"skip": False,
"vectorizePropertyName": False
}
},
"name": "content",
},
{
"dataType": ["text"],
"description": "The source of the paragraph",
"moduleConfig": {
"text2vec-transformers": {
"skip": False,
"vectorizePropertyName": False
}
},
"name": "source",
},
],
},
]
}

client.schema.create(schema)

vectordb = Weaviate(client, "Company", "content", embedding=embeddings)

from langchain.document_loaders import UnstructuredMarkdownLoader, UnstructuredPDFLoader, UnstructuredHTMLLoader, UnstructuredFileLoader
from langchain.document_loaders import PyPDFLoader, CSVLoader, Docx2txtLoader, UnstructuredPowerPointLoader
Expand All @@ -86,6 +86,20 @@

import os

def clean_string(text):
    """Repair letter-spaced text by dropping single spaces between tokens.

    Input such as ``"H e l l o  W o r l d"`` (one space between letters, a wider
    gap between words) becomes ``"Hello World"``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each interior single space removed and every run of two
        or more spaces reduced to exactly one space. Leading or trailing
        spaces are kept, collapsed to a single space. An empty string is
        returned unchanged.
    """
    # ''.split(' ') yields [''], which the join below would turn into ' '.
    if not text:
        return text

    # Splitting on a single space turns every extra space in a run into an
    # empty-string token.
    tokens = text.split(' ')

    # Empty tokens become spaces again; real tokens are glued back together,
    # which is what removes the isolated single spaces.
    cleaned_text = ''.join(' ' if token == '' else token for token in tokens)

    # A run of k >= 3 spaces leaves k - 1 spaces behind. The previous
    # `replace(" ", " ")` swapped a space for itself and collapsed nothing,
    # so repeat a real double-to-single replacement until no run remains.
    while '  ' in cleaned_text:
        cleaned_text = cleaned_text.replace('  ', ' ')
    return cleaned_text
def percentage_of_char(input_string, char):
    """Return how much of ``input_string`` consists of ``char``, as a percentage.

    Args:
        input_string: The string to inspect.
        char: The character (or substring) to count; occurrences are counted
            with ``str.count``, i.e. non-overlapping.

    Returns:
        ``count / len(input_string) * 100`` as a float, or ``0.0`` when
        ``input_string`` is empty.
    """
    total_chars = len(input_string)
    # An empty string previously raised ZeroDivisionError; it contains no
    # occurrences of anything, so report 0%.
    if total_chars == 0:
        return 0.0
    count_char = input_string.count(char)
    return (count_char / total_chars) * 100

def load_file(file_path) -> List[Document]:
_, file_extension = os.path.splitext(file_path)
data: List[Document]
Expand All @@ -111,13 +125,21 @@ def load_file(file_path) -> List[Document]:
# Perform action for other files or skip
return UnstructuredFileLoader(file_path).load()

def process_file(file_path, chunk_size=400, chunk_overlap=200):
def process_file(file_path, chunk_size=1000, chunk_overlap=400):
# text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
try:
data = load_file(file_path=file_path)
texts = text_splitter.split_documents(data)

for t in texts:
if percentage_of_char(t.page_content, ' ') > 25:
print("REPLACING: ")
print(t.page_content)
print("WITH:")
clean = clean_string(t.page_content)
print(clean)
print("---------")
t.page_content = clean
contents = [d.page_content for d in texts]
metadatas = [d.metadata for d in texts]
vectordb.add_texts(
Expand Down
38 changes: 35 additions & 3 deletions values.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,40 @@
domain: bigbang.dev
#https://github.com/weaviate/weaviate-helm/blob/1e8492e03e337b500d5f6fd9a055e2f1ad146398/weaviate/values.yaml

image:
# registry where weaviate image is stored
registry: ghcr.io
# Tag of weaviate image to deploy
# Note: We strongly recommend you overwrite this value in your own values.yaml.
# Otherwise a mere upgrade of the chart could lead to an unexpected upgrade
# of weaviate. In accordance with Infra-as-code, you should pin this value
# down and only change it if you explicitly want to upgrade the Weaviate
# version.
tag: 0.0.3
repo: defenseunicorns/leapfrogai/weaviate


env:
# maybe set this to the internal URL:
# OPENAI_BASE_URL: "http://api.leapfrogai.svc:8080"
OPENAI_BASE_URL: "https://leapfrogai.dd.bigbang.dev"

modules:
DEFAULT_VECTORIZER_MODULE: text2vec-transformers
text2vec-transformers:
enabled: true
text2vec-openai:
enabled: true
storage:
size: 32Gi # default

domain: dd.bigbang.dev

istio:
enabled: false
gateway: public
enabled: true

monitoring:
enabled: true


models:
stablelm3b:
Expand Down
2 changes: 1 addition & 1 deletion weaviate/manifests/helmrelease.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ spec:
cleanupOnFail: false
values:
env:
OPENAI_BASE: "https://leapfrogai.tom.bigbang.dev"
OPENAI_BASE: "https://leapfrogai.dd.bigbang.dev"
valuesFrom:
- kind: Secret
name: "bigbang-###ZARF_CONST_NAME###-values"
4 changes: 2 additions & 2 deletions weaviate/manifests/vs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ metadata:
namespace: weaviate
spec:
gateways:
- istio-system/public
- istio-system/tenant
hosts:
- weaviate.leapfrogai.bigbang.dev
- weaviate.dd.bigbang.dev
http:
- route:
- destination:
Expand Down
7 changes: 1 addition & 6 deletions weaviate/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,10 @@ image:
tag: 0.0.3
repo: defenseunicorns/leapfrogai/weaviate

env:
# maybe set this to the internal URL:
# OPENAI_BASE_URL: "http://api.leapfrogai.svc:8080"
OPENAI_BASE_URL: "https://leapfrogai.leapfrogai.bigbang.dev"

modules:
DEFAULT_VECTORIZER_MODULE: text2vec-transformers
text2vec-transformers:
enabled: true
enabled: false
text2vec-openai:
enabled: true
storage:
Expand Down
1 change: 1 addition & 0 deletions weaviate/zarf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,4 @@ components:
- "semitechnologies/weaviate:1.18.3"
- "ghcr.io/defenseunicorns/leapfrogai/weaviate:0.0.3"
- semitechnologies/transformers-inference:sentence-transformers-all-MiniLM-L6-v2
- semitechnologies/transformers-inference:distilbert-base-uncased
11 changes: 0 additions & 11 deletions zarf-config.toml

This file was deleted.

10 changes: 10 additions & 0 deletions zarf-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package:
create:
set:
repo: "https://github.com/weaviate/weaviate-helm"
tag: "v16.1.0"
name: "weaviate"
values_file: "../values.yaml"
path: "./weaviate"
namespace: "weaviate"
max_package_size: "1000000000"
16 changes: 10 additions & 6 deletions zarf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,16 @@ components:
- name: leapfrogai
required: true
charts:
- name: leapfrogai
namespace: leapfrogai
localPath: chart
version: 0.0.1
valuesFiles:
- "values.yaml"
- name: leapfrogai
namespace: leapfrogai
localPath: chart
version: 0.0.1
valuesFiles:
- "values.yaml"
# images:
# - "ghcr.io/defenseunicorns/leapfrogai/api:0.0.1"
# # - "ghcr.io/defenseunicorns/leapfrogai/stablelm-3b:0.0.1"
# - "ghcr.io/defenseunicorns/leapfrogai/embeddings:0.0.1"
- name: dcgm-exporter
required: true
charts:
Expand Down