defenseunicorns · runyontr · Jun 8, 2023 · Jun 8, 2023 · Jun 8, 2023 · Jun 8, 2023
@@ -24,6 +24,12 @@ embeddings:
 	cd embeddings && \
 	docker build --network=host -t ghcr.io/defenseunicorns/leapfrogai/embeddings:0.0.1 .
 
+whisper:
+	cd models/whisper && \
+	docker build --network=host -t ghcr.io/defenseunicorns/leapfrogai/whisper:0.0.1 .
+
+whisper-push:
+	docker push ghcr.io/defenseunicorns/leapfrogai/whisper:0.0.1
 
 # This thing is massive, so directly pushing to the zarf registry is quicker/easier
 zarf-push-api:

@@ -6,7 +6,6 @@
 3. [Features](#features)
 4. [Getting Started](#getting-started)
 5. [Usage](#usage)
-6. [Contributing](#contributing)
 7. [License](LICENSE)
 
 ## Project Goal <a name="project-goal"></a>

@@ -2,7 +2,11 @@ domain: bigbang.dev
 
 istio:
   enabled: false
-  gateway: public
+  gateway: tenant
+
+
+monitoring:
+  enabled: false
 
 models:
   stablelm3b:

@@ -12,70 +12,70 @@
 openai.api_key = 'Free the models'
 
 # Point to leapfrogai
-openai.api_base = "https://leapfrogai.leapfrogai.bigbang.dev"
+openai.api_base = "https://leapfrogai.dd.bigbang.dev"
 from langchain.embeddings import OpenAIEmbeddings
 
 embeddings = OpenAIEmbeddings(openai_api_key="foobar",
-                              openai_api_base="https://leapfrogai.leapfrogai.bigbang.dev",
+                              openai_api_base="https://leapfrogai.dd.bigbang.dev",
                               model="text-embedding-ada-002")
 
-print(print(openai.Model.list()))
+print(openai.Model.list())
 
 
 from langchain.vectorstores import Weaviate
 import weaviate
 
-client = weaviate.Client(url="https://weaviate.leapfrogai.bigbang.dev",
+client = weaviate.Client(url="https://weaviate.dd.bigbang.dev",
                          additional_headers={
         'X-OpenAI-Api-Key': "foobar"
     })
 # client.schema.get()
 # client.get_meta()
 
-# schema = {
-#     "classes": [
-#         {
-#             "class": "Paragraph",
-#             "description": "A written paragraph",
-#             "vectorizer": "text2vec-transformers",
-#               "moduleConfig": {
-#                 "text2vec-openai": {
-#                   "model": "ada",
-#                   "modelVersion": "002",
-#                   "type": "text"
-#                 }
-#               },
-#             "properties": [
-#                 {
-#                     "dataType": ["text"],
-#                     "description": "The content of the paragraph",
-#                     "moduleConfig": {
-#                         "text2vec-transformers": {
-#                           "skip": False,
-#                           "vectorizePropertyName": False
-#                         }
-#                       },
-#                     "name": "content",
-#                 },
-#                 {
-#                     "dataType": ["text"],
-#                     "description": "The source of the paragraph",
-#                     "moduleConfig": {
-#                         "text2vec-transformers": {
-#                           "skip": False,
-#                           "vectorizePropertyName": False
-#                         }
-#                       },
-#                     "name": "source",
-#                 },
-#             ],
-#         },
-#     ]
-# }
-
-# client.schema.create(schema)
-
-vectordb = Weaviate(client, "Paragraph", "content", embedding=embeddings)
+schema = {
+    "classes": [
+        {
+            "class": "Company",
+            "description": "A written paragraph",
+            "vectorizer": "text2vec-transformers",
+              "moduleConfig": {
+                "text2vec-openai": {
+                  "model": "ada",
+                  "modelVersion": "002",
+                  "type": "text"
+                }
+              },
+            "properties": [
+                {
+                    "dataType": ["text"],
+                    "description": "The content of the paragraph",
+                    "moduleConfig": {
+                        "text2vec-transformers": {
+                          "skip": False,
+                          "vectorizePropertyName": False
+                        }
+                      },
+                    "name": "content",
+                },
+                {
+                    "dataType": ["text"],
+                    "description": "The source of the paragraph",
+                    "moduleConfig": {
+                        "text2vec-transformers": {
+                          "skip": False,
+                          "vectorizePropertyName": False
+                        }
+                      },
+                    "name": "source",
+                },
+            ],
+        },
+    ]
+}
+
+client.schema.create(schema)
+
+vectordb = Weaviate(client, "Company", "content", embedding=embeddings)
 
 from langchain.document_loaders import UnstructuredMarkdownLoader, UnstructuredPDFLoader, UnstructuredHTMLLoader, UnstructuredFileLoader
 from langchain.document_loaders import PyPDFLoader, CSVLoader, Docx2txtLoader, UnstructuredPowerPointLoader
@@ -86,6 +86,20 @@
 
 import os
 
+def clean_string(text):
+    # Split on single spaces: each extra space in a run (and any leading or
+    # trailing space) shows up as an empty string '' in the list.
+    text_list = text.split(' ')
+
+    # Concatenate with no separator (lone spaces vanish); each '' becomes ' '.
+    cleaned_text = ''.join([' ' if x == '' else x for x in text_list ])
+    return cleaned_text.replace("  ", " ")
+def percentage_of_char(input_string, char):
+    # Non-overlapping occurrences of char per 100 characters of input_string.
+    if len(input_string) == 0:
+        return 0.0  # empty chunk: report 0% instead of raising ZeroDivisionError
+    return (input_string.count(char) / len(input_string)) * 100
+
 def load_file(file_path) -> List[Document]:
     _, file_extension = os.path.splitext(file_path)
     data: List[Document]
@@ -111,13 +125,21 @@ def load_file(file_path) -> List[Document]:
         # Perform action for other files or skip
         return UnstructuredFileLoader(file_path).load()
 
-def process_file(file_path, chunk_size=400, chunk_overlap=200):
+def process_file(file_path, chunk_size=1000, chunk_overlap=400):
     # text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     try:
         data = load_file(file_path=file_path)
         texts = text_splitter.split_documents(data)
-
+        for t in texts:
+            if percentage_of_char(t.page_content, ' ') > 25:
+                print("REPLACING: ")
+                print(t.page_content)
+                print("WITH:")
+                clean = clean_string(t.page_content)
+                print(clean)
+                print("---------")
+                t.page_content = clean
         contents = [d.page_content for d in texts]
         metadatas = [d.metadata for d in texts] 
         vectordb.add_texts(

@@ -1,8 +1,40 @@
-domain: bigbang.dev
+# https://github.com/weaviate/weaviate-helm/blob/1e8492e03e337b500d5f6fd9a055e2f1ad146398/weaviate/values.yaml
+
+image:
+  # registry where weaviate image is stored
+  registry: ghcr.io
+  # Tag of weaviate image to deploy
+  # Note: We strongly recommend you overwrite this value in your own values.yaml.
+  # Otherwise a mere upgrade of the chart could lead to an unexpected upgrade
+  # of weaviate. In accordance with Infra-as-code, you should pin this value
+  # down and only change it if you explicitly want to upgrade the Weaviate
+  # version.
+  tag: 0.0.3
+  repo: defenseunicorns/leapfrogai/weaviate
+
+
+env:
+  # maybe set this to the internal URL:
+  # OPENAI_BASE_URL: "http://api.leapfrogai.svc:8080"
+  OPENAI_BASE_URL: "https://leapfrogai.dd.bigbang.dev"
+
+modules:
+  DEFAULT_VECTORIZER_MODULE: text2vec-transformers
+  text2vec-transformers:
+    enabled: true
+  text2vec-openai:
+    enabled: true
+storage:
+  size: 32Gi # default
+
+domain: dd.bigbang.dev
 
 istio:
-  enabled: false
-  gateway: public
+  enabled: true
+
+monitoring:
+  enabled: true
+
 
 models:
   stablelm3b:

@@ -27,7 +27,7 @@ spec:
     cleanupOnFail: false
   values:
     env:
-      OPENAI_BASE: "https://leapfrogai.tom.bigbang.dev"
+      OPENAI_BASE: "https://leapfrogai.dd.bigbang.dev"
   valuesFrom:   
     - kind: Secret
       name: "bigbang-###ZARF_CONST_NAME###-values"
@@ -5,9 +5,9 @@ metadata:
   namespace: weaviate
 spec:
   gateways:
-  - istio-system/public
+  - istio-system/tenant
   hosts:
-  - weaviate.leapfrogai.bigbang.dev
+  - weaviate.dd.bigbang.dev
   http:
   - route:
     - destination:

@@ -12,15 +12,10 @@ image:
   tag: 0.0.3
   repo: defenseunicorns/leapfrogai/weaviate
 
-env:
-  # maybe set this to the internal URL:
-  # OPENAI_BASE_URL: "http://api.leapfrogai.svc:8080"
-  OPENAI_BASE_URL: "https://leapfrogai.leapfrogai.bigbang.dev"
-
 modules:
   DEFAULT_VECTORIZER_MODULE: text2vec-transformers
   text2vec-transformers:
-    enabled: true
+    enabled: false
   text2vec-openai:
     enabled: true
 storage:

@@ -54,3 +54,4 @@ components:
     - "semitechnologies/weaviate:1.18.3"
     - "ghcr.io/defenseunicorns/leapfrogai/weaviate:0.0.3"
     - semitechnologies/transformers-inference:sentence-transformers-all-MiniLM-L6-v2
+    - semitechnologies/transformers-inference:distilbert-base-uncased
@@ -0,0 +1,10 @@
+package:
+  create:
+    set:
+      repo: "https://github.com/weaviate/weaviate-helm"
+      tag: "v16.1.0"
+      name: "weaviate"
+      values_file: "../values.yaml"
+      path: "./weaviate"
+      namespace: "weaviate"
+    max_package_size: "1000000000"
@@ -21,12 +21,16 @@ components:
   - name: leapfrogai
     required: true
     charts:
-      - name: leapfrogai
-        namespace: leapfrogai
-        localPath: chart
-        version: 0.0.1
-        valuesFiles:
-          - "values.yaml"
+      - name: leapfrogai
+        namespace: leapfrogai
+        localPath: chart
+        version: 0.0.1
+        valuesFiles:
+          - "values.yaml"
+    # images:
+    #   - "ghcr.io/defenseunicorns/leapfrogai/api:0.0.1"
+    #   # - "ghcr.io/defenseunicorns/leapfrogai/stablelm-3b:0.0.1"
+    #   - "ghcr.io/defenseunicorns/leapfrogai/embeddings:0.0.1"
   - name: dcgm-exporter
     required: true
     charts: