diff --git a/.github/workflows/test-code-syntactic_concept_extractor.yml b/.github/workflows/test-code-syntactic_concept_extractor.yml
new file mode 100644
index 000000000..7f95b90a8
--- /dev/null
+++ b/.github/workflows/test-code-syntactic_concept_extractor.yml
@@ -0,0 +1,124 @@
+#
+# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files
+#
+name: Test - transforms/code/syntactic_concept_extractor
+
+on:
+    workflow_dispatch:
+    push:
+        branches:
+            - "dev"
+            - "releases/**"
+        tags:
+            - "*"
+        paths:
+            - "transforms/code/syntactic_concept_extractor/**"
+            - "data-processing-lib/**"
+            - "!transforms/code/syntactic_concept_extractor/**/kfp_ray/**" # This is/will be tested in separate workflow
+            - "!data-processing-lib/**/test/**"
+            - "!data-processing-lib/**/test-data/**"
+            - "!**.md"
+            - "!**/doc/**"
+            - "!**/images/**"
+            - "!**.gitignore"
+    pull_request:
+        branches:
+            - "dev"
+            - "releases/**"
+        paths:
+            - "transforms/code/syntactic_concept_extractor/**"
+            - "data-processing-lib/**"
+            - "!transforms/code/syntactic_concept_extractor/**/kfp_ray/**" # This is/will be tested in separate workflow
+            - "!data-processing-lib/**/test/**"
+            - "!data-processing-lib/**/test-data/**"
+            - "!**.md"
+            - "!**/doc/**"
+            - "!**/images/**"
+            - "!**.gitignore"
+
+jobs:
+    check_if_push_image:
+        # check whether the Docker images should be pushed to the remote repository
+        # The images are pushed if it is a merge to dev branch or a new tag is created.
+        # The latter being part of the release process.
+        # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file.
+        runs-on: ubuntu-22.04
+        outputs:
+            publish_images: ${{ steps.version.outputs.publish_images }}
+        steps:
+            - id: version
+              run: |
+                publish_images='false'
+                if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
+                then
+                    publish_images='true'
+                fi
+                if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
+                then
+                    publish_images='true'
+                fi
+                echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT"
+    test-src:
+        runs-on: ubuntu-22.04
+        steps:
+            - name: Checkout
+              uses: actions/checkout@v4
+            - name: Free up space in github runner
+              # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
+              run: |
+                df -h
+                sudo rm -rf "/usr/local/share/boost"
+                sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+                sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup
+                sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
+                df -h
+            - name: Test transform source in transforms/code/syntactic_concept_extractor
+              run: |
+                if [ -e "transforms/code/syntactic_concept_extractor/Makefile" ]; then
+                    make -C transforms/code/syntactic_concept_extractor DOCKER=docker test-src
+                else
+                    echo "transforms/code/syntactic_concept_extractor/Makefile not found - source testing disabled for this transform."
+                fi
+    test-image:
+        needs: [check_if_push_image]
+        runs-on: ubuntu-22.04
+        timeout-minutes: 120
+        env:
+            DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }}
+            DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }}
+        steps:
+            - name: Checkout
+              uses: actions/checkout@v4
+            - name: Free up space in github runner
+              # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
+              run: |
+                df -h
+                sudo rm -rf /opt/ghc
+                sudo rm -rf "/usr/local/share/boost"
+                sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+                sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
+                sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
+                df -h
+            - name: Test transform image in transforms/code/syntactic_concept_extractor
+              run: |
+                if [ -e "transforms/code/syntactic_concept_extractor/Makefile" ]; then
+                    if [ -d "transforms/code/syntactic_concept_extractor/spark" ]; then
+                        make -C data-processing-lib/spark DOCKER=docker image
+                    fi
+                    make -C transforms/code/syntactic_concept_extractor DOCKER=docker test-image
+                else
+                    echo "transforms/code/syntactic_concept_extractor/Makefile not found - testing disabled for this transform."
+                fi
+            - name: Print space
+              # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
+              run: |
+                df -h
+                docker images
+            - name: Publish images
+              if: needs.check_if_push_image.outputs.publish_images == 'true'
+              run: |
+                if [ -e "transforms/code/syntactic_concept_extractor/Makefile" ]; then
+                    make -C transforms/code/syntactic_concept_extractor publish
+                else
+                    echo "transforms/code/syntactic_concept_extractor/Makefile not found - publishing disabled for this transform."
+                fi
diff --git a/.make.versions b/.make.versions
index 6c9bbc08d..73eec8542 100644
--- a/.make.versions
+++ b/.make.versions
@@ -109,6 +109,10 @@ HTML2PARQUET_PYTHON_VERSION=$(DPK_VERSION)
 
 DPK_TRANSFORMS_VERSION=$(DPK_VERSION)
 
+SYNTACTIC_CONCEPT_EXTRACTOR_PYTHON_VERSION=$(DPK_VERSION)
+SYNTACTIC_CONCEPT_EXTRACTOR_RAY_VERSION=$(DPK_VERSION)
+
+
 ################## ################## ################## ################## ################## ##################
 # Begin versions that the repo depends on.
diff --git a/transforms/code/Makefile b/transforms/code/Makefile
index 17afe2785..b5d5c7bbe 100644
--- a/transforms/code/Makefile
+++ b/transforms/code/Makefile
@@ -27,26 +27,10 @@ image::
 	@# Help: Recursively make $@ in all subdirs
 	@$(MAKE) RULE=$@ .recurse
 
-test-image::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
 publish::
 	@# Help: Recursively make $@ in all subdirs
 	@$(MAKE) RULE=$@ .recurse
 
-kind-load-image::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-docker-load-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-docker-save-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
 set-versions:
 	@# Help: Recursively $@ in all subdirs
 	@$(MAKE) RULE=$@ .recurse
diff --git a/transforms/code/syntactic_concept_extractor/Makefile b/transforms/code/syntactic_concept_extractor/Makefile
index 351da91d5..3cc939591 100644
--- a/transforms/code/syntactic_concept_extractor/Makefile
+++ b/transforms/code/syntactic_concept_extractor/Makefile
@@ -21,6 +21,9 @@ publish::
 	@# Help: Recursively make $@ in all subdirs
 	@$(MAKE) RULE=$@ .recurse
 
+test-image:
+	@echo "Skipping test-image step as per configuration."
+
 test::
 	@# Help: Recursively make $@ in all subdirs
 	@$(MAKE) RULE=$@ .recurse
diff --git a/transforms/code/syntactic_concept_extractor/README.md b/transforms/code/syntactic_concept_extractor/README.md
index 36b1e57b8..eb1b181ea 100644
--- a/transforms/code/syntactic_concept_extractor/README.md
+++ b/transforms/code/syntactic_concept_extractor/README.md
@@ -54,3 +54,10 @@ The implementation for UI-based offline customization tool is present [here](pyt
 
 `streamlit run LLM_runner_app.py`
 
+The high-level system design is as follows:
+
+![System overview](sys-overview.png)
+
+For each new target language, the offline phase is used to create deterministic rules by harnessing the capabilities of LLMs and working with exemplar code samples from the target language. In this process, Workflow W1 facilitates the creation of rules around syntactic structures based on exemplar code samples, while Workflow W2 is used to establish semantic dimensions for profiling. Subsequently, we derive rules that connect syntactic constructs to the predefined semantic concepts. These rules are then stored in a rule database, ready to be employed during the online phase.
+
+In the online phase, the system dynamically generates profiling outputs for any incoming code snippets. This is achieved by extracting concepts from the snippets using the rules in the database and storing these extractions in a tabular format. The structured tabular format allows for generating additional concept columns, which are then used to create comprehensive profiling reports.
\ No newline at end of file
diff --git a/transforms/code/syntactic_concept_extractor/input/multi-package.parquet b/transforms/code/syntactic_concept_extractor/input/multi-package.parquet
index 8ef9f7cc2..fc96e51fa 100644
Binary files a/transforms/code/syntactic_concept_extractor/input/multi-package.parquet and b/transforms/code/syntactic_concept_extractor/input/multi-package.parquet differ
diff --git a/transforms/code/syntactic_concept_extractor/notebook_example/code-profiler.ipynb b/transforms/code/syntactic_concept_extractor/notebook_example/code-profiler.ipynb
new file mode 100644
index 000000000..f98f50b55
--- /dev/null
+++ b/transforms/code/syntactic_concept_extractor/notebook_example/code-profiler.ipynb
@@ -0,0 +1,1224 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "id": "08ad3138-a87c-4fb3-80fd-8a64cdc27eaf",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/f7/736gtk714sg36xlcqnjx16xw0000gn/T/ipykernel_45869/3873726698.py:4: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n",
+      " from IPython.core.display import display, HTML\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "from IPython.display import display, HTML # Updated import\n",
+    "import subprocess\n",
+    "from IPython.core.display import display, HTML\n",
+    "import shutil\n",
+    "\n",
+    "# Following are the utility functions for running the transforms sequentially\n",
+    "\n",
+    "def run_make_command(target_dir, command):\n",
+    "    \"\"\"Function to change directory and run a make command with real-time log output.\"\"\"\n",
+    "    if os.path.exists(target_dir):\n",
+    "        # Change the current working directory to the target directory\n",
+    "        os.chdir(target_dir)\n",
+    "        print(f\"Changed directory to: {os.getcwd()}\")\n",
+    "\n",
+    "        # Run the make command and stream logs\n",
+    "        try:\n",
+    "            process = 
subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)\n", + " for stdout_line in iter(process.stdout.readline, \"\"):\n", + " print(stdout_line, end=\"\") # Print stdout line-by-line in real-time\n", + " process.stdout.close()\n", + " process.wait()\n", + "\n", + " # Check for errors and handle stderr\n", + " if process.returncode != 0:\n", + " print(\"Error Output:\")\n", + " for stderr_line in iter(process.stderr.readline, \"\"):\n", + " print(stderr_line, end=\"\")\n", + " process.stderr.close()\n", + " else:\n", + " print(\"Process completed successfully.\")\n", + " except subprocess.CalledProcessError as e:\n", + " print(f\"Error occurred while running the make command: {e}\")\n", + " else:\n", + " print(f\"Directory does not exist: {target_dir}\")\n", + "\n", + "def check_directories_exist(directories):\n", + " results = {}\n", + " for directory in directories:\n", + " if os.path.isdir(directory):\n", + " print(f\"The directory '{directory}' exists.\")\n", + " results[directory] = True\n", + " else:\n", + " print(f\"The directory '{directory}' does not exist.\")\n", + " results[directory] = False\n", + " return results\n", + "\n", + "def display_html_file(hosp_code_dir):\n", + " # Construct the path to the HTML file\n", + " html_file_path = os.path.join(hosp_code_dir, 'src', 'output.html')\n", + " \n", + " # Check if the file exists\n", + " if not os.path.exists(html_file_path):\n", + " raise FileNotFoundError(f\"The file '{html_file_path}' does not exist.\")\n", + " \n", + " # Read the HTML file\n", + " with open(html_file_path, 'r', encoding='utf-8') as file:\n", + " html_content = file.read()\n", + " \n", + " # Display the HTML content in the notebook\n", + " display(HTML(html_content))\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "665c28d4-2485-4cf4-aca7-6d0f6ac2353c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The directory '/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/input' exists.\n", + "The directory '/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python' exists.\n", + "The directory '/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/output' exists.\n", + "The directory '/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/semantic_profiler/python/test-data/input' exists.\n", + "The directory '/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/semantic_profiler/python' exists.\n", + "The directory '/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/semantic_profiler/python/output' exists.\n", + "The directory '/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/higher_order_syntactic_profiler/python/test-data/input' exists.\n", + "The directory '/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/higher_order_syntactic_profiler/python' exists.\n", + "The directory '/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/higher_order_syntactic_profiler/python/output' 
exists.\n", + "Checking directory exists or not\n", + "/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/input: Exists\n", + "/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python: Exists\n", + "/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/output: Exists\n", + "/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/semantic_profiler/python/test-data/input: Exists\n", + "/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/semantic_profiler/python: Exists\n", + "/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/semantic_profiler/python/output: Exists\n", + "/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/higher_order_syntactic_profiler/python/test-data/input: Exists\n", + "/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/higher_order_syntactic_profiler/python: Exists\n", + "/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/higher_order_syntactic_profiler/python/output: Exists\n" + ] + } + ], + "source": [ + "# Specify the target directory where your transforms Makefile and output are located\n", + "user_local_dir = '/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk'\n", + "\n", + "transforms_dir = user_local_dir + '/data-prep-kit/transforms'\n", + "\n", + "sce_input_dir = transforms_dir + '/code/syntactic_concept_extractor/input'\n", + "sce_code_dir = transforms_dir + '/code/syntactic_concept_extractor/python'\n", + "sce_output_dir = transforms_dir + '/code/syntactic_concept_extractor/output'\n", + "\n", + "sp_input_dir = transforms_dir + '/code/semantic_profiler/python/test-data/input'\n", + "sp_code_dir = transforms_dir + '/code/semantic_profiler/python'\n", + "sp_output_dir = transforms_dir + '/code/semantic_profiler/python/output'\n", + "\n", + "hosp_input_dir = transforms_dir + '/code/higher_order_syntactic_profiler/python/test-data/input'\n", + "hosp_code_dir = transforms_dir + '/code/higher_order_syntactic_profiler/python'\n", + "hosp_output_dir = transforms_dir + '/code/higher_order_syntactic_profiler/python/output'\n", + "\n", + "directories_to_check = [sce_input_dir, sce_code_dir, sce_output_dir, sp_input_dir, sp_code_dir, sp_output_dir, hosp_input_dir, hosp_code_dir, hosp_output_dir]\n", + "check_results = check_directories_exist(directories_to_check)\n", + "\n", + "print(\"Checking directory exists or not\")\n", + "for directory, exists in check_results.items():\n", + " print(f\"{directory}: {'Exists' if exists else 'Does not exist'}\")\n", + " \n", + " if not exists:\n", + " if 'output' in directory:\n", + " os.makedirs(directory, exist_ok=True)\n", + " print(f\"Directory '{directory}' did not exist, so it was created.\")\n", + " else:\n", + " raise FileNotFoundError(f\"The directory '{directory}' does not exist.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "65ad6896-8ecc-492f-836d-81ae146c0b22", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Changed 
directory to: /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python\n", + "python -m venv venv\n", + "Requirement already satisfied: pip in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (24.2)\n", + "Requirement already satisfied: wheel in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (0.44.0)\n", + "Requirement already satisfied: pytest in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (8.3.3)\n", + "Requirement already satisfied: pytest-cov in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (5.0.0)\n", + "Requirement already satisfied: iniconfig in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from pytest) (2.0.0)\n", + "Requirement already satisfied: packaging in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from pytest) (24.0)\n", + "Requirement already satisfied: pluggy<2,>=1.5 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from pytest) (1.5.0)\n", + "Requirement already satisfied: coverage>=5.2.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from coverage[toml]>=5.2.1->pytest-cov) (7.6.1)\n", + "Installing Python data processing library source to existing venv\n", + "pip uninstall -y data-prep-toolkit \n", + "Found existing installation: data_prep_toolkit 0.2.1.dev0\n", + "Uninstalling data_prep_toolkit-0.2.1.dev0:\n", + " Successfully uninstalled data_prep_toolkit-0.2.1.dev0\n", + "Begin installing source from ../../../../data-processing-lib/python into venv\n", + "Obtaining file:///Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/data-processing-lib/python\n", + " Installing build dependencies: started\n", + " Installing build dependencies: finished with status 'done'\n", + " Checking if build backend supports build_editable: started\n", + " Checking if build backend supports build_editable: finished with status 'done'\n", + " Getting requirements to build editable: started\n", + " Getting requirements to build editable: finished with status 'done'\n", + " Preparing editable metadata (pyproject.toml): started\n", + " Preparing editable metadata (pyproject.toml): finished with status 'done'\n", + "Requirement already satisfied: numpy<1.29.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from data_prep_toolkit==0.2.2.dev0) (1.26.4)\n", + "Requirement already satisfied: pyarrow==16.1.0 in 
/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from data_prep_toolkit==0.2.2.dev0) (16.1.0)\n", + "Requirement already satisfied: boto3==1.34.69 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from data_prep_toolkit==0.2.2.dev0) (1.34.69)\n", + "Collecting argparse (from data_prep_toolkit==0.2.2.dev0)\n", + " Using cached argparse-1.4.0-py2.py3-none-any.whl.metadata (2.8 kB)\n", + "Requirement already satisfied: mmh3 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from data_prep_toolkit==0.2.2.dev0) (5.0.1)\n", + "Requirement already satisfied: botocore<1.35.0,>=1.34.69 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (1.34.162)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (1.0.1)\n", + "Requirement already satisfied: s3transfer<0.11.0,>=0.10.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (0.10.2)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from botocore<1.35.0,>=1.34.69->boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (2.9.0.post0)\n", + "Requirement already satisfied: urllib3!=2.2.0,<3,>=1.25.4 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from botocore<1.35.0,>=1.34.69->boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (2.2.2)\n", + "Requirement already satisfied: six>=1.5 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.35.0,>=1.34.69->boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (1.16.0)\n", + "Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)\n", + "Building wheels for collected packages: data_prep_toolkit\n", + " Building editable for data_prep_toolkit (pyproject.toml): started\n", + " Building editable for data_prep_toolkit (pyproject.toml): finished with status 'done'\n", + " Created wheel for data_prep_toolkit: filename=data_prep_toolkit-0.2.2.dev0-0.editable-py3-none-any.whl size=2269 sha256=5abf7df19a437f63a327ddb160b27c2383d0094a831298eff28e89ad4edcad45\n", + " Stored in directory: /private/var/folders/f7/736gtk714sg36xlcqnjx16xw0000gn/T/pip-ephem-wheel-cache-5fkx2tua/wheels/eb/18/bc/2b7f918df1f95453f58d1ccde4ae0ea1bfd48079af6ea338f8\n", + "Successfully built data_prep_toolkit\n", + "Installing collected packages: argparse, data_prep_toolkit\n", + 
"Successfully installed argparse-1.4.0 data_prep_toolkit-0.2.2.dev0\n", + "Done installing source from ../../../../data-processing-lib/python into venv\n", + "Installed source from Python processing library for /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/bin/python\n", + "Installing from pyproject.toml\n", + "Obtaining file:///Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python\n", + " Installing build dependencies: started\n", + " Installing build dependencies: finished with status 'done'\n", + " Checking if build backend supports build_editable: started\n", + " Checking if build backend supports build_editable: finished with status 'done'\n", + " Getting requirements to build editable: started\n", + " Getting requirements to build editable: finished with status 'done'\n", + " Preparing editable metadata (pyproject.toml): started\n", + " Preparing editable metadata (pyproject.toml): finished with status 'done'\n", + "Collecting data-prep-toolkit==0.2.1.dev0 (from dpk_syntactic_concept_extractor_transform_python==1.0.0)\n", + " Using cached data_prep_toolkit-0.2.1.dev0-py3-none-any.whl.metadata (1.9 kB)\n", + "Requirement already satisfied: parameterized in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.9.0)\n", + "Requirement already satisfied: pandas in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (2.2.2)\n", + "Requirement already satisfied: aiolimiter==1.1.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.1.0)\n", + "Requirement already satisfied: altair==5.3.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (5.3.0)\n", + "Requirement already satisfied: annotated-types==0.7.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.7.0)\n", + "Requirement already satisfied: anyio==4.4.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (4.4.0)\n", + "Requirement already satisfied: appnope==0.1.4 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.1.4)\n", + "Requirement already satisfied: asttokens==2.4.1 in 
/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (2.4.1)\n", + "Requirement already satisfied: attrs==23.2.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (23.2.0)\n", + "Requirement already satisfied: blinker==1.8.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.8.2)\n", + "Requirement already satisfied: cachetools==5.3.3 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (5.3.3)\n", + "Requirement already satisfied: certifi==2024.6.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (2024.6.2)\n", + "Requirement already satisfied: charset-normalizer==3.3.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (3.3.2)\n", + "Requirement already satisfied: click==8.1.7 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (8.1.7)\n", + "Requirement already satisfied: comm==0.2.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.2.2)\n", + "Requirement already satisfied: contourpy==1.2.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.2.1)\n", + "Requirement already satisfied: cycler==0.12.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.12.1)\n", + "Requirement already satisfied: debugpy==1.8.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.8.1)\n", + "Requirement already satisfied: decorator==5.1.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (5.1.1)\n", + "Requirement already satisfied: 
Deprecated==1.2.14 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.2.14)\n", + "Requirement already satisfied: executing==2.0.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (2.0.1)\n", + "Requirement already satisfied: fonttools==4.53.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (4.53.0)\n", + "Requirement already satisfied: gitdb==4.0.11 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (4.0.11)\n", + "Requirement already satisfied: GitPython==3.1.43 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (3.1.43)\n", + "Requirement already satisfied: h11==0.14.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.14.0)\n", + "Requirement already satisfied: htbuilder==0.6.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.6.2)\n", + "Requirement already satisfied: httpcore==1.0.5 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.0.5)\n", + "Requirement already satisfied: httpx==0.27.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.27.0)\n", + "Requirement already satisfied: httpx-sse==0.4.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.4.0)\n", + "Requirement already satisfied: ibm-generative-ai==3.0.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (3.0.0)\n", + "Requirement already satisfied: idna==3.7 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (3.7)\n", + "Requirement already 
satisfied: ipykernel==6.29.4 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (6.29.4)\n", + "Requirement already satisfied: ipython==8.25.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (8.25.0)\n", + "Requirement already satisfied: jedi==0.19.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.19.1)\n", + "Requirement already satisfied: Jinja2==3.1.4 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (3.1.4)\n", + "Requirement already satisfied: jsonschema==4.22.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (4.22.0)\n", + "Requirement already satisfied: jsonschema-specifications==2023.12.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (2023.12.1)\n", + "Requirement already satisfied: jupyter-client==8.6.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (8.6.2)\n", + "Requirement already satisfied: jupyter-core==5.7.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (5.7.2)\n", + "Requirement already satisfied: kiwisolver==1.4.5 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.4.5)\n", + "Requirement already satisfied: markdown-it-py==3.0.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (3.0.0)\n", + "Requirement already satisfied: MarkupSafe==2.1.5 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (2.1.5)\n", + "Requirement already satisfied: matplotlib==3.9.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from 
dpk_syntactic_concept_extractor_transform_python==1.0.0) (3.9.0)\n", + "Requirement already satisfied: matplotlib-inline==0.1.7 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.1.7)\n", + "Requirement already satisfied: mdurl==0.1.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.1.2)\n", + "Requirement already satisfied: more-itertools==10.3.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (10.3.0)\n", + "Requirement already satisfied: nest-asyncio==1.6.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.6.0)\n", + "Requirement already satisfied: networkx==3.3 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (3.3)\n", + "Requirement already satisfied: numpy==1.26.4 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.26.4)\n", + "Requirement already satisfied: packaging==24.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (24.0)\n", + "Requirement already satisfied: parso==0.8.4 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.8.4)\n", + "Requirement already satisfied: pexpect==4.9.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (4.9.0)\n", + "Requirement already satisfied: pillow==10.3.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (10.3.0)\n", + "Requirement already satisfied: platformdirs==4.2.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (4.2.2)\n", + "Requirement already satisfied: prompt-toolkit==3.0.45 in 
/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (3.0.45)\n", + "Requirement already satisfied: protobuf==5.27.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (5.27.2)\n", + "Requirement already satisfied: psutil==5.9.8 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (5.9.8)\n", + "Requirement already satisfied: ptyprocess==0.7.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.7.0)\n", + "Requirement already satisfied: pure-eval==0.2.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.2.2)\n", + "Requirement already satisfied: pyarrow==16.1.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (16.1.0)\n", + "Requirement already satisfied: pydantic==2.7.4 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (2.7.4)\n", + "Requirement already satisfied: pydantic-core==2.18.4 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (2.18.4)\n", + "Requirement already satisfied: pydeck==0.9.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.9.1)\n", + "Requirement already satisfied: Pygments==2.18.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (2.18.0)\n", + "Requirement already satisfied: pyparsing==3.1.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (3.1.2)\n", + "Requirement already satisfied: python-dateutil==2.9.0.post0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (2.9.0.post0)\n", + "Requirement already 
satisfied: pytz==2024.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (2024.1)\n", + "Requirement already satisfied: pyzmq==26.0.3 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (26.0.3)\n", + "Requirement already satisfied: referencing==0.35.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.35.1)\n", + "Requirement already satisfied: regex==2024.5.15 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (2024.5.15)\n", + "Requirement already satisfied: requests==2.32.3 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (2.32.3)\n", + "Requirement already satisfied: rich==13.7.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (13.7.1)\n", + "Requirement already satisfied: rpds-py==0.18.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.18.1)\n", + "Requirement already satisfied: seaborn==0.13.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.13.2)\n", + "Requirement already satisfied: six==1.16.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.16.0)\n", + "Requirement already satisfied: smmap==5.0.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (5.0.1)\n", + "Requirement already satisfied: sniffio==1.3.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.3.1)\n", + "Requirement already satisfied: st-annotated-text==4.0.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (4.0.1)\n", + "Requirement 
already satisfied: stack-data==0.6.3 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.6.3)\n", + "Requirement already satisfied: streamlit==1.36.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.36.0)\n", + "Requirement already satisfied: tenacity==8.4.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (8.4.2)\n", + "Requirement already satisfied: toml==0.10.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.10.2)\n", + "Requirement already satisfied: toolz==0.12.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.12.1)\n", + "Requirement already satisfied: tornado==6.4 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (6.4)\n", + "Requirement already satisfied: traitlets==5.14.3 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (5.14.3)\n", + "Requirement already satisfied: tree-sitter==0.21.3 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.21.3)\n", + "Requirement already satisfied: tree-sitter-cpp==0.22.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.22.1)\n", + "Requirement already satisfied: tree-sitter-java==0.21.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.21.0)\n", + "Requirement already satisfied: tree-sitter-languages==1.10.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.10.2)\n", + "Requirement already satisfied: tree-sitter-php==0.22.5 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from 
dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.22.5)\n", + "Requirement already satisfied: typing-extensions==4.12.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (4.12.2)\n", + "Requirement already satisfied: tzdata==2024.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (2024.1)\n", + "Requirement already satisfied: urllib3==2.2.2 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (2.2.2)\n", + "Requirement already satisfied: wcwidth==0.2.13 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.2.13)\n", + "Requirement already satisfied: wrapt==1.16.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.16.0)\n", + "Requirement already satisfied: boto3==1.34.69 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from data-prep-toolkit==0.2.1.dev0->dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.34.69)\n", + "Collecting argparse (from data-prep-toolkit==0.2.1.dev0->dpk_syntactic_concept_extractor_transform_python==1.0.0)\n", + " Using cached argparse-1.4.0-py2.py3-none-any.whl.metadata (2.8 kB)\n", + "Requirement already satisfied: mmh3 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from data-prep-toolkit==0.2.1.dev0->dpk_syntactic_concept_extractor_transform_python==1.0.0) (5.0.1)\n", + "Requirement already satisfied: botocore<1.35.0,>=1.34.69 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from boto3==1.34.69->data-prep-toolkit==0.2.1.dev0->dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.34.162)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from boto3==1.34.69->data-prep-toolkit==0.2.1.dev0->dpk_syntactic_concept_extractor_transform_python==1.0.0) (1.0.1)\n", + "Requirement already satisfied: s3transfer<0.11.0,>=0.10.0 in /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/venv/lib/python3.11/site-packages (from boto3==1.34.69->data-prep-toolkit==0.2.1.dev0->dpk_syntactic_concept_extractor_transform_python==1.0.0) (0.10.2)\n", + "Using cached data_prep_toolkit-0.2.1.dev0-py3-none-any.whl (63 kB)\n", + "Using cached 
argparse-1.4.0-py2.py3-none-any.whl (23 kB)\n", + "Building wheels for collected packages: dpk_syntactic_concept_extractor_transform_python\n", + " Building editable for dpk_syntactic_concept_extractor_transform_python (pyproject.toml): started\n", + " Building editable for dpk_syntactic_concept_extractor_transform_python (pyproject.toml): finished with status 'done'\n", + " Created wheel for dpk_syntactic_concept_extractor_transform_python: filename=dpk_syntactic_concept_extractor_transform_python-1.0.0-0.editable-py3-none-any.whl size=3593 sha256=510f2fb9ae185f2fcfa6010c7e0be48b529dcbc230d91aa22a089c3e6970c686\n", + " Stored in directory: /private/var/folders/f7/736gtk714sg36xlcqnjx16xw0000gn/T/pip-ephem-wheel-cache-1u2gu5sh/wheels/05/e5/80/5ba70e3ece1b81ec352ba240e5a1e67a07c1d70e012b20ab06\n", + "Successfully built dpk_syntactic_concept_extractor_transform_python\n", + "Installing collected packages: argparse, data-prep-toolkit, dpk_syntactic_concept_extractor_transform_python\n", + " Attempting uninstall: data-prep-toolkit\n", + " Found existing installation: data_prep_toolkit 0.2.2.dev0\n", + " Uninstalling data_prep_toolkit-0.2.2.dev0:\n", + " Successfully uninstalled data_prep_toolkit-0.2.2.dev0\n", + " Attempting uninstall: dpk_syntactic_concept_extractor_transform_python\n", + " Found existing installation: dpk_syntactic_concept_extractor_transform_python 1.0.0\n", + " Uninstalling dpk_syntactic_concept_extractor_transform_python-1.0.0:\n", + " Successfully uninstalled dpk_syntactic_concept_extractor_transform_python-1.0.0\n", + "Successfully installed argparse-1.4.0 data-prep-toolkit-0.2.1.dev0 dpk_syntactic_concept_extractor_transform_python-1.0.0\n", + "Process completed successfully.\n", + "Changed directory to: /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python\n", + "/Library/Developer/CommandLineTools/usr/bin/make RUN_FILE=syntactic_concept_extractor_local.py .transforms.run-src-file\n", + "source venv/bin/activate;\t\\\n", + "\tcd src;\t\t\t\t\\\n", + "\tpython syntactic_concept_extractor_local.py \"\"\n", + "Loaded dictionary: {'input': 'multi-package.parquet', 'contents': 'Contents', 'language': 'Language'}\n", + "Syntactic constructs extraction started\n", + "Checking directory: /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python/src\n", + "Checking directory: /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/python\n", + "Checking directory: /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor\n", + "Checking directory: /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code\n", + "Checking directory: /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms\n", + "Checking directory: /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit\n", + "Project root found: /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit\n", + "Bindings path: /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/input/tree-sitter-bindings/mach-arm64\n", + "input table: pyarrow.Table\n", + 
"Language: string\n", + "Category: string\n", + "Library: string\n", + "Contents: string\n", + "Weight: double\n", + "----\n", + "Language: [[\"C\",\"C\",\"C\",\"C\",\"C\",...,\"Go\",\"Go\",\"Go\",\"Go\",\"Go\"]]\n", + "Category: [[\"Algorithms and Data Structures\",\"Algorithms and Data Structures\",\"Algorithms and Data Structures\",\"Algorithms and Data Structures\",\"Algorithms and Data Structures\",...,\"Testing\",\"Web Development\",\"Web Development\",\"Web Development\",\"Web Development\"]]\n", + "Library: [[\"dotnet-mgcb-compute, Akade.IndexedSet, Akka.DistributedData\",\"dotnet-mgcb-compute\",\"Akade.IndexedSet\",\"dotnet-mgcb-compute, QuantConnect.Algorithm.CSharp\",\"QuantConnect.Algorithm.CSharp\",...,\"github.com/stretchr/testify, gopkg.in/check.v1\",\"github.com/gorilla/sessions, github.com/labstack/echo/v4\",\"github.com/labstack/echo/v4, github.com/gorilla/sessions, github.com/gin-gonic/gin\",\"github.com/labstack/echo/v4, github.com/gin-gonic/gin\",\"github.com/gin-gonic/gin, github.com/labstack/echo/v4, github.com/gorilla/sessions\"]]\n", + "Contents: [[\"#include \"dotnet-mgcb-compute.h\"\n", + "#include \"Akade.IndexedSet.h\"\n", + "#include \"Akka.DistributedData.h\"\n", + "// Function with parameters\n", + "void baz1(int x) {\n", + " /* TODO: Implement this */\n", + "}\n", + "int main() {\n", + " baz1(10);\n", + " return 0;\n", + "}\n", + "\",\"#include \"dotnet-mgcb-compute.h\"\n", + "// Function example\n", + "void foo2() {\n", + " /* TODO: Add your code here */\n", + "}\n", + "int main() {\n", + " foo2();\n", + " return 0;\n", + "}\n", + "\",\"#include \"Akade.IndexedSet.h\"\n", + "// Function with return\n", + "int quux3() {\n", + " return 42;\n", + "}\n", + "int main() {\n", + " int result = quux3();\n", + " return 0;\n", + "}\n", + "\",\"#include \"dotnet-mgcb-compute.h\"\n", + "#include \"QuantConnect.Algorithm.CSharp.h\"\n", + "// Function with parameters\n", + "void baz4(int x) {\n", + " /* TODO: Implement this */\n", + "}\n", + "int main() {\n", + " baz4(10);\n", + " return 0;\n", + "}\n", + "\",\"#include \"QuantConnect.Algorithm.CSharp.h\"\n", + "// Function with return\n", + "int quux5() {\n", + " return 42;\n", + "}\n", + "int main() {\n", + " int result = quux5();\n", + " return 0;\n", + "}\n", + "\",...,\"import \"github.com/stretchr/testify\"\n", + "import \"gopkg.in/check.v1\"\n", + "// Another example function\n", + "func bar396() int {\n", + " return 0;\n", + "}\n", + "\",\"import \"github.com/gorilla/sessions\"\n", + "import \"github.com/labstack/echo/v4\"\n", + "// Function with parameters\n", + "func baz397(x int) {\n", + " /* TODO: Implement this */\n", + "}\n", + "\",\"import \"github.com/labstack/echo/v4\"\n", + "import \"github.com/gorilla/sessions\"\n", + "import \"github.com/gin-gonic/gin\"\n", + "// Function example\n", + "func foo398() {\n", + " /* TODO: Add your code here */\n", + "}\n", + "\",\"import \"github.com/labstack/echo/v4\"\n", + "import \"github.com/gin-gonic/gin\"\n", + "// Function with parameters\n", + "func baz399(x int) {\n", + " /* TODO: Implement this */\n", + "}\n", + "\",\"import \"github.com/gin-gonic/gin\"\n", + "import \"github.com/labstack/echo/v4\"\n", + "import \"github.com/gorilla/sessions\"\n", + "// Function with return\n", + "func quux400() int {\n", + " return 42;\n", + "}\n", + "\"]]\n", + "Weight: 
[[0.0015624999999999999,0.0005208333333333334,0.0005208333333333334,0.0010416666666666669,0.0005208333333333334,...,0.0009259259259259261,0.0008333333333333334,0.0012500000000000002,0.0008333333333333334,0.0012500000000000002]]\n", + "other_val: 0\n", + "tranforming the the input dataframe\n", + "uni-algo\n", + "tries\n", + "uni-algo\n", + "tries\n", + "dynamic_bitset\n", + "algorithm\n", + "tries\n", + "clangd\n", + "clang-tidy\n", + "cquery\n", + "clangd\n", + "cppcheck\n", + "cquery\n", + "cppcheck\n", + "clangd\n", + "cquery\n", + "cppcheck\n", + "clang-tidy\n", + "cppcheck\n", + "clang-tidy\n", + "cquery\n", + "armadillo\n", + "dlib\n", + "oneapi/dal\n", + "blitz\n", + "armadillo\n", + "armadillo\n", + "blitz\n", + "oneapi/dal\n", + "blitz\n", + "oneapi/dal\n", + "boost.serialization\n", + "frozen\n", + "cppcodec\n", + "glaze\n", + "cppcodec\n", + "frozen\n", + "boost.serialization\n", + "cppcodec\n", + "frozen\n", + "boost.serialization\n", + "glaze\n", + "sqlite\n", + "libpqxx\n", + "clickhouse\n", + "sqlite\n", + "leveldb\n", + "clickhouse\n", + "sqlite\n", + "llfio\n", + "tinydir\n", + "llfio\n", + "tinydir\n", + "filesystem\n", + "tinydir\n", + "llfio\n", + "llfio\n", + "filesystem\n", + "tinydir\n", + "wxwidgets\n", + "cegui\n", + "nanogui\n", + "gtk\n", + "cegui\n", + "wxwidgets\n", + "gtk\n", + "cegui\n", + "nanogui\n", + "wxwidgets\n", + "cegui\n", + "wxwidgets\n", + "workflow\n", + "taskflow\n", + "workflow\n", + "libthrift\n", + "taskflow\n", + "infra\n", + "libthrift\n", + "infra\n", + "libthrift\n", + "libthrift\n", + "spdlog\n", + "reckless\n", + "spdlog\n", + "boost.log\n", + "spdlog\n", + "glog\n", + "boost.log\n", + "glog\n", + "spdlog\n", + "reckless\n", + "linalg\n", + "blaze\n", + "cnl\n", + "cnl\n", + "eigen\n", + "eigen\n", + "blaze\n", + "eigen\n", + "cnl\n", + "cnl\n", + "linalg\n", + "eigen\n", + "azmq\n", + "zmq\n", + "azmq\n", + "boost.asio\n", + "cpp-netlib\n", + "boost.asio\n", + "time\n", + "thread-pool\n", + "concurrencpp\n", + "time\n", + "chrono\n", + "concurrencpp\n", + "concurrencpp\n", + "time\n", + "chrono\n", + "thread-pool\n", + "libgcrypt\n", + "digestpp\n", + "libgcrypt\n", + "digestpp\n", + "libressl\n", + "libgcrypt\n", + "digestpp\n", + "libressl\n", + "digestpp\n", + "libgcrypt\n", + "cpputest\n", + "ctest\n", + "cpputest\n", + "ctest\n", + "benchmark\n", + "ctest\n", + "boost.test\n", + "cpputest\n", + "cpputest\n", + "libfv\n", + "jwt-cpp\n", + "jwt-cpp\n", + "libfv\n", + "libonion\n", + "libfv\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + 
"list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "list index out of range\n", + "1\n", + "\n", + "output table has 400 rows and 6 columns\n", + "output metadata : {'source_documents': 5, 'result_documents': 6}\n", + "Table 0 saved to /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/syntactic_concept_extractor/output/uast_table_part_0.parquet\n", + "Process completed successfully.\n" + ] + }, + { + "data": { + "text/plain": [ + "'/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/semantic_profiler/python/test-data/input/uast_table_part_0.parquet'" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# runnning the syntactic_concept_extractor transform\n", + "run_make_command(sce_code_dir, ['make', 'venv'])\n", + "run_make_command(sce_code_dir, ['make', 'run-local-sample'])\n", + "\n", + "# Copy the output of the syntactic_concept_extractor transform to the input folder of the semantic_profiler\n", + "output_pq = '/uast_table_part_0.parquet'\n", + "shutil.copy2(sce_output_dir + output_pq, sp_input_dir + output_pq)" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 64, + "id": "b6458fa0-e459-47b9-951b-a209e59c4481", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Changed directory to: /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/semantic_profiler/python\n", + "python -m venv venv\n", + "Requirement already satisfied: pip in ./venv/lib/python3.11/site-packages (24.2)\n", + "Requirement already satisfied: wheel in ./venv/lib/python3.11/site-packages (0.44.0)\n", + "Requirement already satisfied: pytest in ./venv/lib/python3.11/site-packages (8.3.3)\n", + "Requirement already satisfied: pytest-cov in ./venv/lib/python3.11/site-packages (5.0.0)\n", + "Requirement already satisfied: iniconfig in ./venv/lib/python3.11/site-packages (from pytest) (2.0.0)\n", + "Requirement already satisfied: packaging in ./venv/lib/python3.11/site-packages (from pytest) (24.1)\n", + "Requirement already satisfied: pluggy<2,>=1.5 in ./venv/lib/python3.11/site-packages (from pytest) (1.5.0)\n", + "Requirement already satisfied: coverage>=5.2.1 in ./venv/lib/python3.11/site-packages (from coverage[toml]>=5.2.1->pytest-cov) (7.6.1)\n", + "Installing Python data processing library source to existing venv\n", + "pip uninstall -y data-prep-toolkit \n", + "Found existing installation: data_prep_toolkit 0.2.1.dev0\n", + "Uninstalling data_prep_toolkit-0.2.1.dev0:\n", + " Successfully uninstalled data_prep_toolkit-0.2.1.dev0\n", + "Begin installing source from ../../../../data-processing-lib/python into venv\n", + "Obtaining file:///Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/data-processing-lib/python\n", + " Installing build dependencies: started\n", + " Installing build dependencies: finished with status 'done'\n", + " Checking if build backend supports build_editable: started\n", + " Checking if build backend supports build_editable: finished with status 'done'\n", + " Getting requirements to build editable: started\n", + " Getting requirements to build editable: finished with status 'done'\n", + " Preparing editable metadata (pyproject.toml): started\n", + " Preparing editable metadata (pyproject.toml): finished with status 'done'\n", + "Requirement already satisfied: numpy<1.29.0 in ./venv/lib/python3.11/site-packages (from data_prep_toolkit==0.2.2.dev0) (1.26.4)\n", + "Requirement already satisfied: pyarrow==16.1.0 in ./venv/lib/python3.11/site-packages (from data_prep_toolkit==0.2.2.dev0) (16.1.0)\n", + "Requirement already satisfied: boto3==1.34.69 in ./venv/lib/python3.11/site-packages (from data_prep_toolkit==0.2.2.dev0) (1.34.69)\n", + "Collecting argparse (from data_prep_toolkit==0.2.2.dev0)\n", + " Using cached argparse-1.4.0-py2.py3-none-any.whl.metadata (2.8 kB)\n", + "Requirement already satisfied: mmh3 in ./venv/lib/python3.11/site-packages (from data_prep_toolkit==0.2.2.dev0) (5.0.1)\n", + "Requirement already satisfied: botocore<1.35.0,>=1.34.69 in ./venv/lib/python3.11/site-packages (from boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (1.34.162)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in ./venv/lib/python3.11/site-packages (from boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (1.0.1)\n", + "Requirement already satisfied: s3transfer<0.11.0,>=0.10.0 in ./venv/lib/python3.11/site-packages (from boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (0.10.2)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in 
./venv/lib/python3.11/site-packages (from botocore<1.35.0,>=1.34.69->boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (2.9.0.post0)\n", + "Requirement already satisfied: urllib3!=2.2.0,<3,>=1.25.4 in ./venv/lib/python3.11/site-packages (from botocore<1.35.0,>=1.34.69->boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (2.2.3)\n", + "Requirement already satisfied: six>=1.5 in ./venv/lib/python3.11/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.35.0,>=1.34.69->boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (1.16.0)\n", + "Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)\n", + "Building wheels for collected packages: data_prep_toolkit\n", + " Building editable for data_prep_toolkit (pyproject.toml): started\n", + " Building editable for data_prep_toolkit (pyproject.toml): finished with status 'done'\n", + " Created wheel for data_prep_toolkit: filename=data_prep_toolkit-0.2.2.dev0-0.editable-py3-none-any.whl size=2269 sha256=1e48b0e59e63118611e05d6b17d1c64ad91cb4a2bc3b1d517b608040d218a3ff\n", + " Stored in directory: /private/var/folders/f7/736gtk714sg36xlcqnjx16xw0000gn/T/pip-ephem-wheel-cache-7425eo0g/wheels/eb/18/bc/2b7f918df1f95453f58d1ccde4ae0ea1bfd48079af6ea338f8\n", + "Successfully built data_prep_toolkit\n", + "Installing collected packages: argparse, data_prep_toolkit\n", + "Successfully installed argparse-1.4.0 data_prep_toolkit-0.2.2.dev0\n", + "Done installing source from ../../../../data-processing-lib/python into venv\n", + "Installed source from Python processing library for /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/semantic_profiler/python/venv/bin/python\n", + "Installing from pyproject.toml\n", + "Obtaining file:///Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/semantic_profiler/python\n", + " Installing build dependencies: started\n", + " Installing build dependencies: finished with status 'done'\n", + " Checking if build backend supports build_editable: started\n", + " Checking if build backend supports build_editable: finished with status 'done'\n", + " Getting requirements to build editable: started\n", + " Getting requirements to build editable: finished with status 'done'\n", + " Preparing editable metadata (pyproject.toml): started\n", + " Preparing editable metadata (pyproject.toml): finished with status 'done'\n", + "Collecting data-prep-toolkit==0.2.1.dev0 (from dpk_sp_transform_python==0.2.1.dev0)\n", + " Using cached data_prep_toolkit-0.2.1.dev0-py3-none-any.whl.metadata (1.9 kB)\n", + "Requirement already satisfied: pyarrow==16.1.0 in ./venv/lib/python3.11/site-packages (from data-prep-toolkit==0.2.1.dev0->dpk_sp_transform_python==0.2.1.dev0) (16.1.0)\n", + "Requirement already satisfied: boto3==1.34.69 in ./venv/lib/python3.11/site-packages (from data-prep-toolkit==0.2.1.dev0->dpk_sp_transform_python==0.2.1.dev0) (1.34.69)\n", + "Collecting argparse (from data-prep-toolkit==0.2.1.dev0->dpk_sp_transform_python==0.2.1.dev0)\n", + " Using cached argparse-1.4.0-py2.py3-none-any.whl.metadata (2.8 kB)\n", + "Requirement already satisfied: mmh3 in ./venv/lib/python3.11/site-packages (from data-prep-toolkit==0.2.1.dev0->dpk_sp_transform_python==0.2.1.dev0) (5.0.1)\n", + "Requirement already satisfied: botocore<1.35.0,>=1.34.69 in ./venv/lib/python3.11/site-packages (from boto3==1.34.69->data-prep-toolkit==0.2.1.dev0->dpk_sp_transform_python==0.2.1.dev0) (1.34.162)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in 
./venv/lib/python3.11/site-packages (from boto3==1.34.69->data-prep-toolkit==0.2.1.dev0->dpk_sp_transform_python==0.2.1.dev0) (1.0.1)\n", + "Requirement already satisfied: s3transfer<0.11.0,>=0.10.0 in ./venv/lib/python3.11/site-packages (from boto3==1.34.69->data-prep-toolkit==0.2.1.dev0->dpk_sp_transform_python==0.2.1.dev0) (0.10.2)\n", + "Requirement already satisfied: numpy>=1.16.6 in ./venv/lib/python3.11/site-packages (from pyarrow==16.1.0->data-prep-toolkit==0.2.1.dev0->dpk_sp_transform_python==0.2.1.dev0) (1.26.4)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in ./venv/lib/python3.11/site-packages (from botocore<1.35.0,>=1.34.69->boto3==1.34.69->data-prep-toolkit==0.2.1.dev0->dpk_sp_transform_python==0.2.1.dev0) (2.9.0.post0)\n", + "Requirement already satisfied: urllib3!=2.2.0,<3,>=1.25.4 in ./venv/lib/python3.11/site-packages (from botocore<1.35.0,>=1.34.69->boto3==1.34.69->data-prep-toolkit==0.2.1.dev0->dpk_sp_transform_python==0.2.1.dev0) (2.2.3)\n", + "Requirement already satisfied: six>=1.5 in ./venv/lib/python3.11/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.35.0,>=1.34.69->boto3==1.34.69->data-prep-toolkit==0.2.1.dev0->dpk_sp_transform_python==0.2.1.dev0) (1.16.0)\n", + "Using cached data_prep_toolkit-0.2.1.dev0-py3-none-any.whl (63 kB)\n", + "Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)\n", + "Building wheels for collected packages: dpk_sp_transform_python\n", + " Building editable for dpk_sp_transform_python (pyproject.toml): started\n", + " Building editable for dpk_sp_transform_python (pyproject.toml): finished with status 'done'\n", + " Created wheel for dpk_sp_transform_python: filename=dpk_sp_transform_python-0.2.1.dev0-0.editable-py3-none-any.whl size=3029 sha256=b974ec3554a35c3eed9404259858fe9e0f32bae721292b2720ba1d0f1554c09a\n", + " Stored in directory: /private/var/folders/f7/736gtk714sg36xlcqnjx16xw0000gn/T/pip-ephem-wheel-cache-tz9u49oy/wheels/2b/2b/a9/43d967436e095f83fb907da3c3f1e509b63e5e34561366fcc5\n", + "Successfully built dpk_sp_transform_python\n", + "Installing collected packages: argparse, data-prep-toolkit, dpk_sp_transform_python\n", + " Attempting uninstall: data-prep-toolkit\n", + " Found existing installation: data_prep_toolkit 0.2.2.dev0\n", + " Uninstalling data_prep_toolkit-0.2.2.dev0:\n", + " Successfully uninstalled data_prep_toolkit-0.2.2.dev0\n", + " Attempting uninstall: dpk_sp_transform_python\n", + " Found existing installation: dpk_sp_transform_python 0.2.1.dev0\n", + " Uninstalling dpk_sp_transform_python-0.2.1.dev0:\n", + " Successfully uninstalled dpk_sp_transform_python-0.2.1.dev0\n", + "Successfully installed argparse-1.4.0 data-prep-toolkit-0.2.1.dev0 dpk_sp_transform_python-0.2.1.dev0\n", + "Process completed successfully.\n", + "Changed directory to: /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/semantic_profiler/python\n", + "/Library/Developer/CommandLineTools/usr/bin/make RUN_FILE=sp_local_python.py .transforms.run-src-file\n", + "source venv/bin/activate;\t\\\n", + "\tcd src;\t\t\t\t\\\n", + "\tpython sp_local_python.py \"\"\n", + "Process completed successfully.\n" + ] + }, + { + "data": { + "text/plain": [ + "'/Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/higher_order_syntactic_profiler/python/test-data/input/uast_table_part_0.parquet'" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 
runnning the semantic_profiler transform\n", + "run_make_command(sp_code_dir, ['make', 'venv'])\n", + "run_make_command(sp_code_dir, ['make', 'run-local-python-sample'])\n", + "\n", + "# Copy the output of the semantic_profiler transform to the input folder of the higher_order_syntactic_profiler\n", + "shutil.copy2(sp_output_dir + output_pq, hosp_input_dir + output_pq)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "9bedf5dd-92d6-4a83-abb5-f8439b80ee02", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Changed directory to: /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/higher_order_syntactic_profiler/python\n", + "python -m venv venv\n", + "Requirement already satisfied: pip in ./venv/lib/python3.11/site-packages (24.2)\n", + "Requirement already satisfied: wheel in ./venv/lib/python3.11/site-packages (0.44.0)\n", + "Requirement already satisfied: pytest in ./venv/lib/python3.11/site-packages (8.3.3)\n", + "Requirement already satisfied: pytest-cov in ./venv/lib/python3.11/site-packages (5.0.0)\n", + "Requirement already satisfied: iniconfig in ./venv/lib/python3.11/site-packages (from pytest) (2.0.0)\n", + "Requirement already satisfied: packaging in ./venv/lib/python3.11/site-packages (from pytest) (24.1)\n", + "Requirement already satisfied: pluggy<2,>=1.5 in ./venv/lib/python3.11/site-packages (from pytest) (1.5.0)\n", + "Requirement already satisfied: coverage>=5.2.1 in ./venv/lib/python3.11/site-packages (from coverage[toml]>=5.2.1->pytest-cov) (7.6.1)\n", + "Installing Python data processing library source to existing venv\n", + "pip uninstall -y data-prep-toolkit \n", + "Found existing installation: data_prep_toolkit 0.2.1.dev0\n", + "Uninstalling data_prep_toolkit-0.2.1.dev0:\n", + " Successfully uninstalled data_prep_toolkit-0.2.1.dev0\n", + "Begin installing source from ../../../../data-processing-lib/python into venv\n", + "Obtaining file:///Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/data-processing-lib/python\n", + " Installing build dependencies: started\n", + " Installing build dependencies: finished with status 'done'\n", + " Checking if build backend supports build_editable: started\n", + " Checking if build backend supports build_editable: finished with status 'done'\n", + " Getting requirements to build editable: started\n", + " Getting requirements to build editable: finished with status 'done'\n", + " Preparing editable metadata (pyproject.toml): started\n", + " Preparing editable metadata (pyproject.toml): finished with status 'done'\n", + "Requirement already satisfied: numpy<1.29.0 in ./venv/lib/python3.11/site-packages (from data_prep_toolkit==0.2.2.dev0) (1.26.4)\n", + "Requirement already satisfied: pyarrow==16.1.0 in ./venv/lib/python3.11/site-packages (from data_prep_toolkit==0.2.2.dev0) (16.1.0)\n", + "Requirement already satisfied: boto3==1.34.69 in ./venv/lib/python3.11/site-packages (from data_prep_toolkit==0.2.2.dev0) (1.34.69)\n", + "Collecting argparse (from data_prep_toolkit==0.2.2.dev0)\n", + " Using cached argparse-1.4.0-py2.py3-none-any.whl.metadata (2.8 kB)\n", + "Requirement already satisfied: mmh3 in ./venv/lib/python3.11/site-packages (from data_prep_toolkit==0.2.2.dev0) (5.0.1)\n", + "Requirement already satisfied: botocore<1.35.0,>=1.34.69 in ./venv/lib/python3.11/site-packages (from boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) 
(1.34.162)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in ./venv/lib/python3.11/site-packages (from boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (1.0.1)\n", + "Requirement already satisfied: s3transfer<0.11.0,>=0.10.0 in ./venv/lib/python3.11/site-packages (from boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (0.10.2)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in ./venv/lib/python3.11/site-packages (from botocore<1.35.0,>=1.34.69->boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (2.9.0.post0)\n", + "Requirement already satisfied: urllib3!=2.2.0,<3,>=1.25.4 in ./venv/lib/python3.11/site-packages (from botocore<1.35.0,>=1.34.69->boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (2.2.3)\n", + "Requirement already satisfied: six>=1.5 in ./venv/lib/python3.11/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.35.0,>=1.34.69->boto3==1.34.69->data_prep_toolkit==0.2.2.dev0) (1.16.0)\n", + "Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)\n", + "Building wheels for collected packages: data_prep_toolkit\n", + " Building editable for data_prep_toolkit (pyproject.toml): started\n", + " Building editable for data_prep_toolkit (pyproject.toml): finished with status 'done'\n", + " Created wheel for data_prep_toolkit: filename=data_prep_toolkit-0.2.2.dev0-0.editable-py3-none-any.whl size=2269 sha256=ca207188a4f4284ff51582aa6e5ad215097da2b72edce0a0ee5abf4b66c9753c\n", + " Stored in directory: /private/var/folders/f7/736gtk714sg36xlcqnjx16xw0000gn/T/pip-ephem-wheel-cache-o861wrsm/wheels/eb/18/bc/2b7f918df1f95453f58d1ccde4ae0ea1bfd48079af6ea338f8\n", + "Successfully built data_prep_toolkit\n", + "Installing collected packages: argparse, data_prep_toolkit\n", + "Successfully installed argparse-1.4.0 data_prep_toolkit-0.2.2.dev0\n", + "Done installing source from ../../../../data-processing-lib/python into venv\n", + "Installed source from Python processing library for /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/higher_order_syntactic_profiler/python/venv/bin/python\n", + "Installing from pyproject.toml\n", + "Obtaining file:///Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/higher_order_syntactic_profiler/python\n", + " Installing build dependencies: started\n", + " Installing build dependencies: finished with status 'done'\n", + " Checking if build backend supports build_editable: started\n", + " Checking if build backend supports build_editable: finished with status 'done'\n", + " Getting requirements to build editable: started\n", + " Getting requirements to build editable: finished with status 'done'\n", + " Preparing editable metadata (pyproject.toml): started\n", + " Preparing editable metadata (pyproject.toml): finished with status 'done'\n", + "Collecting data-prep-toolkit==0.2.1.dev0 (from dpk_hosp_transform_python==0.2.1.dev0)\n", + " Using cached data_prep_toolkit-0.2.1.dev0-py3-none-any.whl.metadata (1.9 kB)\n", + "Requirement already satisfied: networkx==3.0.0 in ./venv/lib/python3.11/site-packages (from dpk_hosp_transform_python==0.2.1.dev0) (3.0)\n", + "Requirement already satisfied: jinja2==3.1.2 in ./venv/lib/python3.11/site-packages (from dpk_hosp_transform_python==0.2.1.dev0) (3.1.2)\n", + "Requirement already satisfied: plotly==5.15.0 in ./venv/lib/python3.11/site-packages (from dpk_hosp_transform_python==0.2.1.dev0) (5.15.0)\n", + "Requirement already satisfied: matplotlib==3.9.0 in 
./venv/lib/python3.11/site-packages (from dpk_hosp_transform_python==0.2.1.dev0) (3.9.0)\n", + "Requirement already satisfied: matplotlib-inline==0.1.7 in ./venv/lib/python3.11/site-packages (from dpk_hosp_transform_python==0.2.1.dev0) (0.1.7)\n", + "Requirement already satisfied: pyarrow==16.1.0 in ./venv/lib/python3.11/site-packages (from data-prep-toolkit==0.2.1.dev0->dpk_hosp_transform_python==0.2.1.dev0) (16.1.0)\n", + "Requirement already satisfied: boto3==1.34.69 in ./venv/lib/python3.11/site-packages (from data-prep-toolkit==0.2.1.dev0->dpk_hosp_transform_python==0.2.1.dev0) (1.34.69)\n", + "Collecting argparse (from data-prep-toolkit==0.2.1.dev0->dpk_hosp_transform_python==0.2.1.dev0)\n", + " Using cached argparse-1.4.0-py2.py3-none-any.whl.metadata (2.8 kB)\n", + "Requirement already satisfied: mmh3 in ./venv/lib/python3.11/site-packages (from data-prep-toolkit==0.2.1.dev0->dpk_hosp_transform_python==0.2.1.dev0) (5.0.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in ./venv/lib/python3.11/site-packages (from jinja2==3.1.2->dpk_hosp_transform_python==0.2.1.dev0) (2.1.5)\n", + "Requirement already satisfied: contourpy>=1.0.1 in ./venv/lib/python3.11/site-packages (from matplotlib==3.9.0->dpk_hosp_transform_python==0.2.1.dev0) (1.3.0)\n", + "Requirement already satisfied: cycler>=0.10 in ./venv/lib/python3.11/site-packages (from matplotlib==3.9.0->dpk_hosp_transform_python==0.2.1.dev0) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in ./venv/lib/python3.11/site-packages (from matplotlib==3.9.0->dpk_hosp_transform_python==0.2.1.dev0) (4.54.1)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in ./venv/lib/python3.11/site-packages (from matplotlib==3.9.0->dpk_hosp_transform_python==0.2.1.dev0) (1.4.7)\n", + "Requirement already satisfied: numpy>=1.23 in ./venv/lib/python3.11/site-packages (from matplotlib==3.9.0->dpk_hosp_transform_python==0.2.1.dev0) (1.26.4)\n", + "Requirement already satisfied: packaging>=20.0 in ./venv/lib/python3.11/site-packages (from matplotlib==3.9.0->dpk_hosp_transform_python==0.2.1.dev0) (24.1)\n", + "Requirement already satisfied: pillow>=8 in ./venv/lib/python3.11/site-packages (from matplotlib==3.9.0->dpk_hosp_transform_python==0.2.1.dev0) (10.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in ./venv/lib/python3.11/site-packages (from matplotlib==3.9.0->dpk_hosp_transform_python==0.2.1.dev0) (3.1.4)\n", + "Requirement already satisfied: python-dateutil>=2.7 in ./venv/lib/python3.11/site-packages (from matplotlib==3.9.0->dpk_hosp_transform_python==0.2.1.dev0) (2.9.0.post0)\n", + "Requirement already satisfied: traitlets in ./venv/lib/python3.11/site-packages (from matplotlib-inline==0.1.7->dpk_hosp_transform_python==0.2.1.dev0) (5.14.3)\n", + "Requirement already satisfied: tenacity>=6.2.0 in ./venv/lib/python3.11/site-packages (from plotly==5.15.0->dpk_hosp_transform_python==0.2.1.dev0) (9.0.0)\n", + "Requirement already satisfied: botocore<1.35.0,>=1.34.69 in ./venv/lib/python3.11/site-packages (from boto3==1.34.69->data-prep-toolkit==0.2.1.dev0->dpk_hosp_transform_python==0.2.1.dev0) (1.34.162)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in ./venv/lib/python3.11/site-packages (from boto3==1.34.69->data-prep-toolkit==0.2.1.dev0->dpk_hosp_transform_python==0.2.1.dev0) (1.0.1)\n", + "Requirement already satisfied: s3transfer<0.11.0,>=0.10.0 in ./venv/lib/python3.11/site-packages (from boto3==1.34.69->data-prep-toolkit==0.2.1.dev0->dpk_hosp_transform_python==0.2.1.dev0) (0.10.2)\n", + 
"Requirement already satisfied: six>=1.5 in ./venv/lib/python3.11/site-packages (from python-dateutil>=2.7->matplotlib==3.9.0->dpk_hosp_transform_python==0.2.1.dev0) (1.16.0)\n", + "Requirement already satisfied: urllib3!=2.2.0,<3,>=1.25.4 in ./venv/lib/python3.11/site-packages (from botocore<1.35.0,>=1.34.69->boto3==1.34.69->data-prep-toolkit==0.2.1.dev0->dpk_hosp_transform_python==0.2.1.dev0) (2.2.3)\n", + "Using cached data_prep_toolkit-0.2.1.dev0-py3-none-any.whl (63 kB)\n", + "Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)\n", + "Building wheels for collected packages: dpk_hosp_transform_python\n", + " Building editable for dpk_hosp_transform_python (pyproject.toml): started\n", + " Building editable for dpk_hosp_transform_python (pyproject.toml): finished with status 'done'\n", + " Created wheel for dpk_hosp_transform_python: filename=dpk_hosp_transform_python-0.2.1.dev0-0.editable-py3-none-any.whl size=2998 sha256=3271dc1db830b824f1eaa39f145d1d27cdc76ad17ad5c80bdd29632bd33e9474\n", + " Stored in directory: /private/var/folders/f7/736gtk714sg36xlcqnjx16xw0000gn/T/pip-ephem-wheel-cache-44v7a13k/wheels/42/6a/97/42ad1db1552a59fdbe3c8110698b7fd0e6308328b1ce136aa7\n", + "Successfully built dpk_hosp_transform_python\n", + "Installing collected packages: argparse, data-prep-toolkit, dpk_hosp_transform_python\n", + " Attempting uninstall: data-prep-toolkit\n", + " Found existing installation: data_prep_toolkit 0.2.2.dev0\n", + " Uninstalling data_prep_toolkit-0.2.2.dev0:\n", + " Successfully uninstalled data_prep_toolkit-0.2.2.dev0\n", + " Attempting uninstall: dpk_hosp_transform_python\n", + " Found existing installation: dpk_hosp_transform_python 0.2.1.dev0\n", + " Uninstalling dpk_hosp_transform_python-0.2.1.dev0:\n", + " Successfully uninstalled dpk_hosp_transform_python-0.2.1.dev0\n", + "Successfully installed argparse-1.4.0 data-prep-toolkit-0.2.1.dev0 dpk_hosp_transform_python-0.2.1.dev0\n", + "Process completed successfully.\n", + "Changed directory to: /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/higher_order_syntactic_profiler/python\n", + "/Library/Developer/CommandLineTools/usr/bin/make RUN_FILE=hosp_local_python.py .transforms.run-src-file\n", + "source venv/bin/activate;\t\\\n", + "\tcd src;\t\t\t\t\\\n", + "\tpython hosp_local_python.py \"\"\n", + "HTML file generated: /Users/pankajthorat/challenges/7122-DP4genAI/os-data-prep-kit/v2-dpk/demo-dpk/data-prep-kit/transforms/code/higher_order_syntactic_profiler/python/src/output.html\n", + "Process completed successfully.\n" + ] + } + ], + "source": [ + "# runnning the higher_order_syntactic_profiler transform\n", + "run_make_command(hosp_code_dir, ['make', 'venv'])\n", + "run_make_command(hosp_code_dir, ['make', 'run-local-python-sample'])" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "96166877-6c6d-4959-ba95-84a0716a0c5f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + " \n", + " Profiler Report\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "
\n", + "
\n", + "

Syntactic and Semantic Profile

\n", + "

This report presents the detailed profiling report of the input dataset.

\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "

Available Metrics

\n", + " \n", + "
\n", + "\n", + " \n", + "
\n", + "
\n", + " \n", + "
\n", + "

Library

\n", + "

\n", + " \n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + "
\n", + "

Language

\n", + "

\n", + " \n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + "
\n", + "

Concepts

\n", + "

\n", + " \n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + "
\n", + "

CCR

\n", + "

\n", + " \n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# display the output html report\n", + "display_html_file(hosp_code_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07ae0e0d-896e-4396-b63a-207fe7fa6141", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dataprepkit", + "language": "python", + "name": "data-prep-kit" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/code/syntactic_concept_extractor/python/Makefile b/transforms/code/syntactic_concept_extractor/python/Makefile index 0062cdd7d..f81a7e709 100644 --- a/transforms/code/syntactic_concept_extractor/python/Makefile +++ b/transforms/code/syntactic_concept_extractor/python/Makefile @@ -11,7 +11,9 @@ TRANSFORM_NAME=syntactic_concept_extractor include $(REPOROOT)/transforms/.make.transforms -venv:: .transforms.python-venv +export RUNTIME_HOST_ARCH=x86_64 + +venv:: .transforms.python-venv test:: .transforms.python-test @@ -39,6 +41,9 @@ build-dist:: .defaults.build-dist publish-dist:: .defaults.publish-dist +test-image: + @echo "Skipping test-image step as per configuration." + # Ensure RUN_ARGS has a default value RUN_ARGS ?= "" diff --git a/transforms/code/syntactic_concept_extractor/python/pyproject.toml b/transforms/code/syntactic_concept_extractor/python/pyproject.toml index dac490ec8..38b65a615 100644 --- a/transforms/code/syntactic_concept_extractor/python/pyproject.toml +++ b/transforms/code/syntactic_concept_extractor/python/pyproject.toml @@ -104,6 +104,7 @@ dependencies = [ "typing_extensions==4.12.2", "tzdata==2024.1", "urllib3==2.2.2", + "uuid", "wcwidth==0.2.13", "wrapt==1.16.0", ] diff --git a/transforms/code/syntactic_concept_extractor/python/src/LLM_runner_app.py b/transforms/code/syntactic_concept_extractor/python/src/LLM_runner_app.py deleted file mode 100644 index 90a7b3c08..000000000 --- a/transforms/code/syntactic_concept_extractor/python/src/LLM_runner_app.py +++ /dev/null @@ -1,498 +0,0 @@ -# import neccesary packages -from genai.client import Client -from genai.credentials import Credentials -from tree_sitter import Parser, Language -import json -from tree_sitter_languages import get_language -import glob -import os -from time import sleep -import streamlit as st -from annotated_text import annotated_text -import re -from config_LLM_runner_app import API_ENDPOINT, API_KEY - -# Flag to dictate if it is concept-level pruning -GET_CONCEPTS_ONLY = False -# Flag to dictate if it is text based input -TEXT_TEST_CONCEPT = False - -# enter your BAM API key here, or alternatively use os.environ -# You can alternatively, switch this to any model API. You have to change the request simlultaneously -if 'client' not in st.session_state: - credentials = Credentials(api_key= API_KEY, api_endpoint = API_ENDPOINT) - st.session_state['client'] = Client(credentials=credentials) - -# load the cached requirements. This JSON contains important information about Concept nodes and language mapping to binding name. 
-if 'cached_requirements' not in st.session_state: - st.session_state['cached_requirements'] = json.load(open('cached_requirements.json', 'r')) - -# Load the neccesary maps. You can change them in the cached_requirements JSON and it will change dynamically. -### -formal_language_example_map = st.session_state['cached_requirements']['formal_language_example_map'] -formal_language_map = st.session_state['cached_requirements']['formal_language_map'] -formal_concept_map = st.session_state['cached_requirements']['formal_concept_map'] -formal_model_card_map = st.session_state['cached_requirements']['formal_model_card_map'] -concept_to_node_map = st.session_state['cached_requirements']['concept_to_node_map'] -### - -# option to select the few-shot examples -example_languages = st.sidebar.multiselect("Select the known languages to give few shot examples", list(formal_language_example_map.keys())) - -# option to choose the test language. If it is not present here, look at the 'Adding new language' section in the documentation. -test_language = st.sidebar.selectbox("Select the unknown language you want to test", list(set(formal_language_map.keys()) - set(example_languages))) - -# option to select the input method. If it is not present locally, change it to text-input -test_method = st.sidebar.selectbox("How do you want to test?", ["Local Files", "User Input"]) - -# set the flag for text-based input -if (test_method == "User Input"): - TEXT_TEST_CONCEPT = True - -# initialise the snippet -test_code_snippet = None - -# get input -if TEXT_TEST_CONCEPT: - test_code_snippet = st.sidebar.text_area("Enter code snippet of the language used", height= 200) - -# choose the concept to give ti=o extract rules for -test_concept = st.sidebar.selectbox("Select the UAST concept you want to extract", list(formal_concept_map.keys())) - -# get the current few_shot examples present within the data. -present_examples = os.listdir('./data/few_shot_outputs/') - -# file numbers are important as there can be multiple relevant nodes. -test_file_num = 0 - -# option to choose the model. -model = st.sidebar.selectbox("Select the model you want to run the query on", list(formal_model_card_map.keys())) - -# choose the pruning method. -pruning_method = st.sidebar.selectbox("Select the pruning method to apply to the example ASTs", ["Concept-Level Pruning", "No Pruning", "Depth-Level Pruning"]) - -# set to infinity for No-pruning. -max_depth = float('inf') - -# set flags and depth levels for different techniques. Giving the option to choose depth. -if (pruning_method == "Depth-Level Pruning"): - max_depth = st.sidebar.slider('Select the pruning depth of the AST', min_value= 1, max_value= 5, value = 3) - -elif (pruning_method == "Concept-Level Pruning"): - GET_CONCEPTS_ONLY = True - max_depth = st.sidebar.slider('Select the pruning depth of the test AST', min_value = 1, max_value = 5, value= 3) - -# few-shot example languages -example_languages = [formal_language_map[lang] for lang in example_languages] - -# test language. -test_language = formal_language_map[test_language] - -# get the formal concept name -test_concept = formal_concept_map[test_concept] - -# get the full model name -model = formal_model_card_map[model] - -# map to store number of present examples. -if 'number_of_examples' not in st.session_state: - st.session_state['number_of_examples'] = dict() - -# save in session state -st.session_state['Languages'] = example_languages - -# if its to fetch from local storage, append the test to the example-languages. 
-if not TEXT_TEST_CONCEPT: - st.session_state['Languages'] = example_languages + [test_language] - - -""" -Function to convert and AST node into a string with requiring only relevant data. -Requires the ID of the node, the node type, the code snippet and the parent id. -""" -def create_node(id, node, parent_id): - req_string = f"< node_id = {id}, node_type = {node.type}, code_snippet = {repr(node.text.decode('utf8'))}, parent_id = {parent_id} >" - return req_string - -""" -Function to recursively assign ID and preprocess the AST in a concept-level pruning manner to get it into a parse-able format to pass to the LLM. -dfs_id() function allocates a unique ID on preorder traversal basis to the treenode. -_dfs() function recursively parses the tree to the relevant node, while storing the code snippet relevant to a unique ID node. -""" -def get_concept_tree(tree, language): - ast_repr = [] - code_snippets = dict() - id_dictionary = dict() - - def dfs_id(node): - id_dictionary[node] = len(id_dictionary) - for child in node.children: - dfs_id(child) - - dfs_id(tree.root_node) - - def _dfs(node, parent): - if (node.type in concept_to_node_map[language][test_concept]): - ast_repr.append(create_node(id_dictionary[node], node, id_dictionary[parent])) - code_snippets[id_dictionary[node]] = node.text.decode("utf8") - for child in node.children: - _dfs(child, node) - - for child in tree.root_node.children: - _dfs(child, tree.root_node) - - return ast_repr, code_snippets - - -""" -Function to recursively assign ID and preprocess the AST in a K-level-depth pruning manner to get it into a parse-able format to pass to the LLM. -dfs_id() function allocates a unique ID on preorder traversal basis to the treenode. -_dfs() function recursively parses the tree to the relevant node, while storing the code snippet relevant to a unique ID node. -""" -def get_tree(tree, k): - ast_repr = [] - code_snippets = dict() - id_dictionary = dict() - - def dfs_id(node): - id_dictionary[node] = len(id_dictionary) - for child in node.children: - dfs_id(child) - - dfs_id(tree.root_node) - - def _dfs(node, depth, parent): - if (depth >= k): - return - ast_repr.append(create_node(id_dictionary[node], node, id_dictionary[parent])) - code_snippets[id_dictionary[node]] = node.text.decode("utf8") - for child in node.children: - _dfs(child, depth + 1, node) - - # _dfs(tree.root_node, -1, tree.root_node) - for child in tree.root_node.children: - _dfs(child, 0, tree.root_node) - - return ast_repr, code_snippets - -# initialise an AST parser. -parser = Parser() - -# use bindings from tree_sitter_language library. -if 'language_binding' not in st.session_state: - st.session_state['language_binding'] = { - "cpp" : get_language("cpp"), - "py" : get_language('python'), - "java" : get_language("java"), - "go" : get_language("go"), - "js" : get_language("javascript"), - "ts" : get_language("typescript"), - "perl" : get_language("perl"), - "php" : get_language("php"), - "ocaml" : get_language("ocaml") - } - # uising the normal tree-sitter bindings locally for the laguages present in the cached_requirements json. - for binding in os.listdir('../../input/tree-sitter-bindings'): - name = binding.split('-bindings', 1)[0] - # print(name) - if name in st.session_state['language_binding']: - continue - try: - language_obj = Language('tree-sitter-bindings/' + binding, name) - except Exception as e: - print(e) - print(name) - exit() - st.session_state['language_binding'][name] = language_obj - -#initialize session states to contain all the outputs. 
-if 'all_few_shot_outputs' not in st.session_state: - st.session_state['all_few_shot_outputs'] = dict() - -if 'all_asts' not in st.session_state: - st.session_state['all_asts'] = dict() - -if 'all_code_snippets' not in st.session_state: - st.session_state['all_code_snippets'] = dict() - -if 'all_concept_code_json' not in st.session_state: - st.session_state['all_concept_code_json'] = dict() - - -# get all the few_shot LLM output examples present locally -def get_all_few_shot(example_languages, test_concept, language): - for language in example_languages: - programs = os.listdir(f"./data/few_shot_outputs/uast_{test_concept}/{language}") - names = [os.path.basename(file).split('.')[0] for file in programs] - for i in range(len(programs)): - if (language not in st.session_state['all_few_shot_outputs']): - st.session_state['all_few_shot_outputs'][language] = dict() - - content = open(f"./data/few_shot_outputs/uast_{test_concept}/{language}/{programs[i]}", "r").read() - st.session_state['all_few_shot_outputs'][language][names[i]] = content - -""" get all the few_shot code examples present locally and their corresponding AST with given max depth. -This function also calls the AST preprocessor to store it in a global dictionary to retrieve in one step. -""" -def get_all_asts_code(test_concept, max_depth = 0): - for language in st.session_state['Languages']: - parser.set_language(st.session_state['language_binding'][language]) - programs = os.listdir(f"./data/Concept_dataset/uast_{test_concept}/{language}") - names = [os.path.basename(file).split('.')[0] for file in programs] - st.session_state['number_of_examples'][language] = len(programs) - for i in range(len(programs)): - if (language not in st.session_state['all_asts']): - st.session_state['all_asts'][language] = dict() - st.session_state['all_code_snippets'][language] = dict() - st.session_state['all_concept_code_json'][language] = dict() - - content = open(f"./data/Concept_dataset/uast_{test_concept}/{language}/{programs[i]}", "r").read() - st.session_state['all_code_snippets'][language][names[i]] = content - ast = parser.parse(bytes(content, "utf8")) - all_ast, all_code = None, None - if (GET_CONCEPTS_ONLY and (language != test_language)): - all_ast, all_code = get_concept_tree(ast, language) - else: - all_ast, all_code = get_tree(ast, max_depth) - st.session_state['all_asts'][language][names[i]] = str(all_ast) - st.session_state['all_concept_code_json'][language][names[i]] = all_code - -""" get all the corresponding AST with given max depth of the given text-input. -This function also calls the AST preprocessor to store it in a global dictionary to retrieve in one step. 
-""" -def get_text_test_example(language, test_code_snippet): - parser.set_language(st.session_state['language_binding'][language]) - if (language not in st.session_state['all_asts']): - st.session_state['all_asts'][language] = dict() - st.session_state['all_code_snippets'][language] = dict() - st.session_state['all_concept_code_json'][language] = dict() - st.session_state['all_code_snippets'][language]['0'] = test_code_snippet - ast = parser.parse(bytes(test_code_snippet, "utf8")) - all_ast, all_code = get_tree(ast, max_depth) - st.session_state['all_asts'][language]['0'] = str(all_ast) - st.session_state['all_concept_code_json'][language]['0'] = all_code - -# load the prompt for the concept -category_prompt_file = f"./data/prompts/{test_concept}.txt" -st.session_state['prompt'] = open(category_prompt_file, "r").read() - -# preprocessor for using the AST and code to convert it into a string -def example_builder(lang, program_num): - return f"\n{st.session_state['all_code_snippets'][lang][str(program_num)]}\n\n\n{st.session_state['all_asts'][lang][str(program_num)]}" - -# get the fewshot examples in a pluggable form to the LLM. -def get_few_shot(): - few_shot_examples = [] - for lang in example_languages: - for program_num in range(st.session_state['number_of_examples'][lang]): - few_shot_examples.append( - { - "input" : f"{example_builder(lang, program_num)}", - "output" : f"{st.session_state['all_few_shot_outputs'][lang][str(program_num)]}" - } - ) - return few_shot_examples - -# call funtions to get all such examples, codes and ASTs. -get_all_asts_code(test_concept, max_depth) -get_all_few_shot(example_languages, test_concept, test_language) -st.markdown("### Enter prompt here") - -# make a modifiable prompt -st.session_state['prompt'] = st.text_area("prompt", st.session_state['prompt'], height= 700, label_visibility="collapsed") - -# if its text-based call the function to get the AST. -if TEXT_TEST_CONCEPT: - get_text_test_example(test_language, test_code_snippet) -st.session_state['test_input'] = f"{example_builder(test_language, '0')}" - -# display the few-shot examples JSON -st.write('Training examples:') -st.write(get_few_shot()) - -# display the test JSON -st.write("Test example:") -st.write([st.session_state['test_input']]) - -""" -function to extract rule from the response. -This works because of LLM alignment to generate response in a format, with the help of few-shot examples. -""" -def get_rule_py(output_text): - content = output_text.split('```py', 1)[1].split('```', 1)[0].strip() - return content - -""" -function to extract node type from the response. -This works because of LLM alignment to generate response in a format, with the help of few-shot examples. -""" -def extract_node_type(output_text): - content = output_text.split('see that the', 1)[1].split('nodes', 1)[0].strip() - return content.strip('\'"') - -""" -function to extract IDs of all the relevant nodes from the response. -Returns a list of relevant node IDs. -This works because of LLM alignment to generate response in a format, with the help of few-shot examples. 
-""" -def extract_node_id(output_text): - content = None - try: - content = output_text.split('with ids = [', 1)[1].split(']', 1)[0].strip() - except: - try: - content = output_text.split('with id = ', 1)[1].split(',', 1)[0].strip() - except: - st.write("cant be extracted") - - if (',') not in content: - return [int(content)] - - id_strings = content.split(',') - return [int(id.strip()) for id in id_strings] - -""" -function to save the output generated by the LLM. -""" -def save_rule(language, node_type, rule, prompt, output, concept, ruleset_path, example_path, example_languages, test_code, max_depth): - ruleset_files = os.listdir(ruleset_path) - print(ruleset_files) - - # if the file is already present then just add a new mapping from the relevant node type to its corresponding rule. - if (f'UAST_rules_{language}.json' in ruleset_files): - rule_dict = json.load(open(f'{ruleset_path}/UAST_rules_{language}.json', 'r')) - rule_dict[node_type] = { - "uast_node_type": f"uast_{concept}", - "extractor": rule - } - # if it is not, then make a new dictionary with the same. - else: - rule_dict = { - node_type : { - "uast_node_type": f"uast_{concept}", - "extractor": rule - } - } - - print("saving rule for",language) - try: - try: - # try to save the rule dictionary - json.dump(rule_dict, open(f'{ruleset_path}/UAST_rules_{language}.json', 'w'), indent = 4) - print("json saved") - except Exception as e: - print("could not save rule JSON :", end = " ") - print(e) - - # make the directory to save the output. - os.makedirs(example_path + '/' + concept + '/' + language, exist_ok= True) - files_present = os.listdir(f"{example_path}/{concept}/{language}") - - # loop to check already present files. This is because of multiple relevant nodes. - counter = 0 - while(f"{counter}.txt" in files_present): - counter += 1 - - # saving the LLM output, input code, few-shot languages and the prompt. - with open(f"{example_path}/{concept}/{language}/{counter}.txt", "w") as f: - f.write(output) - - with open(f"{example_path}/{concept}/{language}/prompt_{counter}.txt", "w") as f: - f.write(prompt) - - with open(f"{example_path}/{concept}/{language}/example_languages_{counter}.txt", "w") as f: - f.write(str(example_languages) + '\n' + 'max_depth = '+ str(max_depth)) - - with open(f"{example_path}/{concept}/{language}/test_code_{counter}.txt", "w") as f: - f.write(test_code) - - os.makedirs(f"./data/few_shot_outputs/uast_{concept}/{language}", exist_ok= True) - os.makedirs(f"./data/Concept_dataset/uast_{concept}/{language}", exist_ok= True) - - # save the output as another few-shot example. - with open(f"./data/few_shot_outputs/uast_{concept}/{language}/{counter}.txt", "w") as f: - f.write(output) - - with open(f"./data/Concept_dataset/uast_{concept}/{language}/{counter}.txt", "w") as f: - f.write(test_code) - - # if everything is successful, display balloons on the screen!. - st.balloons() - print("Voila! prompt worked before i did 8410510369114 attempts! ") - except Exception as e: - print("COULD NOT SAVE FOR", language, "because :", e) - - # add concept nodes in the cached_requirements and save it. 
- if (concept in st.session_state['cached_requirements']['concept_to_node_map'][language]) : - if (node_type not in st.session_state['cached_requirements']['concept_to_node_map'][language][concept]): - st.session_state['cached_requirements']['concept_to_node_map'][language][concept].append(node_type) - else : - st.session_state['cached_requirements']['concept_to_node_map'][language][concept] = [node_type] - - - concept_to_node_map = st.session_state['cached_requirements']['concept_to_node_map'] - json.dump(st.session_state['cached_requirements'], open("cached_requirements.json", "w"), indent= 4) - -# remove new-line comments frmo the code that the LLM generates. This is done to reduce memory consumption, as the output is saved already for documentation purposes. -def remove_comments(text): - return re.sub(r"^(#.*?$)\n", "", text, flags = re.MULTILINE) - -# change the extracted keyword to self.extracted keyword to make it work for the parser. -def process_rule(text): - return remove_comments(text).replace("extracted", "self.extracted") - -# function to enable stream generation through yielding tokens. -response = None -def stream_data(): - for token in response: - yield token.results[0].generated_text - -# if the submit button is clicked, perform the subsequent operations: -if st.sidebar.button('Submit'): - - # Invoke the query to the LLM after collecting the pluggable codes and ASTs. - with st.spinner('Language model is working ...'): - response = st.session_state['client'].text.generation.create_stream( - model_id= model, - parameters = { - "decoding_method": "greedy", - "min_new_tokens": 1, - "max_new_tokens": 1024 - }, - moderations = dict(), - prompt_id = "prompt_builder", - data = { - "input": st.session_state['test_input'], - "instruction": st.session_state['prompt'], - "input_prefix": "Input:", - "output_prefix": "Output:", - "examples": get_few_shot() - } - ) - st.markdown('### Response:') - # stream output - ans = st.write_stream(stream_data) - - st.write('----------------------------------------------') - - # extract the nodes and IDs. - nodes = extract_node_id(ans) - - # extract the rule. - rule = get_rule_py(ans) - - # get the relevant code snippets from the IDs it extracted. - code_snippets = [st.session_state['all_concept_code_json'][test_language][str(test_file_num)][node] for node in nodes] - extracted = None - - # run the code for each snippet. - for i in range(len(code_snippets)): - code_snippet = code_snippets[i] - exec(rule) - st.write(f'for Node with ID = {nodes[i]} and code') - st.write(f'```{test_language}\n{code_snippet}') - annotated_text('The extracted part is', (extracted,'', 'rgba(10,50,170,0.5)')) - st.write('----------------------------------------------') - - # One-click acceptance of rule. 
- st.sidebar.button("Accept the given rule?", on_click= save_rule, args= [test_language, extract_node_type(ans), process_rule(rule), st.session_state['prompt'], ans, test_concept, "./ruleset", "./data/final_UI_outputs", example_languages, st.session_state['all_code_snippets'][test_language]['0'], max_depth]) \ No newline at end of file diff --git a/transforms/code/syntactic_concept_extractor/python/src/cached_requirements.json b/transforms/code/syntactic_concept_extractor/python/src/cached_requirements.json deleted file mode 100644 index 1893b0724..000000000 --- a/transforms/code/syntactic_concept_extractor/python/src/cached_requirements.json +++ /dev/null @@ -1,335 +0,0 @@ -{ - "concept_to_node_map": { - "py": { - "package": [ - "import_statement", - "import_from_statement" - ], - "function": [ - "function_definition", - "function_definition' node with id = 1, represents the definition of a function in the code. Incorporating this node, I can make a general rule to extract the definitions.\n\nThis python script can be executed:\n\n```py\n# we see that the function name is directly before the argument list, hence we get the snippet just before the first bracket of the argument list.\ntemp_0 = code_snippet.split('(')[0].strip() \n# as our required function name, from the snippet is the last one in this string, we split and get the last snippet, which is our function.\nextracted = temp_0.split(' ')[-1].strip()\n```\n\nThis script will extract the function name 'foo' from the given code snippet." - ], - "comment": [ - "comment" - ] - }, - "cpp": { - "package": [ - "preproc_include", - "code snippet includes a package. Hence I consider the string after the first '#include'.\ntest = code_snippet.split('#include', 1)[1].strip()\n# In the case that there are any comments, we remove them.\ntest = test.split('//')[0].strip()\nextracted = test.split('/*')[0].strip()\n# Remove angle brackets and quotes\nextracted = extracted.replace('<', '').replace('>', '').replace('\"', '').replace(\"'\", '')\n# Remove semicolons and asterisks\nextracted = extracted.replace(';', '').replace('*', '')\nprint(extracted)\n```\n\nThis script will extract the imported packages from the code snippet, removing any comments, angle brackets, quotes, semicolons, and asterisks. The output will be:\n\n```\ncassert\nclimits\niostream\nvector\n```", - "code snippet includes a package. Hence I consider the string after the first '#include'.\ntest = code_snippet.split('#include', 1)[1].strip()\n# In the case that there are any comments, we remove them.\ntest = test.split('//')[0].strip()\nextracted = test.split('/*')[0].strip()\n# Remove angle brackets and quotes\nextracted = extracted.replace('<', '').replace('>', '').replace('\"', '').replace(\"'\", '')\n# Remove semicolons and asterisks\nextracted = extracted.replace(';', '').replace('*', '')\nprint(extracted)\n```\n\nThis script will extract the imported packages from the code snippet, removing any comments, angle brackets, quotes, semicolons, and asterisks. 
The output will be:\n\n```\nvector\nsubstab\ncassert\nclimits\niostream\nvector\nvector\n```" - ], - "function": [ - "function_declaration", - "function_definition" - ], - "comment": [ - "comment" - ] - }, - "java": { - "package": [ - "import_declaration" - ], - "function": [ - "method_declaration" - ], - "comment": [ - "line_comment", - "block_comment" - ] - }, - "js": { - "package": [ - "import_statement" - ], - "function": [ - "function_declaration" - ], - "comment": [ - "comment" - ] - }, - "go": { - "package": [ - "import_declaration" - ], - "function": [ - "function_declaration" - ], - "comment": [ - "comment" - ] - }, - "agda": { - "package": [ - "open" - ], - "function": [ - "function" - ], - "comment": [ - "comment" - ] - }, - "c": { - "package": [ - "preproc_include" - ], - "function": [ - "function_definition" - ], - "comment": [ - "comment" - ] - }, - "c_sharp": { - "package": [ - "using_directive" - ], - "comment": [ - "comment" - ], - "function": [ - "local_function_statement" - ] - }, - "d": { - "package": [ - "import_declaration" - ], - "function": [ - "function_declaration" - ], - "comment": [ - "comment" - ] - }, - "dart": { - "package": [ - "import_or_export" - ], - "function": [ - "function_signature" - ], - "comment": [ - "comment", - "documentation_comment' node with id = 1, represents a comment in the code. Incorporating this node, I can make a general rule to extract the comments.\n\nThis python script can be executed:\n\n```py\n# if the first three characters are '///' we can simply remove the first three characters and get the remaining string\nif (code_snippet[0:3] == '///'):\n extracted = code_snippet[3:].strip()\n```\n\nThis script will extract the comment from the given code snippet." - ] - }, - "elm": { - "package": [ - "import_clause" - ], - "function": [ - "function_declaration_left" - ], - "comment": [ - "line_comment", - "block_comment" - ] - }, - "haskell": { - "package": [ - "import" - ], - "function": [ - "function" - ], - "comment": [ - "comment" - ] - }, - "kotlin": { - "package": [ - "import_header" - ], - "comment": [ - "multiline_comment", - "line_comment" - ], - "function": [ - "function_declaration" - ] - }, - "nim": { - "package": [ - "import_statement", - "include_statement", - "import_from_statement" - ], - "comment": [ - "block_comment", - "comment" - ], - "function": [ - "proc_declaration" - ] - }, - "objc": { - "package": [ - "preproc_import", - "preproc_include" - ], - "function": [ - "function_definition" - ], - "comment": [ - "comment" - ] - }, - "ocaml": { - "package": [ - "open_module" - ], - "comment": [ - "comment" - ] - }, - "perl": { - "package": [ - "use_no_statement" - ], - "function": [ - "function_definition" - ] - }, - "qmljs": { - "package": [ - "ui_import" - ], - "comment": [ - "comment" - ] - }, - "rust": { - "package": [ - "use_declaration" - ], - "function": [ - "function_item" - ], - "comment": [ - "line_comment" - ] - }, - "scala": { - "package": [ - "import_declaration" - ], - "comment": [ - "comment", - "block_comment" - ], - "function": [ - "function_definition" - ] - }, - "ts": { - "package": [ - "import_statement" - ], - "comment": [ - "comment" - ], - "function": [ - "function_declaration" - ] - }, - "verilog": { - "package": [ - "package_or_generate_item_declaration", - "include_compiler_directive" - ], - "comment": [ - "comment" - ], - "function": [ - "function_identifier" - ] - }, - "vhdl": { - "package": [ - "library_clause" - ], - "comment": [ - "comment" - ], - "function": [ - "function_body" - ] - } - }, - 
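The `concept_to_node_map` closed just above (removed here together with `cached_requirements.json`) pairs each language key with the tree-sitter node types that realize a given UAST concept. As an illustration only — not part of the patch — a minimal sketch of how such a mapping can be queried, assuming the JSON layout shown in this deleted file:

```py
import json

# Hypothetical lookup against the layout of the deleted cached_requirements.json:
# concept_to_node_map[<language>][<concept>] -> list of tree-sitter node types.
with open("cached_requirements.json") as f:
    cached = json.load(f)

node_types = cached["concept_to_node_map"]["java"]["function"]
print(node_types)  # ['method_declaration'] in the snapshot shown above
```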
"formal_language_example_map": { - "C++": "cpp", - "Python": "py", - "Java": "java", - "JavaScript": "js", - "Golang": "go", - "c": "c", - "c_sharp": "c_sharp", - "d": "d", - "dart": "dart", - "haskell": "haskell", - "kotlin": "kotlin", - "nim": "nim", - "objc": "objc", - "ocaml": "ocaml", - "perl": "perl", - "qml": "qmljs", - "rust": "rust", - "scala": "scala", - "TypeScript": "ts", - "verilog": "verilog", - "vhdl": "vhdl", - "agda": "agda", - "elm": "elm" - }, - "formal_language_map": { - "C++": "cpp", - "Python": "py", - "Java": "java", - "JavaScript": "js", - "Golang": "go", - "php": "php", - "bash": "bash", - "elixir": "elixir", - "clojure": "clojure", - "dot": "dot", - "COBOL": "COBOL", - "erlang": "erlang", - "r": "r", - "ruby": "ruby", - "julia": "julia", - "lua": "lua", - "svelte": "svelte", - "c": "c", - "c_sharp": "c_sharp", - "d": "d", - "dart": "dart", - "haskell": "haskell", - "kotlin": "kotlin", - "nim": "nim", - "objc": "objc", - "ocaml": "ocaml", - "perl": "perl", - "qml": "qmljs", - "rust": "rust", - "scala": "scala", - "TypeScript": "ts", - "verilog": "verilog", - "vhdl": "vhdl", - "agda": "agda", - "elm": "elm", - "pascal": "pascal" - }, - "formal_concept_map": { - "Functions": "function", - "Packages": "package", - "Comments": "comment" - }, - "formal_model_card_map": { - "Llama 3 Instruct: 80b": "meta-llama/llama-3-70b-instruct", - "Granite Code Instruct: 34b": "ibm/granite-34b-code-instruct" - } -} \ No newline at end of file diff --git a/transforms/code/syntactic_concept_extractor/python/src/config_LLM_runner_app.py b/transforms/code/syntactic_concept_extractor/python/src/config_LLM_runner_app.py deleted file mode 100644 index 0d418c629..000000000 --- a/transforms/code/syntactic_concept_extractor/python/src/config_LLM_runner_app.py +++ /dev/null @@ -1,5 +0,0 @@ -API_KEY = "Cl19NQn7D7y5ERFHfpUYNl8kWKqOTHqkGociOEI4nbsd" -API_ENDPOINT = "https://us-south.ml.cloud.ibm.com" -MODEL_ID = "meta-llama/llama-3-70b-instruct" -PROMPT_NAME = "My-prompt" -PROJECT_ID = "ba1b3e6d-5e38-4c72-9c36-4a9470cea282" \ No newline at end of file diff --git a/transforms/code/syntactic_concept_extractor/python/src/generic_LLM_runner_app.py b/transforms/code/syntactic_concept_extractor/python/src/generic_LLM_runner_app.py deleted file mode 100644 index e02cfa5a7..000000000 --- a/transforms/code/syntactic_concept_extractor/python/src/generic_LLM_runner_app.py +++ /dev/null @@ -1,550 +0,0 @@ -# Import necessary packages -from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams -from ibm_watsonx_ai.foundation_models import ModelInference -from ibm_watsonx_ai import Credentials -from tree_sitter import Parser, Language -import json -from tree_sitter_languages import get_language -import glob -import os -from time import sleep -import streamlit as st -from annotated_text import annotated_text -import re -from config_LLM_runner_app import API_ENDPOINT, API_KEY, PROJECT_ID, MODEL_ID - -# Flag to dictate if it is concept-level pruning -GET_CONCEPTS_ONLY = False -# Flag to dictate if it is text based input -TEXT_TEST_CONCEPT = False - -# Initialize the IBM Watsonx.ai client -if 'client' not in st.session_state: - # Set up credentials - credentials = Credentials(api_key=API_KEY, url=API_ENDPOINT) - # Set up parameters for the model - parameters = { - GenParams.DECODING_METHOD: "greedy", - GenParams.MAX_NEW_TOKENS: 1024, - GenParams.MIN_NEW_TOKENS: 1, - # Add other parameters as needed - } - # Initialize the model - st.session_state['client'] = ModelInference( - model_id=MODEL_ID, - 
params=parameters, - credentials=credentials, - project_id=PROJECT_ID - ) - -# Load the cached requirements -if 'cached_requirements' not in st.session_state: - st.session_state['cached_requirements'] = json.load(open('cached_requirements.json', 'r')) - -# Load the necessary maps -formal_language_example_map = st.session_state['cached_requirements']['formal_language_example_map'] -formal_language_map = st.session_state['cached_requirements']['formal_language_map'] -formal_concept_map = st.session_state['cached_requirements']['formal_concept_map'] -formal_model_card_map = st.session_state['cached_requirements']['formal_model_card_map'] -concept_to_node_map = st.session_state['cached_requirements']['concept_to_node_map'] - -# Option to select the few-shot examples -example_languages = st.sidebar.multiselect("Select the known languages to give few-shot examples", list(formal_language_example_map.keys())) - -# Option to choose the test language -test_language = st.sidebar.selectbox("Select the unknown language you want to test", list(set(formal_language_map.keys()) - set(example_languages))) - -# Option to select the input method -test_method = st.sidebar.selectbox("How do you want to test?", ["Local Files", "User Input"]) - -# Set the flag for text-based input -if test_method == "User Input": - TEXT_TEST_CONCEPT = True - -# Initialize the snippet -test_code_snippet = None - -# Get input -if TEXT_TEST_CONCEPT: - test_code_snippet = st.sidebar.text_area("Enter code snippet of the language used", height=200) - -# Choose the concept to extract rules for -test_concept = st.sidebar.selectbox("Select the UAST concept you want to extract", list(formal_concept_map.keys())) - -# Get the current few-shot examples present within the data -present_examples = os.listdir('./data/few_shot_outputs/') - -# File numbers are important as there can be multiple relevant nodes -test_file_num = 0 - -# Option to choose the model -model = st.sidebar.selectbox("Select the model you want to run the query on", list(formal_model_card_map.keys())) - -# Choose the pruning method -pruning_method = st.sidebar.selectbox("Select the pruning method to apply to the example ASTs", ["Concept-Level Pruning", "No Pruning", "Depth-Level Pruning"]) - -# Set to infinity for No-pruning -max_depth = float('inf') - -# Set flags and depth levels for different techniques -if pruning_method == "Depth-Level Pruning": - max_depth = st.sidebar.slider('Select the pruning depth of the AST', min_value=1, max_value=5, value=3) -elif pruning_method == "Concept-Level Pruning": - GET_CONCEPTS_ONLY = True - max_depth = st.sidebar.slider('Select the pruning depth of the test AST', min_value=1, max_value=5, value=3) - -# Few-shot example languages -example_languages = [formal_language_map[lang] for lang in example_languages] - -# Test language -test_language = formal_language_map[test_language] - -# Get the formal concept name -test_concept = formal_concept_map[test_concept] - -# Get the full model name -model = formal_model_card_map[model] - -# Map to store number of present examples -if 'number_of_examples' not in st.session_state: - st.session_state['number_of_examples'] = dict() - -# Save in session state -st.session_state['Languages'] = example_languages - -# If it's to fetch from local storage, append the test to the example languages -if not TEXT_TEST_CONCEPT: - st.session_state['Languages'] = example_languages + [test_language] - - -""" -Function to convert and AST node into a string with requiring only relevant data. 
-Requires the ID of the node, the node type, the code snippet and the parent id. -""" -def create_node(id, node, parent_id): - req_string = f"< node_id = {id}, node_type = {node.type}, code_snippet = {repr(node.text.decode('utf8'))}, parent_id = {parent_id} >" - return req_string - -""" -Function to recursively assign ID and preprocess the AST in a concept-level pruning manner to get it into a parse-able format to pass to the LLM. -dfs_id() function allocates a unique ID on preorder traversal basis to the treenode. -_dfs() function recursively parses the tree to the relevant node, while storing the code snippet relevant to a unique ID node. -""" -def get_concept_tree(tree, language): - ast_repr = [] - code_snippets = dict() - id_dictionary = dict() - - def dfs_id(node): - id_dictionary[node] = len(id_dictionary) - for child in node.children: - dfs_id(child) - - dfs_id(tree.root_node) - - def _dfs(node, parent): - if (node.type in concept_to_node_map[language][test_concept]): - ast_repr.append(create_node(id_dictionary[node], node, id_dictionary[parent])) - code_snippets[id_dictionary[node]] = node.text.decode("utf8") - for child in node.children: - _dfs(child, node) - - for child in tree.root_node.children: - _dfs(child, tree.root_node) - - return ast_repr, code_snippets - - -""" -Function to recursively assign ID and preprocess the AST in a K-level-depth pruning manner to get it into a parse-able format to pass to the LLM. -dfs_id() function allocates a unique ID on preorder traversal basis to the treenode. -_dfs() function recursively parses the tree to the relevant node, while storing the code snippet relevant to a unique ID node. -""" -def get_tree(tree, k): - ast_repr = [] - code_snippets = dict() - id_dictionary = dict() - - def dfs_id(node): - id_dictionary[node] = len(id_dictionary) - for child in node.children: - dfs_id(child) - - dfs_id(tree.root_node) - - def _dfs(node, depth, parent): - if (depth >= k): - return - ast_repr.append(create_node(id_dictionary[node], node, id_dictionary[parent])) - code_snippets[id_dictionary[node]] = node.text.decode("utf8") - for child in node.children: - _dfs(child, depth + 1, node) - - # _dfs(tree.root_node, -1, tree.root_node) - for child in tree.root_node.children: - _dfs(child, 0, tree.root_node) - - return ast_repr, code_snippets - -# initialise an AST parser. -parser = Parser() - -# use bindings from tree_sitter_language library. -if 'language_binding' not in st.session_state: - st.session_state['language_binding'] = { - "cpp" : get_language("cpp"), - "py" : get_language('python'), - "java" : get_language("java"), - "go" : get_language("go"), - "js" : get_language("javascript"), - "ts" : get_language("typescript"), - "perl" : get_language("perl"), - "php" : get_language("php"), - "ocaml" : get_language("ocaml") - } - BINDINGS_DIR = '../../input/tree-sitter-bindings' - # uising the normal tree-sitter bindings locally for the laguages present in the cached_requirements json. - for binding in os.listdir(BINDINGS_DIR): - print(binding) - name = binding.split('-bindings', 1)[0] - # print(name) - if name in st.session_state['language_binding']: - continue - try: - language_path = os.path.join(BINDINGS_DIR, binding) - language_obj = Language(language_path, name) - except Exception as e: - print(e) - print(name) - exit() - st.session_state['language_binding'][name] = language_obj - -#initialize session states to contain all the outputs. 
-if 'all_few_shot_outputs' not in st.session_state: - st.session_state['all_few_shot_outputs'] = dict() - -if 'all_asts' not in st.session_state: - st.session_state['all_asts'] = dict() - -if 'all_code_snippets' not in st.session_state: - st.session_state['all_code_snippets'] = dict() - -if 'all_concept_code_json' not in st.session_state: - st.session_state['all_concept_code_json'] = dict() - - -# get all the few_shot LLM output examples present locally -def get_all_few_shot(example_languages, test_concept, language): - for language in example_languages: - programs = os.listdir(f"./data/few_shot_outputs/uast_{test_concept}/{language}") - names = [os.path.basename(file).split('.')[0] for file in programs] - for i in range(len(programs)): - if (language not in st.session_state['all_few_shot_outputs']): - st.session_state['all_few_shot_outputs'][language] = dict() - - content = open(f"./data/few_shot_outputs/uast_{test_concept}/{language}/{programs[i]}", "r").read() - st.session_state['all_few_shot_outputs'][language][names[i]] = content - -""" get all the few_shot code examples present locally and their corresponding AST with given max depth. -This function also calls the AST preprocessor to store it in a global dictionary to retrieve in one step. -""" -def get_all_asts_code(test_concept, max_depth = 0): - for language in st.session_state['Languages']: - parser.set_language(st.session_state['language_binding'][language]) - # Define the directory path - dir_path = f"./data/Concept_dataset/uast_{test_concept}/{language}" - # Check if the directory exists - if not os.path.exists(dir_path): - print(f"No concept data for concept '{test_concept}' in language '{language}'. Skipping.") - continue # Skip this language and continue with the next - # List the programs in the directory - programs = os.listdir(dir_path) - if not programs: - print(f"No programs found for concept '{test_concept}' in language '{language}'. Skipping.") - continue # Skip if the directory is empty - names = [os.path.basename(file).split('.')[0] for file in programs] - st.session_state['number_of_examples'][language] = len(programs) - - for i in range(len(programs)): - if (language not in st.session_state['all_asts']): - st.session_state['all_asts'][language] = dict() - st.session_state['all_code_snippets'][language] = dict() - st.session_state['all_concept_code_json'][language] = dict() - - content = open(f"./data/Concept_dataset/uast_{test_concept}/{language}/{programs[i]}", "r").read() - st.session_state['all_code_snippets'][language][names[i]] = content - ast = parser.parse(bytes(content, "utf8")) - all_ast, all_code = None, None - if (GET_CONCEPTS_ONLY and (language != test_language)): - all_ast, all_code = get_concept_tree(ast, language) - else: - all_ast, all_code = get_tree(ast, max_depth) - st.session_state['all_asts'][language][names[i]] = str(all_ast) - st.session_state['all_concept_code_json'][language][names[i]] = all_code - -""" get all the corresponding AST with given max depth of the given text-input. -This function also calls the AST preprocessor to store it in a global dictionary to retrieve in one step. 
-""" -def get_text_test_example(language, test_code_snippet): - parser.set_language(st.session_state['language_binding'][language]) - if (language not in st.session_state['all_asts']): - st.session_state['all_asts'][language] = dict() - st.session_state['all_code_snippets'][language] = dict() - st.session_state['all_concept_code_json'][language] = dict() - st.session_state['all_code_snippets'][language]['0'] = test_code_snippet - ast = parser.parse(bytes(test_code_snippet, "utf8")) - all_ast, all_code = get_tree(ast, max_depth) - st.session_state['all_asts'][language]['0'] = str(all_ast) - st.session_state['all_concept_code_json'][language]['0'] = all_code - -# load the prompt for the concept -category_prompt_file = f"./data/prompts/{test_concept}.txt" -st.session_state['prompt'] = open(category_prompt_file, "r").read() - -def example_builder(lang, program_num): - if lang not in st.session_state['all_code_snippets']: - print(f"No code snippets available for language '{lang}'. Skipping.") - return None # Return None to indicate missing data - if str(program_num) not in st.session_state['all_code_snippets'][lang]: - print(f"No code snippet for program number '{program_num}' in language '{lang}'. Skipping.") - return None - if lang not in st.session_state['all_asts'] or str(program_num) not in st.session_state['all_asts'][lang]: - print(f"No AST available for program number '{program_num}' in language '{lang}'. Skipping.") - return None - return f"\n{st.session_state['all_code_snippets'][lang][str(program_num)]}\n\n\n{st.session_state['all_asts'][lang][str(program_num)]}" - -# get the fewshot examples in a pluggable form to the LLM. -def get_few_shot(): - few_shot_examples = [] - for lang in example_languages: - for program_num in range(st.session_state['number_of_examples'][lang]): - few_shot_examples.append( - { - "input" : f"{example_builder(lang, program_num)}", - "output" : f"{st.session_state['all_few_shot_outputs'][lang][str(program_num)]}" - } - ) - return few_shot_examples - -# call funtions to get all such examples, codes and ASTs. -get_all_asts_code(test_concept, max_depth) -get_all_few_shot(example_languages, test_concept, test_language) -st.markdown("### Enter prompt here") - -# Make a modifiable prompt -st.session_state['prompt'] = st.text_area("prompt", st.session_state['prompt'], height=700, label_visibility="collapsed") - -# If it's text-based, call the function to get the AST -if TEXT_TEST_CONCEPT: - get_text_test_example(test_language, test_code_snippet) -st.session_state['test_input'] = f"{example_builder(test_language, '0')}" - -# Display the few-shot examples JSON -st.write('Training examples:') -st.write(get_few_shot()) - -# Display the test JSON -st.write("Test example:") -st.write([st.session_state['test_input']]) - -""" -function to extract rule from the response. -This works because of LLM alignment to generate response in a format, with the help of few-shot examples. -""" -def get_rule_py(output_text): - content = output_text.split('```py', 1)[1].split('```', 1)[0].strip() - return content - -""" -function to extract node type from the response. -This works because of LLM alignment to generate response in a format, with the help of few-shot examples. -""" -def extract_node_type(output_text): - content = output_text.split('see that the', 1)[1].split('nodes', 1)[0].strip() - return content.strip('\'"') - -""" -function to extract IDs of all the relevant nodes from the response. -Returns a list of relevant node IDs. 
-This works because of LLM alignment to generate response in a format, with the help of few-shot examples. -""" -def extract_node_id(output_text): - content = None - try: - content = output_text.split('with ids = [', 1)[1].split(']', 1)[0].strip() - except: - try: - content = output_text.split('with id = ', 1)[1].split(',', 1)[0].strip() - except: - st.write("cant be extracted") - - if (',') not in content: - return [int(content)] - - id_strings = content.split(',') - return [int(id.strip()) for id in id_strings] - -""" -function to save the output generated by the LLM. -""" -def save_rule(language, node_type, rule, prompt, output, concept, ruleset_path, example_path, example_languages, test_code, max_depth): - ruleset_files = os.listdir(ruleset_path) - print(ruleset_files) - - # if the file is already present then just add a new mapping from the relevant node type to its corresponding rule. - if (f'UAST_rules_{language}.json' in ruleset_files): - rule_dict = json.load(open(f'{ruleset_path}/UAST_rules_{language}.json', 'r')) - rule_dict[node_type] = { - "uast_node_type": f"uast_{concept}", - "extractor": rule - } - # if it is not, then make a new dictionary with the same. - else: - rule_dict = { - node_type : { - "uast_node_type": f"uast_{concept}", - "extractor": rule - } - } - - print("saving rule for",language) - try: - try: - # try to save the rule dictionary - json.dump(rule_dict, open(f'{ruleset_path}/UAST_rules_{language}.json', 'w'), indent = 4) - print("json saved") - except Exception as e: - print("could not save rule JSON :", end = " ") - print(e) - - # make the directory to save the output. - os.makedirs(example_path + '/' + concept + '/' + language, exist_ok= True) - files_present = os.listdir(f"{example_path}/{concept}/{language}") - - # loop to check already present files. This is because of multiple relevant nodes. - counter = 0 - while(f"{counter}.txt" in files_present): - counter += 1 - - # saving the LLM output, input code, few-shot languages and the prompt. - with open(f"{example_path}/{concept}/{language}/{counter}.txt", "w") as f: - f.write(output) - - with open(f"{example_path}/{concept}/{language}/prompt_{counter}.txt", "w") as f: - f.write(prompt) - - with open(f"{example_path}/{concept}/{language}/example_languages_{counter}.txt", "w") as f: - f.write(str(example_languages) + '\n' + 'max_depth = '+ str(max_depth)) - - with open(f"{example_path}/{concept}/{language}/test_code_{counter}.txt", "w") as f: - f.write(test_code) - - os.makedirs(f"./data/few_shot_outputs/uast_{concept}/{language}", exist_ok= True) - os.makedirs(f"./data/Concept_dataset/uast_{concept}/{language}", exist_ok= True) - - # save the output as another few-shot example. - with open(f"./data/few_shot_outputs/uast_{concept}/{language}/{counter}.txt", "w") as f: - f.write(output) - - with open(f"./data/Concept_dataset/uast_{concept}/{language}/{counter}.txt", "w") as f: - f.write(test_code) - - # if everything is successful, display balloons on the screen!. - st.balloons() - print("Voila! prompt worked!") - except Exception as e: - print("COULD NOT SAVE FOR", language, "because :", e) - - # add concept nodes in the cached_requirements and save it. 
- if (concept in st.session_state['cached_requirements']['concept_to_node_map'][language]) : - if (node_type not in st.session_state['cached_requirements']['concept_to_node_map'][language][concept]): - st.session_state['cached_requirements']['concept_to_node_map'][language][concept].append(node_type) - else : - st.session_state['cached_requirements']['concept_to_node_map'][language][concept] = [node_type] - - - concept_to_node_map = st.session_state['cached_requirements']['concept_to_node_map'] - json.dump(st.session_state['cached_requirements'], open("cached_requirements.json", "w"), indent= 4) - -# remove new-line comments frmo the code that the LLM generates. This is done to reduce memory consumption, as the output is saved already for documentation purposes. -def remove_comments(text): - return re.sub(r"^(#.*?$)\n", "", text, flags = re.MULTILINE) - -# change the extracted keyword to self.extracted keyword to make it work for the parser. -def process_rule(text): - return remove_comments(text).replace("extracted", "self.extracted") - -# function to enable stream generation through yielding tokens. -response = None -def stream_data(): - for token in response: - yield token.results[0].generated_text - -def build_prompt(): - prompt = st.session_state['prompt'] + "\n\n" - examples = get_few_shot() - for example in examples: - prompt += "Input:\n" - prompt += example['input'] + "\n" - prompt += "Output:\n" - prompt += example['output'] + "\n\n" - prompt += "Input:\n" - prompt += st.session_state['test_input'] + "\n" - prompt += "Output:\n" - # The model is expected to generate the output here - return prompt -# If the submit button is clicked, perform the subsequent operations -if st.sidebar.button('Submit'): - - # Build the prompt - prompt_text = build_prompt() - - # Invoke the query to the LLM after collecting the pluggable codes and ASTs - with st.spinner('Language model is working ...'): - response = st.session_state['client'].generate_text(prompt_text) - st.markdown('### Response:') - ans = response # Directly assign the generated text - st.write(ans) - - st.write('----------------------------------------------') - - # Extract the nodes and IDs - nodes = extract_node_id(ans) - - # Extract the rule - rule = get_rule_py(ans) - - # Get the relevant code snippets from the IDs it extracted - code_snippets = [ - st.session_state['all_concept_code_json'][test_language][str(test_file_num)][node] - for node in nodes - ] - extracted = None - - # Run the code for each snippet - for i in range(len(code_snippets)): - code_snippet = code_snippets[i] - exec(rule) - st.write(f'For Node with ID = {nodes[i]} and code') - st.write(f'```{test_language}\n{code_snippet}') - annotated_text('The extracted part is', (extracted, '', 'rgba(10,50,170,0.5)')) - st.write('----------------------------------------------') - - # One-click acceptance of rule - st.sidebar.button( - "Accept the given rule?", - on_click=save_rule, - args=[ - test_language, - extract_node_type(ans), - process_rule(rule), - st.session_state['prompt'], - ans, - test_concept, - "./ruleset", - "./data/final_UI_outputs", - example_languages, - st.session_state['all_code_snippets'][test_language]['0'], - max_depth - ] - ) \ No newline at end of file diff --git a/transforms/code/syntactic_concept_extractor/python/src/offline-customizations/2_unknown-lib-pipeline.py b/transforms/code/syntactic_concept_extractor/python/src/offline-customizations/2_unknown-lib-pipeline.py deleted file mode 100644 index d84938e29..000000000 --- 
a/transforms/code/syntactic_concept_extractor/python/src/offline-customizations/2_unknown-lib-pipeline.py +++ /dev/null @@ -1,28 +0,0 @@ -import argparse -from llm_interaction import * -from config import * - - -prompt = PROMPT_TEMPLATE_1_FINAL - -parser = argparse.ArgumentParser() -parser.add_argument("-f", "--file", type=str, default=NULL_LIBS_FILE, help="File path") - -args = parser.parse_args() -file_data = read_csv(args.file) -combined_strings = gen_combined_strings(file_data) -input_data = {} - - -for combined_string in combined_strings: - input_template = prompt + f"\n\nINPUT: {combined_string} \nOUTPUT: " - response = model.generate_text(input_template) - print(response) - save_result(response,'ikb/extracted_data.csv',"") - - - - - - - diff --git a/transforms/code/syntactic_concept_extractor/python/src/offline-customizations/config.py b/transforms/code/syntactic_concept_extractor/python/src/offline-customizations/config.py deleted file mode 100644 index 8f0df3c65..000000000 --- a/transforms/code/syntactic_concept_extractor/python/src/offline-customizations/config.py +++ /dev/null @@ -1,79 +0,0 @@ -import pandas as pd # type: ignore -from io import StringIO -import os -import csv -from ibm_watsonx_ai.foundation_models.utils.enums import ModelTypes - - -INPUT_UAST = 'input' -OUTPUT_UAST = 'output' -IKB_FILE = 'ikb/ikb_model.csv' -NULL_LIBS_FILE = "null.csv" - -API_KEY = "Cl19NQn7D7y5ERFHfpUYNl8kWKqOTHqkGociOEI4nbsd" -API_ENDPOINT = "https://us-south.ml.cloud.ibm.com" -MODEL_ID = "meta-llama/llama-3-70b-instruct" -PROMPT_NAME = "My-prompt" -PROJECT_ID = "ba1b3e6d-5e38-4c72-9c36-4a9470cea282" - -NEW_CMAP_FILE = "concept_map/updated_concept_list.csv" -NEW_CMAP = open(NEW_CMAP_FILE, 'r').read() -CONCEPTS = pd.read_csv(NEW_CMAP_FILE)['Category'] - - -EXAMPLES_FILE_I = "examples/examples-i.csv" -df = pd.read_csv(EXAMPLES_FILE_I) -csv_buffer = StringIO() -df.to_csv(csv_buffer, index=False) -EXAMPLES_I = csv_buffer.getvalue() - -EXAMPLES_FILE_O = "examples/examples-o.csv" -df = pd.read_csv(EXAMPLES_FILE_O) -csv_buffer = StringIO() -df.to_csv(csv_buffer, index=False) -EXAMPLES_O = csv_buffer.getvalue() - -PROMPT_TEMPLATE_1_FINAL = '''You are responsible for classifying programming language packages based on their functionality into one of the following STRICT categories: - ''' + NEW_CMAP + ''' - - Instructions: - - 1. Input: A CSV containing two columns: - a. Library – the name of the package - b. Language – the programming language of the package - Your task is to append a third column called Category where you will classify the package's primary function into one of the following categories.\n - - 2. Output: The updated CSV with the new Category column. - - 3. Categorization Guidelines: - a. Classify each package based on its primary functionality. - b. Only use categories from the given list. Do not invent or modify categories. - - 4. Output format: Provide the updated CSV data in the exact format as shown below: - a. Columns: Library, Language, Category - b. End the response with to indicate completion. - - 5. Only use categories from the given list. Do not invent or modify categories. - - 6. Strictly do not provide any explanations or commentary or notes before and/or after the table. 
- - Examples: - INPUT: - ''' + str(EXAMPLES_I) + "OUTPUT:\n" + str(EXAMPLES_O).strip("\n")+"\n" - - - -def init_config(): - # Create required folders - folder_list = [OUTPUT_UAST] - for folder in folder_list: - if not os.path.exists(folder): - os.makedirs(folder) - # Create csv file - if not os.path.exists(NULL_LIBS_FILE): - with open(NULL_LIBS_FILE, 'w', newline='', encoding='utf-8') as csvfile: - fieldnames = ['Library', 'Language'] - writer = csv.writer(csvfile) - writer.writerow(fieldnames) - return - diff --git a/transforms/code/syntactic_concept_extractor/python/src/offline-customizations/generic_LLM_runner_app.py b/transforms/code/syntactic_concept_extractor/python/src/offline-customizations/generic_LLM_runner_app.py index 61d9139f1..a8f7edb9b 100644 --- a/transforms/code/syntactic_concept_extractor/python/src/offline-customizations/generic_LLM_runner_app.py +++ b/transforms/code/syntactic_concept_extractor/python/src/offline-customizations/generic_LLM_runner_app.py @@ -219,7 +219,9 @@ def _dfs(node, depth, parent): "php" : get_language("php"), "ocaml" : get_language("ocaml") } - BINDINGS_DIR = '../../../input/tree-sitter-bindings' + RUNTIME_HOST_ARCH = os.environ.get('RUNTIME_HOST_ARCH', 'x86_64') + BINDINGS_DIR = os.path.join('..', 'tree-sitter-bindings', RUNTIME_HOST_ARCH) + # uising the normal tree-sitter bindings locally for the laguages present in the cached_requirements json. for binding in os.listdir(BINDINGS_DIR): print(binding) diff --git a/transforms/code/syntactic_concept_extractor/python/src/offline-customizations/llm_interaction.py b/transforms/code/syntactic_concept_extractor/python/src/offline-customizations/llm_interaction.py deleted file mode 100644 index 71dd24514..000000000 --- a/transforms/code/syntactic_concept_extractor/python/src/offline-customizations/llm_interaction.py +++ /dev/null @@ -1,75 +0,0 @@ -import re -from io import StringIO -import pandas as pd -from config import * -from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams -from ibm_watsonx_ai.foundation_models import ModelInference -from ibm_watsonx_ai import APIClient, Credentials - - -credentials = Credentials(api_key=API_KEY, url=API_ENDPOINT) - -parameters = { - GenParams.DECODING_METHOD: "greedy", - GenParams.MAX_NEW_TOKENS: 1000, - GenParams.STOP_SEQUENCES: [""] -} - -model = ModelInference( - model_id=MODEL_ID, - params=parameters, - credentials=credentials, - project_id=PROJECT_ID) - - - -def init_concept_map(cm_file): - with open(cm_file, 'r') as file: - concept_map = file.read() - return concept_map - -def read_csv(csv_file, cols=['Library', 'Language']): - df = pd.read_csv(csv_file, usecols=cols) - data = df.to_dict(orient='records') - return data - -def gen_combined_strings(list_str): - combined_strings = [] - combined_string = "\nLibrary,Language,Category\n" - for idx, entry in enumerate(list_str, start=1): - entry_string = ",".join([f"{value}" for key, value in entry.items()]) - combined_string += f"{entry_string}\n" - if idx % 30 == 0 or idx == len(list_str): # Ensure to include the last batch - combined_strings.append(combined_string) - combined_string = "Library,Language,Category\n" - return combined_strings - - - -# def generate_response(input_template): -# result = model.generate_text(input_template) -# return result - - - -def save_result(data, filename, endtoken): - data = data.split(endtoken)[0] # Split the data at the end token and take the first part - csv_file = StringIO(data.strip()) # Remove any leading/trailing whitespace - df = pd.read_csv(csv_file) - 
print(df.columns) - df.to_csv(filename, mode='a', index=False, header=False) - return - -def read_examples(file): - df = pd.read_csv(file) - csv_buffer = StringIO() - df.to_csv(csv_buffer, index=False) - examples = csv_buffer.getvalue() - return examples - - -# if __name__ == "__main__": -# CONCEPT_MAP_FILE = "/Users/adrijadhar/Documents/GitHub/code-semantic-analysis/Testing/Prompt 1/examples/new_concept_map.txt" -# NEW_CMAP_FILE = "/Users/adrijadhar/Documents/GitHub/code-semantic-analysis/Testing/Prompt 1/examples/new_concept_map.txt" -# df = pd.read_csv(CONCEPT_MAP_FILE, usecols=["Functionality"]) -# df.to_csv(NEW_CMAP_FILE, index=False) \ No newline at end of file diff --git a/transforms/code/syntactic_concept_extractor/python/src/syntactic_concept_extractor_transform.py b/transforms/code/syntactic_concept_extractor/python/src/syntactic_concept_extractor_transform.py index 1d75473fc..a663cf475 100644 --- a/transforms/code/syntactic_concept_extractor/python/src/syntactic_concept_extractor_transform.py +++ b/transforms/code/syntactic_concept_extractor/python/src/syntactic_concept_extractor_transform.py @@ -10,36 +10,28 @@ # limitations under the License. ################################################################################ -import functools import os -import time +import subprocess from argparse import ArgumentParser, Namespace from typing import Any from data_processing.utils import get_logger +import uuid +import shutil +import atexit -import numpy as np -import pandas as pd import pyarrow as pa -import pyarrow.parquet as pq -import requests from data_processing.transform import AbstractTableTransform from tree_sitter import Language, Parser as TSParser -from tree_sitter_languages import get_language, get_parser +from tree_sitter_languages import get_language + -from collections import Counter -from UAST import UAST from UAST_parser import UASTParser -from concurrent.futures import ThreadPoolExecutor import json from data_processing.transform import AbstractBinaryTransform, TransformConfiguration from data_processing.utils import ( - GB, CLIArgumentProvider, - TransformUtils, - UnrecoverableException, get_logger, - str2bool, ) short_name = "SyntacticConceptExtractor" @@ -64,11 +56,28 @@ def __init__(self, config: dict[str, Any]): self.contents = self.config.get("contents") self.language = self.config.get("language") - # Compute the absolute path to the tree-sitter-bindings directory - script_dir = os.path.dirname(os.path.abspath(__file__)) - bindings_path = os.path.join(script_dir, '..', '..', 'input', 'tree-sitter-bindings') + def ensure_tree_sitter_bindings(): + # Generate a unique directory for the bindings based on a UUID + bindings_dir = f"tree-sitter-bindings-{uuid.uuid4()}" + + # Clone the bindings only if the unique directory does not exist + if not os.path.exists(bindings_dir): + print(f"Cloning tree-sitter bindings into {bindings_dir}...") + result = subprocess.run(["git", "clone", "https://github.com/pankajskku/tree-sitter-bindings.git", bindings_dir]) + if result.returncode != 0: + raise RuntimeError(f"Failed to clone tree-sitter bindings into {bindings_dir}") + return bindings_dir + + # Call this function before the main code execution + self.bindings_dir = ensure_tree_sitter_bindings() - # Verify that the bindings_path exists + # Use the correct architecture for runtime + RUNTIME_HOST_ARCH = os.environ.get('RUNTIME_HOST_ARCH', 'x86_64') + bindings_path = self.bindings_dir + '/' + RUNTIME_HOST_ARCH + print(f"Bindings bindings_dir: {self.bindings_dir}") + 
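+        # Note (illustrative comment, not in the original patch): the cloned repository
+        # is expected to contain one sub-directory per host architecture (the value of
+        # RUNTIME_HOST_ARCH, e.g. x86_64), each holding per-language shared objects such
+        # as js-bindings.so and nim-bindings.so that Language(...) loads below.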
print(f"Bindings path: {bindings_path}") + + # Check if the bindings path exists if not os.path.exists(bindings_path): raise FileNotFoundError(f"Bindings path does not exist: {bindings_path}") @@ -81,7 +90,7 @@ def __init__(self, config: dict[str, Any]): JAVA_LANGUAGE = get_language("java") JAVASCRIPT_LANGUAGE = Language(os.path.join(bindings_path, 'js-bindings.so'), 'javascript') NIM_LANGUAGE = Language(os.path.join(bindings_path, 'nim-bindings.so'), 'nim') - OBJECTIVE_C_LANGUAGE = Language(os.path.join(bindings_path, 'objc-bindings.so'), 'objc') + #OBJECTIVE_C_LANGUAGE = Language(os.path.join(bindings_path, 'objc-bindings.so'), 'objc') OCAML_LANGUAGE = get_language("ocaml") PERL_LANGUAGE = get_language("perl") PY_LANGUAGE = get_language("python") @@ -101,7 +110,7 @@ def __init__(self, config: dict[str, Any]): "JavaScript": JAVASCRIPT_LANGUAGE, "Nim": NIM_LANGUAGE, "Ocaml": OCAML_LANGUAGE, - "Objective-C": OBJECTIVE_C_LANGUAGE, + #"Objective-C": OBJECTIVE_C_LANGUAGE, "Perl": PERL_LANGUAGE, "Python": PY_LANGUAGE, "Rust": RUST_LANGUAGE, @@ -120,7 +129,7 @@ def __init__(self, config: dict[str, Any]): "JavaScript": 'js', "Nim": 'nim', "Ocaml": 'ocaml', - "Objective-C": 'objc', + #"Objective-C": 'objc', "Perl": 'perl', "Python": 'py', "Rust": 'rust', @@ -166,6 +175,10 @@ def get_uast_parquet(): table_with_uast = get_uast_parquet() # report statistics stats = {"source_documents": table.num_columns, "result_documents": table_with_uast.num_columns} + + # Register cleanup for when the process exits + atexit.register(shutil.rmtree, self.bindings_dir) + return [table_with_uast], stats class SyntacticConceptExtractorTransformConfiguration(TransformConfiguration): diff --git a/transforms/code/syntactic_concept_extractor/python/test-data/expected/local/uast_table_part_0.parquet b/transforms/code/syntactic_concept_extractor/python/test-data/expected/local/uast_table_part_0.parquet index 247d63959..910e4cc03 100644 Binary files a/transforms/code/syntactic_concept_extractor/python/test-data/expected/local/uast_table_part_0.parquet and b/transforms/code/syntactic_concept_extractor/python/test-data/expected/local/uast_table_part_0.parquet differ diff --git a/transforms/code/syntactic_concept_extractor/python/test-data/expected/multi-package.parquet b/transforms/code/syntactic_concept_extractor/python/test-data/expected/multi-package.parquet index d9fe4bb4a..efc82e770 100644 Binary files a/transforms/code/syntactic_concept_extractor/python/test-data/expected/multi-package.parquet and b/transforms/code/syntactic_concept_extractor/python/test-data/expected/multi-package.parquet differ diff --git a/transforms/code/syntactic_concept_extractor/python/test-data/expected/python/multi-package.parquet b/transforms/code/syntactic_concept_extractor/python/test-data/expected/python/multi-package.parquet index 00ca0b62e..910e4cc03 100644 Binary files a/transforms/code/syntactic_concept_extractor/python/test-data/expected/python/multi-package.parquet and b/transforms/code/syntactic_concept_extractor/python/test-data/expected/python/multi-package.parquet differ diff --git a/transforms/code/syntactic_concept_extractor/python/test-data/input/multi-package.parquet b/transforms/code/syntactic_concept_extractor/python/test-data/input/multi-package.parquet index 8ef9f7cc2..fc96e51fa 100644 Binary files a/transforms/code/syntactic_concept_extractor/python/test-data/input/multi-package.parquet and b/transforms/code/syntactic_concept_extractor/python/test-data/input/multi-package.parquet differ diff --git 
a/transforms/code/syntactic_concept_extractor/python/test/test_syntactic_concept_extractor.py_python.py b/transforms/code/syntactic_concept_extractor/python/test/test_syntactic_concept_extractor.py_python.py deleted file mode 100644 index e56f09b65..000000000 --- a/transforms/code/syntactic_concept_extractor/python/test/test_syntactic_concept_extractor.py_python.py +++ /dev/null @@ -1,47 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import os - -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.test_support.launch.transform_test import ( - AbstractTransformLauncherTest, -) -from noop_transform import sleep_cli_param -from noop_transform_python import NOOPPythonTransformConfiguration - - -class TestPythonNOOPTransform(AbstractTransformLauncherTest): - """ - Extends the super-class to define the test data for the tests defined there. - The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. - """ - - def get_test_transform_fixtures(self) -> list[tuple]: - src_file_dir = os.path.abspath(os.path.dirname(__file__)) - fixtures = [] - - launcher = PythonTransformLauncher(NOOPPythonTransformConfiguration()) - input_dir = os.path.join(src_file_dir, "../test-data/input") - expected_dir = os.path.join(src_file_dir, "../test-data/expected") - transform_config = {sleep_cli_param: 0} - fixtures.append( - ( - launcher, - transform_config, - input_dir, - expected_dir, - [], # optional list of column names to ignore in comparing test-generated with expected. - ) - ) - - return fixtures diff --git a/transforms/code/syntactic_concept_extractor/ray/Dockerfile b/transforms/code/syntactic_concept_extractor/ray/Dockerfile index 3c127aefc..c3d4cc21e 100644 --- a/transforms/code/syntactic_concept_extractor/ray/Dockerfile +++ b/transforms/code/syntactic_concept_extractor/ray/Dockerfile @@ -6,11 +6,6 @@ RUN pip install --upgrade --no-cache-dir pip # install pytest RUN pip install --no-cache-dir pytest -# Create a user and use it to run the transform -RUN useradd -ms /bin/bash dpk -USER dpk -WORKDIR /home/dpk - # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). COPY --chown=ray:users data-processing-lib-python/ data-processing-lib-python/ @@ -34,8 +29,8 @@ COPY ./src/syntactic_concept_extractor_transform_ray.py . 
COPY ./src/syntactic_concept_extractor_local_ray.py local/ # copy test -# COPY test/ test/ -# COPY test-data/ test-data/ +COPY test/ test/ +COPY test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/code/syntactic_concept_extractor/ray/Makefile b/transforms/code/syntactic_concept_extractor/ray/Makefile index 301b0c9af..129b40f5e 100644 --- a/transforms/code/syntactic_concept_extractor/ray/Makefile +++ b/transforms/code/syntactic_concept_extractor/ray/Makefile @@ -13,6 +13,8 @@ TRANSFORM_NAME=syntactic_concept_extractor BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv +venv:: .transforms.python-venv + test:: .transforms.ray-test clean:: .transforms.clean @@ -44,9 +46,19 @@ publish-dist:: .defaults.publish-dist # Ensure RUN_ARGS has a default value RUN_ARGS ?= "" -# run-cli-sample: .transforms.run-cli-ray-sample +run-cli-sample: .transforms.run-cli-ray-sample + +run-local-sample: .transforms.run-local-ray-sample + +#run-local-ray-sample: +# @echo "Running local ray sample..." +# python src/syntactic_concept_extractor_local_ray.py \ +# --data_local_config "{ 'input_folder' : './input', 'output_folder' : './output'}" + +minio-start: .minio-start + +kind-load-image:: .transforms.kind-load-image + +docker-load-image: .defaults.docker-load-image -run-local-ray-sample: - @echo "Running local ray sample..." - python src/syntactic_concept_extractor_local_ray.py \ - --data_local_config "{ 'input_folder' : './input', 'output_folder' : './output'}" +docker-save-image: .defaults.docker-save-image diff --git a/transforms/code/syntactic_concept_extractor/ray/src/syntactic_concept_extractor_local_ray.py b/transforms/code/syntactic_concept_extractor/ray/src/syntactic_concept_extractor_local_ray.py index d424f974a..50f7bc9f1 100644 --- a/transforms/code/syntactic_concept_extractor/ray/src/syntactic_concept_extractor_local_ray.py +++ b/transforms/code/syntactic_concept_extractor/ray/src/syntactic_concept_extractor_local_ray.py @@ -17,12 +17,10 @@ from syntactic_concept_extractor_transform_ray import SyntacticConceptExtractorRayTransformConfiguration # Define default input and output directories relative to the script's location -default_input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../", "../", "input")) -default_output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../", "../", "output")) -# Use environment variables if provided, otherwise use the defaults -input_folder = os.getenv('INPUT_FOLDER', default_input_folder) -output_folder = os.getenv('OUTPUT_FOLDER', default_output_folder) +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected")) # Create local configuration local_conf = { @@ -31,16 +29,24 @@ "contents": "Contents", "language": "Language" } - +worker_options = {"num_cpus": 1} # Code location and parameters code_location = {"github": "github", "commit_hash": "12345", "path": "path"} params = { + # where to run + "run_locally": True, + # Data access. 
Only required parameters are specified "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", - "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "runtime_creation_delay": 0, + "runtime_code_location": ParamsUtils.convert_to_ast(code_location) } + if __name__ == "__main__": # Set the simulated command line args sys.argv = ParamsUtils.dict_to_req(d=params) diff --git a/transforms/code/syntactic_concept_extractor/ray/test-data/expected/multi-package.parquet b/transforms/code/syntactic_concept_extractor/ray/test-data/expected/multi-package.parquet index 00ca0b62e..910e4cc03 100644 Binary files a/transforms/code/syntactic_concept_extractor/ray/test-data/expected/multi-package.parquet and b/transforms/code/syntactic_concept_extractor/ray/test-data/expected/multi-package.parquet differ diff --git a/transforms/code/syntactic_concept_extractor/ray/test-data/input/multi-package.parquet b/transforms/code/syntactic_concept_extractor/ray/test-data/input/multi-package.parquet index 8ef9f7cc2..fc96e51fa 100644 Binary files a/transforms/code/syntactic_concept_extractor/ray/test-data/input/multi-package.parquet and b/transforms/code/syntactic_concept_extractor/ray/test-data/input/multi-package.parquet differ diff --git a/transforms/code/syntactic_concept_extractor/sys-overview.png b/transforms/code/syntactic_concept_extractor/sys-overview.png new file mode 100644 index 000000000..410086429 Binary files /dev/null and b/transforms/code/syntactic_concept_extractor/sys-overview.png differ
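The most consequential runtime change in this patch is that `syntactic_concept_extractor_transform.py` now clones the tree-sitter bindings on initialization instead of reading them from a checked-in `input/tree-sitter-bindings` directory, and removes the clone at process exit. A minimal standalone sketch of that pattern, with the repository URL and the `RUNTIME_HOST_ARCH` variable taken from the diff and everything else assumed:

```py
import atexit
import os
import shutil
import subprocess
import uuid


def ensure_tree_sitter_bindings() -> str:
    """Clone the bindings into a unique scratch directory and schedule its cleanup."""
    bindings_dir = f"tree-sitter-bindings-{uuid.uuid4()}"
    result = subprocess.run(
        ["git", "clone", "https://github.com/pankajskku/tree-sitter-bindings.git", bindings_dir]
    )
    if result.returncode != 0:
        raise RuntimeError(f"Failed to clone tree-sitter bindings into {bindings_dir}")
    # Remove the scratch clone when the interpreter exits, mirroring the atexit call in the patch.
    atexit.register(shutil.rmtree, bindings_dir, ignore_errors=True)
    return bindings_dir


bindings_dir = ensure_tree_sitter_bindings()
# Per-architecture layout, as in the transform: <clone>/<arch>/<language>-bindings.so
bindings_path = os.path.join(bindings_dir, os.environ.get("RUNTIME_HOST_ARCH", "x86_64"))
print(f"Loading shared objects from {bindings_path}")
```

Keying the clone directory on a fresh UUID presumably avoids collisions when several workers initialize in the same working directory, at the cost of one clone per process.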