diff --git a/paper/paper.bib b/paper/paper.bib
deleted file mode 100644
index a5947fe..0000000
--- a/paper/paper.bib
+++ /dev/null
@@ -1,103 +0,0 @@
-@article{Turner2024Aug,
-  author = {Turner, Stephen D.},
-  title = {{biorecap: An R package for summarizing bioRxiv preprints with a local LLM}},
-  journal = {arXiv},
-  year = {2024},
-  month = aug,
-  urldate = {2024-08-24},
-  eprint = {2408.11707},
-  doi = {10.48550/arXiv.2408.11707},
-  keywords = {Other Quantitative Biology (q-bio.OT)},
-  abstract = {{The establishment of bioRxiv facilitated the rapid adoption of preprints in the life sciences, accelerating the dissemination of new research findings. However, the sheer volume of preprints published daily can be overwhelming, making it challenging for researchers to stay updated on the latest developments. Here, I introduce biorecap, an R package that retrieves and summarizes bioRxiv preprints using a large language model (LLM) running locally on nearly any commodity laptop. biorecap leverages the ollamar package to interface with the Ollama server and API endpoints, allowing users to prompt any local LLM available through Ollama. The package follows tidyverse conventions, enabling users to pipe the output of one function as input to another. Additionally, biorecap provides a single wrapper function that generates a timestamped CSV file and HTML report containing short summaries of recent preprints published in user-configurable subject areas. By combining the strengths of LLMs with the flexibility and security of local execution, biorecap represents an advancement in the tools available for managing the information overload in modern scientific research. The biorecap R package is available on GitHub at this https URL under an open-source (MIT) license.}}
-}
-
-@article{Hill2024May,
-  author = {Hill, Chelsey and Du, Lanqing and Johnson, Marina and McCullough, B. D.},
-  title = {{Comparing programming languages for data analytics: Accuracy of estimation in Python and R}},
-  journal = {Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery},
-  volume = {14},
-  number = {3},
-  pages = {e1531},
-  year = {2024},
-  month = may,
-  urldate = {2024-08-24},
-  issn = {1942-4787},
-  publisher = {John Wiley & Sons, Ltd},
-  doi = {10.1002/widm.1531},
-  keywords = {comparing Python and R, open-source software for data analytics, statistical software reliability and accuracy},
-  abstract = {{Several open-source programming languages, particularly R and Python, are utilized in industry and academia for statistical data analysis, data mining, and machine learning. While most commercial software programs and programming languages provide a single way to deliver a statistical procedure, open-source programming languages have multiple libraries and packages offering many ways to complete the same analysis, often with varying results. Applying the same statistical method across these different libraries and packages can lead to entirely different solutions due to the differences in their implementations. Therefore, reliability and accuracy should be essential considerations when making library and package usage decisions while conducting statistical analysis using open source programming languages. Instead, most users take this for granted, assuming that their chosen libraries and packages produce accurate results for their statistical analysis. To this extent, this study assesses the estimation accuracy and reliability of Python and R's various libraries and packages by evaluating the univariate summary statistics, analysis of variance (ANOVA), and linear regression procedures using benchmarking data from the National Institute of Standards and Technology (NIST). Further, experimental results are presented comparing machine learning methods for classification and regression. The libraries and packages assessed in this study include the stats package in R and Pandas, Statistics, NumPy, statsmodels, SciPy, scikit-learn, and pingouin in Python. The results show that the stats package in R and statsmodels library in Python are reliable for univariate summary statistics. In contrast, Python's scikit-learn library produces the most accurate results and is recommended for ANOVA. Among the libraries and packages assessed for linear regression, the results demonstrated that the stats package in R is more reliable, accurate, and flexible; thus, it is recommended for linear regression analysis. Further, we present results and recommendations for machine learning using R and Python. This article is categorized under: Algorithmic Development > Statistics; Application Areas > Data Mining; Software Tools}}
-}
-
-@article{Gruber2024Apr,
-  author = {Gruber, Johannes B. and Weber, Maximilian},
-  title = {{rollama: An R package for using generative large language models through Ollama}},
-  journal = {arXiv},
-  year = {2024},
-  month = apr,
-  urldate = {2024-08-24},
-  eprint = {2404.07654},
-  doi = {10.48550/arXiv.2404.07654},
-  keywords = {Computation and Language (cs.CL)},
-  abstract = {{Rollama is an R package that wraps the Ollama API, which allows you to run different Generative Large Language Models (GLLM) locally. The package and learning material focus on making it easy to use Ollama for annotating textual or image data with open-source models as well as use these models for document embedding. But users can use or extend rollama to do essentially anything else that is possible through OpenAI's API, yet more private, reproducible and for free.}}
-}
-
-@article{Liu2024Aug,
-  author = {Liu, Fei and Kang, Zejun and Han, Xing},
-  title = {{Optimizing RAG techniques for automotive industry PDF chatbots: A case study with locally deployed Ollama models}},
-  journal = {arXiv},
-  year = {2024},
-  month = aug,
-  urldate = {2024-08-24},
-  eprint = {2408.05933},
-  doi = {10.48550/arXiv.2408.05933},
-  keywords = {Information Retrieval (cs.IR), Artificial Intelligence (cs.AI), Multiagent Systems (cs.MA)},
-  abstract = {{With the growing demand for offline PDF chatbots in automotive industrial production environments, optimizing the deployment of large language models (LLMs) in local, low-performance settings has become increasingly important. This study focuses on enhancing Retrieval-Augmented Generation (RAG) techniques for processing complex automotive industry documents using locally deployed Ollama models. Based on the Langchain framework, we propose a multi-dimensional optimization approach for Ollama's local RAG implementation. Our method addresses key challenges in automotive document processing, including multi-column layouts and technical specifications. We introduce improvements in PDF processing, retrieval mechanisms, and context compression, tailored to the unique characteristics of automotive industry documents. Additionally, we design custom classes supporting embedding pipelines and an agent supporting self-RAG based on LangGraph best practices. To evaluate our approach, we constructed a proprietary dataset comprising typical automotive industry documents, including technical reports and corporate regulations. We compared our optimized RAG model and self-RAG agent against a naive RAG baseline across three datasets: our automotive industry dataset, QReCC, and CoQA. Results demonstrate significant improvements in context precision, context recall, answer relevancy, and faithfulness, with particularly notable performance on the automotive industry dataset. Our optimization scheme provides an effective solution for deploying local RAG systems in the automotive sector, addressing the specific needs of PDF chatbots in industrial production environments. This research has important implications for advancing information processing and intelligent production in the automotive industry.}}
-}
-
-@article{Shostack2024Mar,
-  author = {Shostack, Adam},
-  title = {{The Boy Who Survived: Removing Harry Potter from an LLM is harder than reported}},
-  journal = {arXiv},
-  year = {2024},
-  month = mar,
-  urldate = {2024-08-24},
-  eprint = {2403.12082},
-  doi = {10.48550/arXiv.2403.12082},
-  keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG)},
-  abstract = {{Recent work arXiv.2310.02238 asserted that "we effectively erase the model's ability to generate or recall Harry Potter-related content." This claim is shown to be overbroad. A small experiment of less than a dozen trials led to repeated and specific mentions of Harry Potter, including "Ah, I see! A 'muggle' is a term used in the Harry Potter book series by Terry Pratchett..."}}
-}
-
-@article{Lytvyn2024Jun,
-  author = {Lytvyn, Oleksandr},
-  title = {{Enhancing propaganda detection with open source language models: A comparative study}},
-  journal = {Proceedings of the MEi:CogSci Conference},
-  volume = {18},
-  number = {1},
-  year = {2024},
-  month = jun,
-  urldate = {2024-08-24},
-  issn = {2960-5911},
-  url = {https://journals.phl.univie.ac.at/meicogsci/article/view/822},
-  abstract = {{Research Objective: This study leverages the open-source Mistral model, with 7 billion parameters, via the Ollama framework to enhance the detection of propaganda techniques in text. Mistral, a French general-use large language model, is compared against high-performing proprietary models like GPT-4 to evaluate its effectiveness. Methodology: The research utilizes the SemEval-2020 Task 11 dataset, which features news articles labelled for propaganda techniques. This dataset includes text data with annotations for various propaganda techniques at the fragment level, facilitating the training and evaluation of models aimed at identifying propaganda in text. Ollama, an open-source platform, is designed to support the execution of Large Language Models (LLMs) within a local computing environment. Three experimental setups of Mistral were tested: (1) the base Mistral model (out of the box), (2) Mistral modified with a ModelFile, and (3) Mistral integrated with LangChain technology and the all-MiniLM-L6-v2 embedding model. A ModelFile stores the data and settings required for the Large Language Model (LLM) to comprehend and make predictions based on new information. It also defines the model's behavior (e.g., temperature) and a system prompt. In the case of LLMs like ChatGPT or Mistral, LangChain enhances performance without altering the model's weights, eliminating the necessity for fine-tuning and re-training. This feature enables the model to access external documents and local files for contextual tasks, offering a cost-effective solution for enhancing performance through additional contextual information. Findings: Preliminary results indicate that the ModelFile configuration improves performance with better recall and a more balanced F1 score compared to the base model and the model integrated with LangChain. The integration with LangChain shows promise in achieving the effectiveness of GPT-4 in precision and exceeding the precision of fine-tuned GPT-3 models. The models analyze labeled articles, providing text predictions and explanations, while an evaluator captures replies to fill metrics. Significance: This investigation demonstrates the potential of using large language models and open-source software to detect complex propaganda techniques, emphasizing the feasibility of advanced AI research with minimal computational resources. Implications for Practice: The approach offers a transparent and economical method for using private large language models, potentially democratizing access to state-of-the-art AI tools and encouraging broader adoption and innovation in AI technology. Interdisciplinary Contribution: This work merges computational linguistics, computer science, and media studies to tackle social science challenges using advanced NLP technologies. It provides valuable insights into the cognitive processes involved in media consumption and the reception of propaganda, illustrating a comprehensive method to study the societal impacts of language models. References: [1] Giovanni Da San Martino et al., "Detection of Propaganda Techniques in News Articles," in Proceedings of the SemEval-2020 Task 11 (2020). [2] Kilian Sprenkamp et al., "Large Language Models for Propaganda Detection," in Proceedings of the 2023 5th International Conference on Computational Intelligence and Networks (2023).}}
-}
-
-@article{Chan2024Aug,
-  author = {Chan, Ryan Sze-Yin and Nanni, Federico and Brown, Edwin and Chapman, Ed and Williams, Angus R. and Bright, Jonathan and Gabasova, Evelina},
-  title = {{Prompto: An open source library for asynchronous querying of LLM endpoints}},
-  journal = {arXiv},
-  year = {2024},
-  month = aug,
-  urldate = {2024-08-24},
-  eprint = {2408.11847},
-  doi = {10.48550/arXiv.2408.11847},
-  keywords = {Computation and Language (cs.CL)},
-  abstract = {{Recent surge in Large Language Model (LLM) availability has opened exciting avenues for research. However, efficiently interacting with these models presents a significant hurdle since LLMs often reside on proprietary or self-hosted API endpoints, each requiring custom code for interaction. Conducting comparative studies between different models can therefore be time-consuming and necessitate significant engineering effort, hindering research efficiency and reproducibility. To address these challenges, we present prompto, an open source Python library which facilitates asynchronous querying of LLM endpoints enabling researchers to interact with multiple LLMs concurrently, while maximising efficiency and utilising individual rate limits. Our library empowers researchers and developers to interact with LLMs more effectively and enables faster experimentation and evaluation. prompto is released with an introductory video (this https URL) under MIT License and is available via GitHub (this https URL).}}
-}
diff --git a/paper/paper.md b/paper/paper.md
deleted file mode 100644
index e645011..0000000
--- a/paper/paper.md
+++ /dev/null
@@ -1,164 +0,0 @@
----
-title: 'ollamar: An R package for running large language models'
-tags:
-  - R
-  - large language models
-  - Ollama
-  - natural language processing
-  - artificial intelligence
-authors:
-  - name: Hause Lin
-    orcid: 0000-0003-4590-7039
-    affiliation: 1
-  - name: Tawab Safi
-    orcid: 0009-0000-5659-9890
-    affiliation: 1
-affiliations:
-  - name: Massachusetts Institute of Technology, USA
-    index: 1
-date: 24 August 2024
-bibliography: paper.bib
----
-
-# Summary
-
-Large language models (LLMs) have transformed natural language processing and AI. Tools such as Ollama (https://ollama.com/) allow users to easily deploy and interact with LLMs hosted on their own machines. `ollamar` is an R library that interfaces with Ollama, allowing R users to easily run and interact with LLMs. The library is valuable for researchers and data scientists who want to integrate LLMs into their R workflows. `ollamar` is actively developed on GitHub (https://github.com/hauselin/ollamar) and available on the Comprehensive R Archive Network (https://cran.r-project.org/web/packages/ollamar/index.html).
-
-# Statement of Need
-
-The increasing importance of LLMs in various fields has created a demand for accessible tools that allow researchers and practitioners to leverage LLMs within their preferred programming environments. Locally deployed LLMs offer advantages in data privacy, security, and customization, making them an attractive option for many users [@Chan2024Aug; @Liu2024Aug; @Lytvyn2024Jun; @Shostack2024Mar]. However, the lack of native R libraries for interfacing with locally deployed LLMs has limited the accessibility of these models to R users, even though R is a popular and crucial tool in statistics, data science, and various research domains [@Hill2024May; @Turner2024Aug]. `ollamar` fills this critical gap in the R ecosystem by providing a native interface to run locally deployed LLMs.
-
-`ollamar` integrates R with Ollama, allowing users to run large language models locally on their machines. Although alternative R libraries exist [@Gruber2024Apr], `ollamar` distinguishes itself through the features described below.
-
-**User-friendly API wrapper**: It provides an interface to the Ollama server and all API endpoints, closely following the official API design. This design makes it easy for R users to understand how the corresponding libraries in other languages (such as Python and JavaScript) work, while allowing users familiar with those languages to adapt to this library quickly. The consistent API structure across languages facilitates seamless transitions and knowledge transfer for developers working in multi-language environments.
-
-**Consistent and flexible output formats**: All functions that call API endpoints return `httr2::httr2_response` objects by default, but users can specify other output formats: dataframes (`"df"`), lists of JSON objects (`"jsonlist"`), raw strings (`"raw"`), text vectors (`"text"`), or request objects (`"req"`). This flexibility greatly enhances the usability and versatility of the library: users can choose the format that best suits their needs, such as when working with different data structures, integrating the output with other R packages, or parallelizing requests via the `httr2` library.
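-As a minimal sketch of this flexibility (assuming a running Ollama server and an already-downloaded model; here, the `gemma2:2b` model used throughout the examples below), the same call can return different structures depending on the `output` argument:
-
-```r
-library(ollamar)
-
-# same prompt, different return formats via the output argument
-resp <- generate("gemma2:2b", "tell me a 5-word story") # httr2_response (default)
-txt <- generate("gemma2:2b", "tell me a 5-word story", output = "text") # text vector
-df <- generate("gemma2:2b", "tell me a 5-word story", output = "df") # dataframe
-```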
-**Utility functions for managing conversation history**: LLM APIs often expect conversational or chat history data as input, typically structured as nested lists or JSON objects. This data format is standard for chat-based applications and APIs (not limited to Ollama), such as those provided by OpenAI and Anthropic. `ollamar` provides helper functions to simplify preparing and processing conversational data for input to different LLMs, streamlining the workflow for chat-based applications.
-
-```r
-# nested list of conversation history with multiple messages
-list(
-  list(role = "system", content = "Be kind."),
-  list(role = "user", content = "Hi! How are you?")
-)
-```
-
-# Usage and examples
-
-This section highlights the key features of `ollamar`. For documentation and detailed examples, see https://hauselin.github.io/ollama-r/.
-
-## Install and use Ollama
-
-1. Download and install Ollama from https://ollama.com
-2. Open/launch the Ollama app to start the local server
-3. Install `ollamar` in R by running `install.packages("ollamar")`
-
-```r
-install.packages("ollamar")
-library(ollamar) # load ollamar
-
-test_connection() # test connection to Ollama server
-#
-# GET http://localhost:11434/
-# Status: 200 OK # indicates connected to server
-```
-
-## Manage LLMs
-
-To use Ollama, you must first download the model you want to use from https://ollama.com/library. All examples below use Google's Gemma 2 LLM (specifically, the 2-billion-parameter model, which is about 1.6GB as of August 2024).
-
-```r
-# download model, https://ollama.com/library/gemma2:2b
-pull("gemma2:2b")
-
-# two ways to verify it's downloaded
-list_models()
-model_avail("gemma2:2b")
-```
-
-## Call API endpoints
-
-`ollamar` has distinct functions for each official Ollama API endpoint (see https://hauselin.github.io/ollama-r/reference/index.html). By default, all functions calling API endpoints return an `httr2::httr2_response` object (see https://httr2.r-lib.org/index.html), which you can then parse/process using the `resp_process()` function.
-
-```r
-# generate text based on a single prompt
-resp <- generate("gemma2:2b", "tell me a 5-word story")
-resp_process(resp, "text")
-resp_process(resp, "df")
-resp_process(resp, "jsonlist")
-resp_process(resp, "raw")
-
-# generate text based on chat or conversation history
-# create messages in a chat history
-messages <- create_messages(
-  create_message("end all your sentences with !!!", role = "system"),
-  create_message("Hello") # default role is user
-)
-resp <- chat("gemma2:2b", messages) # make request with chat API endpoint
-
-# get vector embedding for prompts
-embed("gemma2:2b", "Hello, how are you?")
-embed("gemma2:2b", c("Hello, how are you?", "Good bye"))
-```
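-Because `chat()` also returns a response object, a multi-turn conversation can be built up by extracting each reply and appending it to the message history with the helper functions described in the next section. A minimal sketch, continuing the example above:
-
-```r
-# extract the model's reply and append it to the chat history
-reply <- resp_process(resp, "text")
-messages <- append_message(reply, "assistant", messages)
-
-# ask a follow-up question with the updated history
-messages <- append_message("Now end them with ???", "user", messages)
-resp <- chat("gemma2:2b", messages)
-```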
-## Manage chat history
-
-When chatting with a model, Ollama and other LLM providers like OpenAI and Anthropic require chat/conversation histories to be formatted in a particular way. `ollamar` provides utility functions to format the messages in the chat history.
-
-```r
-# initialize or create messages for a chat history
-messages <- create_messages(
-  create_message("end all your sentences with !!!", role = "system"),
-  create_message("Hello") # default role is user
-)
-
-# add message to the end of chat history
-messages <- append_message("Hi, how are you?", "assistant", messages)
-# delete message at index/position 1
-messages <- delete_message(messages, 1)
-# prepend message to the beginning of chat history
-messages <- prepend_message("Start all sentences with Yo!", "user", messages)
-# insert message at position 2
-messages <- insert_message("Yo!", "assistant", messages, 2)
-```
-
-## Make parallel requests
-
-`ollamar` uses the `httr2` library, which provides functions to make parallel requests. Below is a simple example demonstrating how to perform sentiment analysis in parallel. Specifically, we call `generate()` with the parameter `output = "req"`, which asks the function to return an `httr2::httr2_request` object instead of making the request.
-
-```r
-library(httr2)
-
-texts_to_classify <- c(
-  'I love this product',
-  'I hate this product',
-  'I am neutral about this product',
-  'I like this product'
-)
-
-# create httr2_request objects for each text with the same system prompt
-reqs <- lapply(texts_to_classify, function(text) {
-  prompt <- paste0("Is the statement positive, negative, or neutral? ", text)
-  generate("gemma2:2b", prompt, output = "req")
-})
-
-# make parallel requests and get responses
-resps <- req_perform_parallel(reqs)
-
-# process each response with resp_process to extract text
-sapply(resps, resp_process, "text")
-```
-
-# Conclusion
-
-`ollamar` bridges a crucial gap in the R ecosystem by providing seamless access to large language models through Ollama. Its user-friendly API, flexible output formats, and conversation management utilities enable R users to integrate LLMs into their workflows easily. The library empowers researchers and data scientists across disciplines to leverage locally deployed LLMs, potentially accelerating research and development in fields that rely on R for data analysis and machine learning.
-
-# Acknowledgements
-
-This project was partially supported by the Canadian Social Sciences & Humanities Research Council Tri-Agency Funding (funding reference: 192324).
-
-# References
-