Notes on trying a local LLM + RAG with Ollama

# WSL + Docker

# Build the python312 image from the Dockerfile in the current directory
docker build -t python312 .
docker images
# Run with GPU access, host directories mounted, and ports 8888 / 11434 (Ollama's default API port) published
docker run -it --name python312 --gpus all -h python312 -v /home/hoge:/home/hoge -v /mnt:/mnt -p 8888:8888 -p 11434:11434 python312
# SSH into the container (the IP depends on the Docker bridge network)
ssh 172.17.0.2
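
To confirm that --gpus all actually exposed the GPU inside the container, the devices can be listed from Python; a minimal sketch, assuming nvidia-smi is available in the image:

import subprocess

# List the GPUs visible inside the container (requires nvidia-smi in the image)
result = subprocess.run(["nvidia-smi", "-L"], capture_output=True, text=True)
print(result.stdout or result.stderr)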


# Install Ollama
ollama.com

curl -fsSL https://ollama.com/install.sh | sh
# Start the Ollama server (listens on port 11434 by default)
ollama serve
# Pull and run the gemma2 model (the first run downloads the weights)
ollama run gemma2
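
Before wiring up llama-index, it helps to check that the server on port 11434 is reachable; a minimal sketch that calls Ollama's /api/generate endpoint directly (assumes ollama serve is running and gemma2 has been pulled):

import json
import urllib.request

# Ask the local Ollama server for a single non-streamed completion
payload = json.dumps({
    "model": "gemma2",
    "prompt": "Say hello in one short sentence.",
    "stream": False,
}).encode("utf-8")

req = urllib.request.Request(
    "http://localhost:11434/api/generate",
    data=payload,
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["response"])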

# Install llama-index

$ cat requirements.txt
llama_index
llama-index-llms-ollama
llama-index-embeddings-huggingface
llama-index-llms-llama-cpp
docx2txt

pip install -r requirements.txt
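
A quick import check after installation; a minimal smoke test that only verifies the packages resolve (model weights are downloaded later, on first use):

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

print("llama-index imports OK")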

# Run the program

from llama_index.llms.ollama import Ollama

#llm = Ollama(model="phi3", request_timeout=60.0)
#llm = Ollama(model="lucas2024/llama-3-elyza-jp-8b:q5_k_m", request_timeout=60.0)
#llm = Ollama(model="llama3.1", request_timeout=60.0)
llm = Ollama(model="gemma2", request_timeout=60.0)

#response = llm.complete("What is the capital of France?")
# Ask in Japanese: "What is the highest mountain in Japan?"
response = llm.complete("日本で一番高い山はどこですか?")
print(response)
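
The same llm object can also stream tokens as they are generated instead of waiting for the full completion; a minimal sketch, assuming llama-index's stream_complete interface:

# Print tokens as they arrive from Ollama
for chunk in llm.stream_complete("日本で一番高い山はどこですか?"):
    print(chunk.delta, end="", flush=True)
print()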

from llama_index.embeddings.huggingface import HuggingFaceEmbedding

#embed_model = HuggingFaceEmbedding(model_name="all-MiniLM-L6-v2")
#embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-large")
# multilingual-e5-small: a lightweight multilingual embedding model that handles Japanese
embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-small")
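
A small sanity check that the model downloads and produces vectors; a minimal sketch (multilingual-e5-small should return 384-dimensional embeddings):

# Embed one sentence and inspect the vector size
vec = embed_model.get_text_embedding("日本で一番高い山")
print(len(vec))  # expected: 384 for multilingual-e5-small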

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

# Load the documents under ./data and build an in-memory vector index
reader = SimpleDirectoryReader("data")
data = reader.load_data()
index = VectorStoreIndex.from_documents(data, embed_model=embed_model)
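
Re-embedding every document on each run gets slow as the data directory grows, so the index can be persisted and reloaded; a minimal sketch (the "storage" directory name is arbitrary):

from llama_index.core import StorageContext, load_index_from_storage

# Save the vector index to disk
index.storage_context.persist(persist_dir="storage")

# On a later run, reload it instead of rebuilding
storage_context = StorageContext.from_defaults(persist_dir="storage")
index = load_index_from_storage(storage_context, embed_model=embed_model)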

# Query: retrieve the 3 most similar chunks and stream the response
query_engine = index.as_query_engine(llm=llm, streaming=True, similarity_top_k=3)

# Interactive query loop (Ctrl+C to exit)
while True:
    req_msg = input("\n\n >")
    if req_msg == "":
        continue
    res_msg = query_engine.query(req_msg)
    res_msg.print_response_stream()
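
To see which chunks each answer was grounded on, the retrieved nodes can be printed inside the loop right after print_response_stream(); a minimal sketch (file_name comes from SimpleDirectoryReader's default metadata):

    # Still inside the while loop: list the retrieved source chunks and their similarity scores
    for node in res_msg.source_nodes:
        print("\n", node.score, node.node.metadata.get("file_name", ""))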