Notes on trying out a local LLM + RAG with Ollama
# WSL + Docker
```sh
# Build the Python 3.12 image from the Dockerfile in the current directory
docker build -t python312 .
docker images

# Run with GPU access, bind-mount the home dir and /mnt,
# and publish ports 8888 and 11434 (Ollama's default port)
docker run -it --name python312 --gpus all -h python312 \
    -v /home/hoge:/home/hoge -v /mnt:/mnt \
    -p 8888:8888 -p 11434:11434 python312

# SSH into the container via its bridge-network address
ssh 172.17.0.2
```
# Install Ollama
ollama.com
```sh
# Install Ollama, start the server, then pull and run the gemma2 model
curl -fsSL https://ollama.com/install.sh | sh
ollama serve     # keep this running; use another terminal for the next command
ollama run gemma2
```
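Since port 11434 was published in the `docker run` above, the Ollama server can also be hit directly over HTTP. A minimal Python sketch to confirm it responds, assuming the default endpoint and that gemma2 has already been pulled:

```python
import json
import urllib.request

# Assumes `ollama serve` is reachable on the default port 11434
# and that `ollama run gemma2` has already downloaded the model.
payload = {
    "model": "gemma2",
    "prompt": "What is the capital of France?",
    "stream": False,  # single JSON response instead of a chunk stream
}
req = urllib.request.Request(
    "http://localhost:11434/api/generate",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as res:
    print(json.load(res)["response"])
```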
# Install LlamaIndex
```sh
$ cat requirements.txt
llama_index
llama-index-llms-ollama
llama-index-embeddings-huggingface
llama-index-llms-llama-cpp
docx2txt

$ pip install -r requirements.txt
```
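requirements.txt also lists `llama-index-llms-llama-cpp`, which the script below doesn't actually use. If you'd rather load a GGUF model with llama.cpp directly instead of going through Ollama, a rough sketch would look like this (the model path is a placeholder, not from this memo):

```python
from llama_index.llms.llama_cpp import LlamaCPP

# Placeholder path; point this at a GGUF file you have actually downloaded.
llm = LlamaCPP(
    model_path="/home/hoge/models/model.gguf",
    temperature=0.1,
    max_new_tokens=256,
    context_window=4096,
    model_kwargs={"n_gpu_layers": -1},  # offload all layers to the GPU
)
print(llm.complete("What is the capital of France?"))
```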
# Run the program
```python
from llama_index.llms.ollama import Ollama

# Pick an LLM served by Ollama; the alternatives tried are left commented out
#llm = Ollama(model="phi3", request_timeout=60.0)
#llm = Ollama(model="lucas2024/llama-3-elyza-jp-8b:q5_k_m", request_timeout=60.0)
#llm = Ollama(model="llama3.1", request_timeout=60.0)
llm = Ollama(model="gemma2", request_timeout=60.0)

# Quick sanity check that the LLM answers at all
#response = llm.complete("What is the capital of France?")
response = llm.complete("日本で一番高い山はどこですか?")  # "What is the highest mountain in Japan?"
print(response)

from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model for the documents; multilingual-e5-small is small and handles Japanese
#embed_model = HuggingFaceEmbedding(model_name="all-MiniLM-L6-v2")
#embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-large")
embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-small")

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

# Load every file under ./data and build the vector index
reader = SimpleDirectoryReader("data")
data = reader.load_data()
index = VectorStoreIndex.from_documents(data, embed_model=embed_model)

# Interactive query loop: retrieve the top 3 chunks and stream the answer
query_engine = index.as_query_engine(llm=llm, streaming=True, similarity_top_k=3)
while True:
    req_msg = input("\n\n >")
    if req_msg == "":
        continue
    res_msg = query_engine.query(req_msg)
    res_msg.print_response_stream()
```
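Rebuilding the embeddings from `data/` on every start gets slow as the corpus grows. As an optional extension not in the original memo, the index built above can be persisted and reloaded, continuing from the `index`, `embed_model`, and `llm` variables in the script; `storage` is just an example directory name:

```python
from llama_index.core import StorageContext, load_index_from_storage

# Save the index built above (vectors + docstore) under ./storage
index.storage_context.persist(persist_dir="storage")

# On a later run, reload it instead of re-embedding everything in data/;
# pass the same embed_model so query embeddings stay compatible.
storage_context = StorageContext.from_defaults(persist_dir="storage")
index = load_index_from_storage(storage_context, embed_model=embed_model)
query_engine = index.as_query_engine(llm=llm, streaming=True, similarity_top_k=3)
```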