-
Notifications
You must be signed in to change notification settings - Fork 268
/
09_pinecone.py
149 lines (116 loc) · 4.48 KB
/
09_pinecone.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from dotenv import load_dotenv
load_dotenv()
import os
from pathlib import Path
import requests
from bs4 import BeautifulSoup
import pinecone
from llama_index import (
SimpleDirectoryReader,
LLMPredictor,
ServiceContext,
GPTVectorStoreIndex,
QuestionAnswerPrompt,
PineconeReader
)
from llama_index.vector_stores import PineconeVectorStore
from llama_index.storage.storage_context import StorageContext
from langchain.chat_models import ChatOpenAI
# reader = PineconeReader(
# api_key=os.getenv("PINECONE_API_KEY"),
# environment="us-west4-gcp"
# )
# docs_from_pinecone = reader.load_data(index_name="nietzsche")
# Pages on projekt-gutenberg.org (under nietzsch/wanderer) that this script
# scrapes and indexes. The last path segment of each URL (e.g.
# "wande002.html") is used as the page name throughout the script.
urls = [
    "https://www.projekt-gutenberg.org/nietzsch/wanderer/wanderer.html",
    "https://www.projekt-gutenberg.org/nietzsch/wanderer/wande002.html",
    "https://www.projekt-gutenberg.org/nietzsch/wanderer/wande003.html",
    "https://www.projekt-gutenberg.org/nietzsch/wanderer/wande004.html",
]
def scrape_book(urls):
    """Download each page in ``urls`` and save its text under ``book/``.

    For every URL the ``h1``/``h2``/``h3``/``p`` elements are extracted,
    their inner text is re-decoded (Latin-1 bytes read back as UTF-8), and
    the result is written, one element per line, to
    ``book/<last URL segment>.txt`` as UTF-8.

    Args:
        urls: Iterable of page URLs to fetch.

    Raises:
        requests.RequestException: If a page cannot be fetched (connection
            error, timeout, or an HTTP error status).
    """
    book_path = Path("book")
    for url in urls:
        # A timeout keeps one unresponsive server from hanging the script.
        req = requests.get(url, timeout=30)
        # Fail loudly instead of saving an HTTP error page as book text.
        req.raise_for_status()
        soup = BeautifulSoup(req.text, "html.parser")
        # Keep only the heading tags up to h3, and p tags.
        elements = soup.find_all(["h1", "h2", "h3", "p"])

        result = []
        for element in elements:
            raw = element.text
            try:
                # Reinterpret the text's Latin-1 bytes as UTF-8 (presumably
                # the response was decoded with the wrong charset).
                result.append(raw.encode("latin").decode("utf-8"))
            except UnicodeError:
                # Best effort: skip fragments that cannot be round-tripped.
                continue

        # exist_ok avoids the check-then-create race of a separate exists().
        book_path.mkdir(exist_ok=True)
        pagename = url.split("/")[-1]
        # Write UTF-8 explicitly; the platform default encoding may not be
        # able to represent the German text.
        with open(book_path / f"{pagename}.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(result))
def create_pages(urls):
    """Return the page name (text after the last ``/``) for every URL.

    Args:
        urls: Iterable of URL strings.

    Returns:
        List of page names, in the same order as ``urls``.
    """
    return [address.rsplit("/", 1)[-1] for address in urls]
def build_docs(pages):
    """Load the saved text file of every page into documents.

    Args:
        pages: Iterable of page names; each is read from
            ``book/<page>.txt``.

    Returns:
        Dict mapping each page name to the result of
        ``SimpleDirectoryReader(...).load_data()`` for its file.
    """
    return {
        name: SimpleDirectoryReader(input_files=[f"book/{name}.txt"]).load_data()
        for name in pages
    }
def build_context(model_name):
    """Create a ``ServiceContext`` backed by a ``ChatOpenAI`` model.

    Args:
        model_name: Model name passed to ``ChatOpenAI``.

    Returns:
        The ``ServiceContext`` built from an ``LLMPredictor`` that wraps the
        chat model (temperature fixed at 0).
    """
    chat_model = ChatOpenAI(temperature=0, model_name=model_name)
    predictor = LLMPredictor(llm=chat_model)
    return ServiceContext.from_defaults(llm_predictor=predictor)
def build_index(pages, docs):
    """Index each page's documents in Pinecone and return one index per page.

    Args:
        pages: Iterable of page names. Each name is used as the key into
            ``docs``, as the ``page`` metadata filter value, and as the id
            of the resulting index.
        docs: Object indexable by page name that yields the documents to
            index for that page.

    Returns:
        Dict mapping each page name to its ``GPTVectorStoreIndex``.
    """
    pinecone.init(
        api_key=os.getenv("PINECONE_API_KEY"),
        environment="us-west4-gcp",
    )
    # Create the Pinecone index once if you don't have one yet:
    #   pinecone.create_index("nietzsche", dimension=1536, metric="cosine")
    # (for the 1536 dimensions see
    # https://openai.com/blog/new-and-improved-embedding-model)
    remote_index = pinecone.Index("nietzsche")
    shared_context = build_context("gpt-3.5-turbo")

    indices_by_page = {}
    for page in pages:
        # Every page gets its own vector store view, filtered by page name.
        storage = StorageContext.from_defaults(
            vector_store=PineconeVectorStore(
                pinecone_index=remote_index,
                metadata_filters={"page": page},
            )
        )
        index = GPTVectorStoreIndex.from_documents(
            docs[page],
            storage_context=storage,
            service_context=shared_context,
        )
        index.index_struct.index_id = page
        indices_by_page[page] = index

    print("Indexing complete.")
    return indices_by_page
if __name__ == "__main__":
    # Uncomment the next line to download the book pages from Project
    # Gutenberg before indexing.
    # scrape_book(urls)

    # From here on the pages are assumed to already be saved in the local
    # book/ directory.
    pages = create_pages(urls)
    docs = build_docs(pages)
    indices = build_index(pages, docs)

    # Alternative query that uses the default prompt:
    # response = indices["wande002.html"].as_query_engine().query(
    #     "What are Nietzsche's view on religion? Answer in the original German text, and provide an English translation for the answer"
    # )

    # Custom question-answering prompt: the retrieved context is framed by
    # separator lines, followed by the instruction and the user's question.
    PROMPT_TEMPLATE = (
        "Here are the context information:"
        "\n-----------------------------\n"
        "{context_str}"
        "\n-----------------------------\n"
        "Answer the following question in the original German text, and provide an english translation and explanation in as instructive and educational way as possible: {query_str} \n"
    )
    QA_PROMPT = QuestionAnswerPrompt(PROMPT_TEMPLATE)

    # Only the index built for page "wande002.html" is queried.
    selected_index = indices["wande002.html"]
    query_engine = selected_index.as_query_engine(text_qa_template=QA_PROMPT)
    response = query_engine.query("What are important things according to Nietzsche?")

    # print() applies str() to the response itself.
    print(response)
    print(response.get_formatted_sources())