I am trying to create a chat app through which I can query PDF documents. The code for upserting documents into Pinecone is as follows:
import io
import logging
import os
from urllib.parse import unquote_plus

import boto3
import dotenv
import pinecone
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI
from pypdf import PdfReader

logger = logging.getLogger()
logger.setLevel(logging.INFO)


def handler(event, context):
    dotenv.load_dotenv()
    s3 = boto3.client("s3")
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_ENV"])
    index_name = "golf-knowledge-base"
    bucket_name = 'aidata-pdf-files'
    logger.info(event)

    for record in event["Records"]:
        s3_key = unquote_plus(record["s3"]["object"]["key"])
        logger.info("Extracting text:" + s3_key)
        object = s3.get_object(Bucket=bucket_name, Key=s3_key)
        fileObject = object["Body"].read()
        reader = PdfReader(io.BytesIO(fileObject))

        pages = []
        for page in reader.pages:
            pages.append(
                Document(
                    page_content=page.extract_text(),
                    metadata={'source': s3_key, 'page': reader.get_page_number(page)}
                )
            )

        document_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
        documents = document_splitter.split_documents(pages)

        # test pinecone index
        logger.info("checking pinecone index")
        pinecone.whoami()

        # create index if it does not exist
        if index_name not in pinecone.list_indexes():
            logger.info("Creating Pinecone Index : " + index_name)
            pinecone.create_index(name=index_name, dimension=1536, metric="cosine")

        # connect to index
        index = pinecone.Index(index_name)
        logger.info(index.describe_index_stats())

        logger.info("creating embeddings")
        vectors = []
        for doc in documents:
            embeds = get_embedding(client, text=doc.page_content)
            # Generate consistent keys to avoid duplicate ingestion
            id = doc.metadata["source"].split(".")[:-1][0].replace(" ", "-").lower() + "-" + str(doc.metadata["page"])
            vector = {'id': id, 'values': embeds, 'metadata': doc.metadata}
            vectors.append(vector)

        # Store embeddings in pinecone
        response = index.upsert(vectors)
        print(response)

    return {"status": 200, "message": "stored to db"}


def get_embedding(client, text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding
And this is how I am querying the documents:
from openai import OpenAI
from pinecone import Pinecone
from langchain_community.vectorstores import Pinecone as vs_Pinecone
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain


def pinecone_query(query):
    pc = Pinecone(api_key="Key")
    index = pc.Index("name")
    OPENAI_API_KEY = "Key"
    client = OpenAI(api_key=OPENAI_API_KEY)

    # print(pc.whoami())
    print(pc.list_indexes())

    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vectorstore = vs_Pinecone(index, embeddings.embed_query, "text")
    docs = vectorstore.similarity_search(query)
    print('docs len', len(docs))  # this returns 0
    for doc in docs:
        print(f"{doc.page_content}\n")

    chat_model_name = "gpt-3.5-turbo"
    llm = ChatOpenAI(model_name=chat_model_name, openai_api_key=OPENAI_API_KEY, temperature=0.0)
    chain = load_qa_chain(llm, chain_type="stuff")
    res = chain.run(input_documents=docs, question=query)
    return res
It gives me an answer, but strangely the returned matched documents are empty: similarity_search comes back with zero documents. And I get this warning:
WARNING:langchain_community.vectorstores.pinecone:Found document with no `text` key. Skipping.
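To narrow this down, here is a minimal sketch for checking what the upsert actually stored, assuming the same v3 Pinecone client used in the query code; "some-file-0" is a placeholder id (the upsert builds ids like "<source without extension>-<page>"), so substitute one that really exists:

from pinecone import Pinecone

pc = Pinecone(api_key="Key")
index = pc.Index("golf-knowledge-base")

# Fetch one stored vector and print its metadata, to see which keys
# (e.g. 'source', 'page', 'text') were actually written at upsert time.
res = index.fetch(ids=["some-file-0"])
for vec_id, vec in res.vectors.items():
    print(vec_id, vec.metadata)

The third argument passed to vs_Pinecone ("text") is the metadata key the vectorstore reads page content from, so this shows whether that key is present in the stored metadata.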