Spaces:

thinh111
/

ResvuChatbox

Runtime error

App Files Files Community

ResvuChatbox / process-documents.py

thinh111

initial

615a1d7 verified 3 months ago

raw

history blame

No virus

2.08 kB

	from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
	from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
	from langchain_community.vectorstores import FAISS
	from langchain_community.embeddings import GPT4AllEmbeddings

	# Module-level configuration.
	# Directory that create_db_from_files scans for source documents.
	pdf_data_path: str = "/data"
	# Location the FAISS index is written to via save_local().
	vector_db_path: str = "vectorstores/db_faiss"

	def create_db_from_text():
	    """Build a FAISS vector store from a hard-coded text snippet and save it.

	    The snippet is split on newlines into chunks, the chunks are embedded
	    with the GPT4All MiniLM model, and the resulting index is written to
	    ``vector_db_path``.

	    Returns:
	        The FAISS vector store built from the text chunks.
	    """
	    raw_text = """
	Thinh created you who is a chatbox at Resvu,
	"""

	    # Split on newlines: chunk size 100 with an overlap of 20, with
	    # lengths measured by len().
	    pieces = CharacterTextSplitter(
	        separator="\n",
	        chunk_size=100,
	        chunk_overlap=20,
	        length_function=len,
	    ).split_text(raw_text)

	    # Embedding model used to vectorize every chunk.
	    embedder = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")

	    # Index the chunks in FAISS and persist the index to disk.
	    store = FAISS.from_texts(texts=pieces, embedding=embedder)
	    store.save_local(vector_db_path)
	    return store

	# File extensions to ingest; create_db_from_files builds one loader per entry.
	file_types = [
	    ".pdf",
	    ".txt",
	    ".doc",
	    ".docx",
	]

	def create_db_from_files():
	    """Build a FAISS vector store from the files under ``pdf_data_path``.

	    Each entry of ``file_types`` is searched for recursively, the matching
	    files are loaded with ``UnstructuredFileLoader``, split into overlapping
	    chunks, embedded with the GPT4All MiniLM model, and the resulting index
	    is saved to ``vector_db_path``.

	    Returns:
	        The FAISS vector store built from the document chunks.

	    Raises:
	        ValueError: If no text chunks could be produced, i.e. no files
	            matching ``file_types`` were found or they contained no text.
	    """
	    documents = []
	    for file_type in file_types:
	        # DirectoryLoader expects a glob pattern, not a bare extension:
	        # ".pdf" on its own only matches a file literally named ".pdf".
	        # Entries that already are patterns are passed through unchanged.
	        pattern = file_type if "*" in file_type else f"**/*{file_type}"
	        loader = DirectoryLoader(
	            pdf_data_path,
	            glob=pattern,
	            loader_cls=UnstructuredFileLoader,
	        )
	        documents.extend(loader.load())

	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
	    chunks = text_splitter.split_documents(documents)

	    # Fail early with a clear message; building a FAISS index from an empty
	    # list would otherwise fail later with a much less helpful error.
	    if not chunks:
	        raise ValueError(
	            f"No text chunks produced from files matching {file_types} "
	            f"under {pdf_data_path!r}"
	        )

	    # Embed the chunks, index them in FAISS and persist the index to disk.
	    embedding_model = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")
	    db = FAISS.from_documents(chunks, embedding_model)
	    db.save_local(vector_db_path)
	    return db