from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import GPT4AllEmbeddings

# Declare paths
pdf_data_path = "/data"
vector_db_path = "vectorstores/db_faiss"


# Function 1: build a vector DB from a single piece of text
def create_db_from_text():
    raw_text = """
    Thinh created you who is a chatbot at Resvu,
    """

    # Split the text into chunks
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=100,
        chunk_overlap=20,
        length_function=len,
    )
    chunks = text_splitter.split_text(raw_text)

    # Embedding
    embedding_model = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")

    # Store the chunks in a FAISS vector DB
    db = FAISS.from_texts(texts=chunks, embedding=embedding_model)
    db.save_local(vector_db_path)
    return db


# Define the file types you want to load
file_types = ["*.pdf", "*.txt", "*.doc", "*.docx"]


# Function 2: build a vector DB from every matching file in the data directory
def create_db_from_files():
    # Alternative: scan the whole directory with a single loader
    # loader = DirectoryLoader(pdf_data_path, glob=file_types, loader_cls=PyPDFLoader)
    # documents = loader.load()

    # Create a loader for each file type
    loaders = []
    for file_type in file_types:
        loader = DirectoryLoader(
            pdf_data_path, glob=file_type, loader_cls=UnstructuredFileLoader
        )
        loaders.append(loader)

    # Load all documents
    documents = []
    for loader in loaders:
        documents.extend(loader.load())

    # Split the documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)

    # Embedding
    embedding_model = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")

    # Store the chunks in a FAISS vector DB
    db = FAISS.from_documents(chunks, embedding_model)
    db.save_local(vector_db_path)
    return db
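

# --- Usage sketch (not part of the original script) ---
# A minimal example of reading the saved index back and querying it. The
# helper name `query_db` and its parameters are illustrative assumptions;
# `allow_dangerous_deserialization=True` is required by recent
# langchain_community releases when loading a pickled FAISS index that
# you created yourself.
def query_db(query, k=3):
    # The same embedding model must be used at query time as at index time
    embedding_model = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")

    # Load the index written by create_db_from_text / create_db_from_files
    db = FAISS.load_local(
        vector_db_path,
        embedding_model,
        allow_dangerous_deserialization=True,
    )

    # Return the k chunks closest to the query in embedding space
    return db.similarity_search(query, k=k)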
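

if __name__ == "__main__":
    # Hypothetical driver: build the index from the files under
    # pdf_data_path, then run a sample query. The query string below is a
    # placeholder assumption, not from the original tutorial.
    create_db_from_files()
    for doc in query_db("Who created the chatbot?"):
        print(doc.page_content)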