132 lines
4.4 KiB
Python
132 lines
4.4 KiB
Python
import json
|
||
from abc import ABC
|
||
from typing import List
|
||
from langchain.docstore.document import Document
|
||
from langchain.document_loaders.base import BaseLoader
|
||
|
||
|
||
class Person:
    """A participant in a dialogue.

    :param name: name used to identify the participant.
    :param age: age of the participant; defaults to ``None`` for callers
        that do not know it.
    """

    def __init__(self, name, age=None):
        self.name = name
        self.age = age

    def __repr__(self):
        return f"{type(self).__name__}(name={self.name!r}, age={self.age!r})"
|
||
|
||
|
||
class Dialogue:
    """Ordered list of dialogue turns read from a text file.

    ``file_path`` is the path given to the constructor and ``turns`` is the
    list of turn objects in the order they were added. Each turn is expected
    to expose ``speaker`` (an object with a ``name``) and ``message``.
    """

    def __init__(self, file_path: str):
        self.file_path = file_path
        self.turns = []

    def add_turn(self, turn):
        """Append a single turn to the end of the dialogue.

        :param turn: object exposing ``speaker`` and ``message``.
        :return: None
        """
        self.turns.append(turn)

    def parse_dialogue(self):
        """Read ``self.file_path`` and append one turn per speaker/message pair.

        Blank lines are skipped. The remaining lines are consumed in pairs:
        the first line of a pair is the speaker line, the second line is that
        speaker's message. The speaker name is the text before the first
        ``:`` on the speaker line; anything after that colon is ignored. A
        speaker line without a colon is used whole instead of raising.

        One ``Person`` is created per distinct speaker name and shared by all
        of that speaker's turns. A trailing speaker line with no message
        after it produces no turn.

        Note that this is not the single-line ``speaker: message`` layout
        written by ``export_to_file``.

        :return: None
        """
        participants = {}
        speaker_name = None

        with open(self.file_path, encoding='utf-8') as file:
            for raw_line in file:
                line = raw_line.strip()
                if not line:
                    continue

                if speaker_name is None:
                    # partition never fails, unlike unpacking split(':', 1),
                    # which raises ValueError when the line has no colon.
                    speaker_name = line.partition(':')[0]
                    continue

                if speaker_name not in participants:
                    participants[speaker_name] = Person(speaker_name, None)
                self.add_turn(Turn(participants[speaker_name], line))

                # The pair is complete; the next non-blank line is a speaker.
                speaker_name = None

    def display(self):
        """Print every turn as ``<speaker name>: <message>``."""
        for turn in self.turns:
            print(f"{turn.speaker.name}: {turn.message}")

    def export_to_file(self, file_path):
        """Write every turn to *file_path* as one ``<speaker name>: <message>`` line.

        :param file_path: destination path; it is opened in ``'w'`` mode.
        :return: None
        """
        with open(file_path, 'w', encoding='utf-8') as file:
            file.writelines(
                f"{turn.speaker.name}: {turn.message}\n" for turn in self.turns
            )

    def to_dict(self):
        """Return ``{"turns": [{"speaker": name, "message": text}, ...]}``."""
        return {
            "turns": [
                {"speaker": turn.speaker.name, "message": turn.message}
                for turn in self.turns
            ]
        }

    def to_json(self):
        """Return ``to_dict()`` serialised as indented, non-ASCII-escaped JSON."""
        return json.dumps(self.to_dict(), ensure_ascii=False, indent=2)

    def participants_to_export(self):
        """Return the distinct speaker names joined with ``', '``.

        Names appear in order of first appearance so the result is the same
        on every run; joining a ``set`` would make the order depend on string
        hashing.

        :return: comma-separated speaker names, or ``''`` when there are no turns.
        """
        unique_names = dict.fromkeys(turn.speaker.name for turn in self.turns)
        return ', '.join(unique_names)
|
||
|
||
|
||
class Turn:
    """One utterance in a dialogue: who spoke and what was said."""

    def __init__(self, speaker, message):
        # Both values are stored unchanged as public attributes.
        self.speaker, self.message = speaker, message
|
||
|
||
|
||
class DialogueLoader(BaseLoader, ABC):
    """Loader that exposes each turn of a dialogue file as a Document."""

    def __init__(self, file_path: str):
        """Parse the dialogue stored at *file_path* and keep it on the loader."""
        parsed = Dialogue(file_path=file_path)
        parsed.parse_dialogue()
        self.file_path = file_path
        self.dialogue = parsed

    def load(self) -> List[Document]:
        """Build one Document per turn of the parsed dialogue.

        The page content is the turn's message. The metadata holds a single
        ``"source"`` entry that combines the dialogue file path, the turn's
        speaker name and the joined list of all participants.
        """
        everyone = self.dialogue.participants_to_export()

        return [
            Document(
                page_content=turn.message,
                metadata={
                    "source": f"Dialogue File:{self.dialogue.file_path},"
                              f"speaker:{turn.speaker.name},"
                              f"participant:{everyone}"
                },
            )
            for turn in self.dialogue.turns
        ]
|