132 lines
4.4 KiB
Python
132 lines
4.4 KiB
Python
|
|
import json
|
|||
|
|
from abc import ABC
|
|||
|
|
from typing import List
|
|||
|
|
from langchain.docstore.document import Document
|
|||
|
|
from langchain.document_loaders.base import BaseLoader
|
|||
|
|
|
|||
|
|
|
|||
|
|
class Person:
    """A dialogue participant described by a name and an age."""

    def __init__(self, name, age):
        """
        Record the participant's attributes on the instance.

        :param name: the participant's name, stored as ``self.name``
        :param age: the participant's age, stored as ``self.age``
        """
        self.name, self.age = name, age
|
|||
|
|
|
|||
|
|
|
|||
|
|
class Dialogue:
    """
    In-memory model of a dialogue read from a text file.

    A dialogue is an ordered list of turns. Each turn is an object with a
    ``speaker`` attribute (which itself has a ``name``) and a ``message``
    attribute. The class can parse turns from a file and render them as
    text, a dictionary, or JSON.
    """

    def __init__(self, file_path: str):
        """
        :param file_path: path of the dialogue file read by ``parse_dialogue``
        """
        self.file_path = file_path
        self.turns = []

    def add_turn(self, turn):
        """
        Append a turn to the end of the dialogue.

        :param turn: object exposing ``speaker`` and ``message`` attributes
        :return: None
        """
        self.turns.append(turn)

    def parse_dialogue(self):
        """
        Read ``self.file_path`` (UTF-8) and append one Turn per speaker/message pair.

        Blank lines are ignored. The remaining lines are consumed in pairs:
        the first line of a pair is the speaker line, of which only the text
        before the first ':' is kept as the speaker name; the second line is
        the message, taken verbatim after stripping surrounding whitespace.
        One Person is created per distinct speaker name and shared by all of
        that speaker's turns.

        A trailing speaker line with no message after it produces no turn.

        :raises ValueError: if a speaker line contains no ':' separator
        :return: None
        """
        participants = {}
        # Name of the speaker whose message line is still awaited, or None
        # when the next non-blank line must be a speaker line.
        speaker_name = None

        with open(self.file_path, encoding='utf-8') as file:
            for line_number, raw_line in enumerate(file, start=1):
                line = raw_line.strip()
                if not line:
                    continue

                if speaker_name is None:
                    name, separator, _ = line.partition(':')
                    if not separator:
                        # Same exception type as a failed tuple unpacking,
                        # but with enough context to locate the bad line.
                        raise ValueError(
                            f"{self.file_path}, line {line_number}: "
                            f"expected '<speaker>:' but got {line!r}"
                        )
                    speaker_name = name
                    continue

                if speaker_name not in participants:
                    participants[speaker_name] = Person(speaker_name, None)
                self.add_turn(Turn(participants[speaker_name], line))

                # The pair is complete; the next non-blank line is a speaker.
                speaker_name = None

    def display(self):
        """Print every turn to stdout as ``<speaker name>: <message>``."""
        for turn in self.turns:
            print(f"{turn.speaker.name}: {turn.message}")

    def export_to_file(self, file_path):
        """
        Write every turn to a UTF-8 text file, one ``<name>: <message>`` line each.

        :param file_path: destination path; an existing file is overwritten
        :return: None
        """
        with open(file_path, 'w', encoding='utf-8') as file:
            file.writelines(
                f"{turn.speaker.name}: {turn.message}\n" for turn in self.turns
            )

    def to_dict(self):
        """
        :return: ``{"turns": [{"speaker": <name>, "message": <message>}, ...]}``
                 with the turns in dialogue order
        """
        return {
            "turns": [
                {"speaker": turn.speaker.name, "message": turn.message}
                for turn in self.turns
            ]
        }

    def to_json(self):
        """
        :return: the ``to_dict`` structure as a JSON string, indented by two
                 spaces, with non-ASCII characters written unescaped
        """
        return json.dumps(self.to_dict(), ensure_ascii=False, indent=2)

    def participants_to_export(self):
        """
        List the distinct speaker names of the dialogue.

        Names appear in order of first appearance so that the result is the
        same on every run; joining a ``set`` would make the order depend on
        string-hash randomisation.

        :return: the distinct speaker names joined by ``', '``
        """
        return ', '.join(dict.fromkeys(turn.speaker.name for turn in self.turns))
|
|||
|
|
|
|||
|
|
|
|||
|
|
class Turn:
    """One utterance in a dialogue: who spoke and what was said."""

    def __init__(self, speaker, message):
        """
        Record the turn's attributes on the instance.

        :param speaker: the speaker of this turn, stored as ``self.speaker``
        :param message: the text of this turn, stored as ``self.message``
        """
        self.speaker, self.message = speaker, message
|
|||
|
|
|
|||
|
|
|
|||
|
|
class DialogueLoader(BaseLoader, ABC):
    """Loader that exposes each turn of a dialogue file as a Document."""

    def __init__(self, file_path: str):
        """
        Parse the dialogue file immediately and keep the result.

        :param file_path: path of the dialogue file to parse
        """
        self.file_path = file_path
        parsed = Dialogue(file_path=file_path)
        parsed.parse_dialogue()
        self.dialogue = parsed

    def load(self) -> List[Document]:
        """
        Build one Document per parsed turn.

        The page content is the turn's message. The metadata holds a single
        ``"source"`` entry combining the dialogue file path, the turn's
        speaker name and the list of all participants.

        :return: the documents, in dialogue order
        """
        # The participant list is the same for every turn; compute it once.
        everyone = self.dialogue.participants_to_export()

        return [
            Document(
                page_content=turn.message,
                metadata={
                    "source": f"Dialogue File:{self.dialogue.file_path},"
                              f"speaker:{turn.speaker.name},"
                              f"participant:{everyone}"
                },
            )
            for turn in self.dialogue.turns
        ]
|