checkpoint

This commit is contained in:
vuongps38770
2025-12-25 18:06:29 +07:00
commit 31de8b0d84
34 changed files with 3209 additions and 0 deletions

44
src/__init__.py Normal file
View File

@@ -0,0 +1,44 @@
"""
src - Game Generator Core Package
"""
from src.core import GameCore
from src.game_registry import (
GameRegistry,
get_registry,
reload_games,
get_active_game_types,
get_active_type_ids,
get_game_by_id,
get_game,
id_to_type,
type_to_id
)
from src.llm_config import ModelConfig, get_llm, get_default_config, create_config
from src.validator import QuoteValidator, quick_validate
# Names re-exported at package level; keep this list in sync with the imports above.
__all__ = [
    # Core
    "GameCore",
    # Registry
    "GameRegistry",
    "get_registry",
    "reload_games",
    "get_active_game_types",
    "get_active_type_ids",
    "get_game_by_id",
    "get_game",
    "id_to_type",
    "type_to_id",
    # LLM Config
    "ModelConfig",
    "get_llm",
    "get_default_config",
    "create_config",
    # Validator
    "QuoteValidator",
    "quick_validate",
]

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

513
src/core.py Normal file
View File

@@ -0,0 +1,513 @@
"""
core.py - Simple Game Generator Core
3 USE CASES:
1. run_multi() - Analyze + generate nhiều games phù hợp
2. run_single() - Analyze + generate 1 game tốt nhất (1 API call)
3. generate() - Generate 1 game cụ thể (không analyze)
"""
import os
import json
import time
from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass
from langchain_core.prompts import ChatPromptTemplate
from .llm_config import ModelConfig, get_llm, get_default_config
from .game_registry import get_registry, get_game, get_analyzer_context
from .validator import QuoteValidator
from .logger import logger
@dataclass
class TokenUsage:
    """Accumulates LLM token counts across one or more API calls."""

    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0

    def add(self, usage: Dict[str, int]):
        """Merge a provider usage dict into the running totals.

        Accepts both OpenAI-style keys (prompt_tokens/completion_tokens)
        and Gemini-style keys (input_tokens/output_tokens).
        FIX: tolerates a missing/None dict and None values — some providers
        return ``None`` for absent counters, which previously raised
        TypeError on ``+=``.
        """
        if not usage:
            return
        self.prompt_tokens += int(usage.get("prompt_tokens") or usage.get("input_tokens") or 0)
        self.completion_tokens += int(usage.get("completion_tokens") or usage.get("output_tokens") or 0)
        self.total_tokens = self.prompt_tokens + self.completion_tokens

    def to_dict(self) -> Dict[str, int]:
        """Return the counters as a plain JSON-serializable dict."""
        return {"prompt_tokens": self.prompt_tokens, "completion_tokens": self.completion_tokens, "total_tokens": self.total_tokens}
class GameCore:
"""
Simple Game Generator.
Usage:
core = GameCore()
# 1. Generate nhiều games (analyze first)
result = core.run_multi(text)
# 2. Generate 1 game tốt nhất (1 API call)
result = core.run_single(text)
# 3. Generate 1 game cụ thể
result = core.generate("quiz", text)
"""
def __init__(self, llm_config: Optional[Union[ModelConfig, Dict, str]] = None):
    """Set up the LLM client, quote validator and game registry."""
    resolved = self._parse_config(llm_config)
    self.llm_config = resolved
    self.llm = get_llm(resolved)
    self.validator = QuoteValidator()
    self.registry = get_registry()
    print(f"🤖 LLM: {self.llm_config.provider}/{self.llm_config.model_name}")
def _parse_config(self, config) -> ModelConfig:
    """Normalize any accepted config form (None/ModelConfig/str/dict) into a ModelConfig."""
    if config is None:
        # No explicit config: pick a provider based on which API key is in the env.
        for env_var, provider in (("GOOGLE_API_KEY", "gemini"), ("OPENAI_API_KEY", "openai")):
            if os.getenv(env_var):
                return get_default_config(provider)
        return get_default_config("ollama")
    if isinstance(config, ModelConfig):
        return config
    if isinstance(config, str):
        return get_default_config(config)
    if isinstance(config, dict):
        return ModelConfig(**config)
    raise ValueError(f"Invalid config: {type(config)}")
# ============== 1. RUN MULTI (Analyze + Generate nhiều games) ==============
def run_multi(
    self,
    text: str,
    enabled_games: Optional[List[str]] = None,
    max_items: int = 3,
    min_score: int = 20,
    validate: bool = True,
    debug: bool = False
) -> Dict[str, Any]:
    """
    Analyze the text, then generate every game type that scores well.

    Args:
        text: Source text the games are generated from.
        enabled_games: Restrict analysis to these types (default: all active).
        max_items: Max items to generate per game.
        min_score: Analyzer score (0-100) a game must reach to be generated.
        validate: Drop items whose original_quote is not found in the text.
        debug: Print intermediate LLM output.

    Returns: {success, games, results, errors, token_usage, llm}
    """
    tracker = TokenUsage()
    errors = []
    # 1. Analyze (also returns metadata)
    available = enabled_games or self.registry.get_game_types()
    logger.info(f"Analyzing text for multi-gen. Available games: {available}")
    games, scores, metadata, err = self._analyze(text, available, min_score, tracker, debug)
    errors.extend(err)
    if not games:
        # Nothing cleared min_score — report failure but still surface metadata.
        logger.warning("Analyzer found no suitable games matches.")
        return self._result(False, [], {}, errors, tracker, metadata=metadata)
    logger.info(f"Analyzer selected: {games}")
    # 2. Generate
    results, err = self._generate_multi(games, text, max_items, tracker, debug)
    errors.extend(err)
    # 3. Validate
    if validate:
        results = self._validate(results, text)
    # Check if any game has items
    has_items = any(data.get("items", []) for data in results.values() if isinstance(data, dict))
    return self._result(has_items, games, results, errors, tracker, scores, metadata)
# ============== 2. RUN SINGLE (1 API call: Analyze + Generate 1 game) ==============
def run_single(
    self,
    text: str,
    enabled_games: Optional[List[str]] = None,
    max_items: int = 3,
    validate: bool = True,
    debug: bool = False
) -> Dict[str, Any]:
    """
    One API call: analyze the text AND generate the single best game.

    Args:
        text: Source text (truncated to 2000 chars for the prompt).
        enabled_games: Restrict the choice to these types (default: all active).
        max_items: Maximum number of items to request.
        validate: Drop items whose original_quote is not found in the text.
        debug: Print a progress banner.

    Returns: {success, game_type, reason, items, errors, token_usage, llm}
    """
    tracker = TokenUsage()
    available = enabled_games or self.registry.get_game_types()
    logger.info(f"Starting run_single for available games: {available}")
    # Build a description + example block for every candidate game.
    games_info = []
    for gt in available:
        game = get_game(gt)
        if game:
            example = json.dumps(game.examples[0].get('output', {}), ensure_ascii=False, indent=2) if game.examples else "{}"
            games_info.append(f"### {gt}\n{game.description}\nExample output:\n{example}")
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an educational game generator.
1. ANALYZE text and CHOOSE the BEST game type
2. GENERATE items for that game
RULES:
- KEEP original language
- original_quote = EXACT copy from source
- ALL content from source only"""),
        ("human", """GAMES:
{games_info}
TEXT:
{text}
Choose BEST game from: {types}
Generate max {max_items} items.
Return JSON:
{{"game_type": "chosen", "reason": "why", "items": [...]}}""")
    ])
    content = {"games_info": "\n\n".join(games_info), "text": text[:2000], "types": ", ".join(available), "max_items": max_items}
    if debug:
        print(f"\n{'='*50}\n🎯 RUN SINGLE\n{'='*50}")
    try:
        resp = (prompt | self.llm).invoke(content)
        tracker.add(self._get_usage(resp))
        data = self._parse_json(resp.content)
        game_type = data.get("game_type")
        items = self._post_process(data.get("items", []), game_type)
        if validate and items:
            items = [i for i in items if self.validator.validate_quote(i.get("original_quote", ""), text).is_valid]
        return {
            "success": len(items) > 0,
            "game_type": game_type,
            "reason": data.get("reason", ""),
            "items": items,
            "errors": [],
            "token_usage": tracker.to_dict(),
            "llm": f"{self.llm_config.provider}/{self.llm_config.model_name}"
        }
    except Exception as e:
        # FIX: the failure dict previously omitted the "reason" key, so callers
        # reading result["reason"] crashed on the error path. Keep the shape
        # identical to the success dict.
        return {"success": False, "game_type": None, "reason": "", "items": [], "errors": [str(e)], "token_usage": tracker.to_dict(), "llm": f"{self.llm_config.provider}/{self.llm_config.model_name}"}
# ============== 3. GENERATE (1 game cụ thể, không analyze) ==============
def generate(
    self,
    game_type: str,
    text: str,
    max_items: int = 3,
    validate: bool = True,
    debug: bool = False
) -> Dict[str, Any]:
    """Generate content for one specific game type (no analysis step).

    Returns: {success, game_type, data, errors, token_usage, llm},
    plus a "format_error" key when the text is incompatible with the game.
    NOTE(review): max_items is accepted but never used in this method —
    confirm whether the prompt should cap the item count.
    """
    tracker = TokenUsage()
    logger.info(f"Generating single game content: {game_type}")
    game = get_game(game_type)
    if not game:
        return {"success": False, "game_type": game_type, "items": [], "errors": [f"Game not found: {game_type}"], "token_usage": {}, "llm": ""}
    # Build Format Rules Section (asks the LLM to reject incompatible input)
    format_rules_section = ""
    if game.input_format_rules:
        rules_str = "\n".join(f"- {r}" for r in game.input_format_rules)
        format_rules_section = f"""
CRITICAL: FIRST, VALIDATE THE INPUT TEXT.
Format Rules:
{rules_str}
If the text is completely UNSUITABLE for this game type, you MUST output strictly this JSON and nothing else:
{{{{ "format_error": "Input text incompatible with game requirements." }}}}
"""
    prompt = ChatPromptTemplate.from_messages([
        ("system", f"""{game.generated_system_prompt}
{format_rules_section}"""),
        ("human", """TEXT TO PROCESS:
{text}
Generate content in JSON format:
{format_instructions}""")
    ])
    if debug:
        print(f"\n{'='*50}\n🎮 GENERATE: {game_type}\n{'='*50}")
    try:
        resp = (prompt | self.llm).invoke({
            "text": text,
            "format_instructions": game.format_instructions
        })
        tracker.add(self._get_usage(resp))
        # 1. Parse as raw JSON first to check for format_error
        raw_data = None
        try:
            raw_data = self._parse_json(resp.content)
        # NOTE(review): bare except deliberately swallows parse failures —
        # the structured parser below gets a second chance at the content.
        except:
            pass
        # 2. Check if it's a format_error immediately
        if raw_data and raw_data.get("format_error"):
            return {
                "success": False,
                "game_type": game_type,
                "data": None,
                "format_error": raw_data["format_error"],
                "errors": [raw_data["format_error"]],
                "token_usage": tracker.to_dict(),
                "llm": f"{self.llm_config.provider}/{self.llm_config.model_name}"
            }
        parsed_data = raw_data
        # 3. Try output_parser for structured validation if present
        if game.output_parser:
            try:
                parsed = game.output_parser.parse(resp.content)
                parsed_data = parsed.model_dump()
            except Exception as pe:
                if debug: print(f"⚠️ output_parser failed: {pe}")
                # Keep raw_data if parser fails but we have JSON
        # Check format error again, in case the structured parse surfaced it
        if parsed_data and parsed_data.get("format_error"):
            return {
                "success": False,
                "game_type": game_type,
                "data": None,
                "format_error": parsed_data["format_error"],
                "errors": [parsed_data["format_error"]],
                "token_usage": tracker.to_dict(),
                "llm": f"{self.llm_config.provider}/{self.llm_config.model_name}"
            }
        # Post-process: stamp ids/game_type, then optionally validate quotes
        items = parsed_data.get("items", []) if parsed_data else []
        items = self._post_process(items, game_type)
        if validate and items:
            items = [i for i in items if self.validator.validate_quote(i.get("original_quote", ""), text).is_valid]
        if not items:
            return {
                "success": False,
                "game_type": game_type,
                "data": None,
                "format_error": "No items extracted",
                "errors": [],
                "token_usage": tracker.to_dict(),
                "llm": f"{self.llm_config.provider}/{self.llm_config.model_name}"
            }
        if parsed_data:
            parsed_data["items"] = items
        return {
            "success": True,
            "game_type": game_type,
            "data": parsed_data,
            "errors": [],
            "token_usage": tracker.to_dict(),
            "llm": f"{self.llm_config.provider}/{self.llm_config.model_name}"
        }
    except Exception as e:
        return {"success": False, "game_type": game_type, "data": None, "errors": [str(e)], "token_usage": tracker.to_dict(), "llm": f"{self.llm_config.provider}/{self.llm_config.model_name}"}
# ============== PRIVATE METHODS ==============
def _analyze(self, text: str, available: List[str], min_score: int, tracker: TokenUsage, debug: bool) -> tuple:
    """Analyze the text to score/suggest games — with retry.

    Returns (selected_game_types, sorted_scores, metadata, errors).
    On total failure, falls back to returning every available game.
    """
    # Pull per-game requirements from the game configs
    context = get_analyzer_context()
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a game type analyzer. Score each game 0-100 based on how well the text matches the game requirements.
GAME REQUIREMENTS:
{context}
SCORING:
- 70-100: Text matches game requirements well
- 40-69: Partial match
- 0-39: Does not match requirements
IMPORTANT: You MUST use the exact game type name (e.g. 'quiz', 'sequence') in the "type" field.
Return valid JSON with scores AND metadata about the content:
{{
"scores": [
{{
"type": "NAME_OF_GAME_TYPE",
"score": 80,
"reason": "..."
}}
],
"metadata": {{
"title": "Title from source or create short title",
"description": "One sentence summary",
"grade": 1-5,
"difficulty": 1-5
}}
}}"""),
        ("human", """TEXT TO ANALYZE:
{text}
Analyze for games: {types}
Return JSON:""")
    ])
    max_retries = 2
    for attempt in range(max_retries):
        try:
            resp = (prompt | self.llm).invoke({
                "context": context,
                "text": text[:800],
                "types": ", ".join(available)
            })
            tracker.add(self._get_usage(resp))
            if debug:
                print(f"📝 Analyzer raw: {resp.content[:300]}")
            # Parse JSON with fallback
            content = resp.content.strip()
            if not content:
                if debug:
                    print(f"⚠️ Empty response, retry {attempt + 1}")
                continue
            data = self._parse_json(content)
            # Keep only known game types that clear the score threshold.
            scores = [s for s in data.get("scores", []) if s.get("type") in available and s.get("score", 0) >= min_score]
            scores.sort(key=lambda x: x.get("score", 0), reverse=True)
            # Extract metadata from response
            metadata = data.get("metadata", {})
            if debug:
                print(f"🔍 Scores: {scores}")
                print(f"📋 Metadata: {metadata}")
            return [s["type"] for s in scores], scores, metadata, []
        except Exception as e:
            if debug:
                print(f"⚠️ Analyze attempt {attempt + 1} failed: {e}")
            if attempt == max_retries - 1:
                # Final fallback: return all available games with no scores
                return available, [], {}, [f"Analyze error: {e}"]
    return available, [], {}, ["Analyze failed after retries"]
def _generate_multi(self, games: List[str], text: str, max_items: int, tracker: TokenUsage, debug: bool) -> tuple:
    """Generate content for several games in a single LLM call.

    Returns ({game_type: {"items": [...], "metadata": ...}}, errors).
    NOTE(review): max_items is not forwarded into the multi-game prompt —
    confirm whether the item count should be capped there too.
    """
    if len(games) == 1:
        # Single game: delegate to generate() (validation deferred to caller).
        result = self.generate(games[0], text, max_items, validate=False, debug=debug)
        tracker.add(result.get("token_usage", {}))
        # Fix: generate returns {data: {items: [...]}} not {items: [...]}
        data = result.get("data") or {}
        items = data.get("items", []) if isinstance(data, dict) else []
        return {games[0]: {"items": items, "metadata": data.get("metadata")}}, result.get("errors", [])
    # Multi-game: Build schema info for each game
    games_schema = []
    for gt in games:
        game = get_game(gt)
        if game:
            games_schema.append(f"""### {gt.upper()}
{game.generated_system_prompt}
REQUIRED OUTPUT FORMAT:
{game.format_instructions}""")
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a multi-game content generator.
Generate items for EACH game type following their EXACT schema.
IMPORTANT: Include ALL required fields for each item (image_description, image_keywords, etc.)
RULES: Keep original language, use exact quotes from text."""),
        ("human", """GAMES AND THEIR SCHEMAS:
{schemas}
SOURCE TEXT:
{text}
Generate items for: {types}
Return valid JSON: {{{format}}}""")
    ])
    # Expected top-level JSON shape: one key per game type.
    fmt = ", ".join([f'"{gt}": {{"items": [...], "metadata": {{...}}}}' for gt in games])
    try:
        resp = (prompt | self.llm).invoke({
            "schemas": "\n\n".join(games_schema),
            "text": text,
            "types": ", ".join(games),
            "format": fmt
        })
        tracker.add(self._get_usage(resp))
        data = self._parse_json(resp.content)
        results = {}
        errors = []
        for gt in games:
            game_data = data.get(gt, {}) if isinstance(data.get(gt), dict) else {}
            items = game_data.get("items", [])
            items = self._post_process(items, gt)
            # Unified structure: {items: [...], metadata: {...}}
            results[gt] = {"items": items, "metadata": game_data.get("metadata")}
            if not items:
                errors.append(f"No items for {gt}")
        return results, errors
    except Exception as e:
        return {gt: {"items": [], "metadata": None} for gt in games}, [f"Generate error: {e}"]
def _validate(self, results: Dict[str, dict], text: str) -> Dict[str, dict]:
    """Filter every game's items down to those whose quote verifies against the text."""
    checked: Dict[str, dict] = {}
    for game_key, payload in results.items():
        if isinstance(payload, dict):
            candidates = payload.get("items", [])
            meta = payload.get("metadata")
        else:
            candidates = []
            meta = None
        kept = []
        for entry in candidates:
            outcome = self.validator.validate_quote(entry.get("original_quote", ""), text)
            if outcome.is_valid:
                kept.append(entry)
        checked[game_key] = {"items": kept, "metadata": meta}
    return checked
def _post_process(self, items: List, game_type: str) -> List[Dict]:
ms = int(time.time() * 1000)
result = []
for i, item in enumerate(items):
d = item if isinstance(item, dict) else (item.model_dump() if hasattr(item, 'model_dump') else {})
d["id"] = f"{game_type[:2].upper()}-{ms}-{i}"
d["game_type"] = game_type
result.append(d)
return result
def _parse_json(self, content: str) -> Dict:
if "```" in content:
content = content.split("```")[1].replace("json", "").strip()
return json.loads(content)
def _get_usage(self, resp) -> Dict:
if hasattr(resp, 'response_metadata'):
meta = resp.response_metadata
return meta.get('usage', meta.get('usage_metadata', meta.get('token_usage', {})))
return getattr(resp, 'usage_metadata', {})
def _result(self, success: bool, games: List, results: Dict, errors: List, tracker: TokenUsage, scores: List = None, metadata: Dict = None) -> Dict:
    """Assemble the standard response envelope returned by run_multi()."""
    envelope = {
        "success": success,
        "games": games,
        "game_scores": scores or [],
        "metadata": metadata or {},
        "results": results,
        "errors": errors,
    }
    envelope["token_usage"] = tracker.to_dict()
    envelope["llm"] = f"{self.llm_config.provider}/{self.llm_config.model_name}"
    return envelope

220
src/game_registry.py Normal file
View File

@@ -0,0 +1,220 @@
"""
game_registry.py - Tự động load games từ thư mục games/
Hệ thống sẽ:
1. Scan thư mục games/
2. Load mọi file .py (trừ _template.py và __init__.py)
3. Chỉ load games có active: True
4. Đăng ký tự động vào registry
THÊM GAME MỚI = TẠO FILE TRONG games/
BẬT/TẮT GAME = SỬA active: True/False trong file game
"""
import importlib.util
from pathlib import Path
from typing import Dict, List, Any, Optional
from src.games.base import GameType, create_game_type
class GameRegistry:
    """
    Registry that auto-loads games from the games/ directory.
    Only games with active: True are served by the lookup methods.
    Supports lookup by:
    - game_type (string): "quiz", "sequence"
    - type_id (int): 1, 2
    """
    # Singleton state; the maps are re-bound per instance in __new__.
    _instance: Optional["GameRegistry"] = None
    _all_games: Dict[str, GameType] = {}  # Keyed by game_type
    _id_map: Dict[int, str] = {}  # type_id -> game_type
    _loaded: bool = False

    def __new__(cls):
        # Classic singleton: create once, always hand back the same object.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._all_games = {}
            cls._instance._id_map = {}
        return cls._instance

    def __init__(self):
        # Load from disk only on first construction; re-init of the singleton is a no-op.
        if not self._loaded:
            self._load_all_games()
            self._loaded = True

    def _load_all_games(self):
        """Scan games/ and load every game definition found there."""
        games_dir = Path(__file__).parent / "games"
        if not games_dir.exists():
            print(f"⚠️ Games directory not found: {games_dir}")
            return
        for file_path in games_dir.glob("*.py"):
            # Skip __init__.py, _template.py and base.py
            if file_path.name.startswith("_") or file_path.name == "base.py":
                continue
            try:
                game_def = self._load_game_from_file(file_path)
                if game_def:
                    self._all_games[game_def.game_type] = game_def
                    if game_def.type_id > 0:
                        self._id_map[game_def.type_id] = game_def.game_type
                    status = "" if game_def.active else "⏸️"
                    print(f"{status} Loaded: {game_def.game_type} (id={game_def.type_id}, active={game_def.active})")
            except Exception as e:
                # A broken game file must not take down the whole registry.
                print(f"❌ Error loading {file_path.name}: {e}")

    def _load_game_from_file(self, file_path: Path) -> Optional[GameType]:
        """Load one game definition from a single file.

        NOTE(review): the module is executed but never registered in
        sys.modules — fine for plain config modules, but confirm no game
        module relies on importing itself by name.
        """
        module_name = f"games.{file_path.stem}"
        spec = importlib.util.spec_from_file_location(module_name, file_path)
        if spec is None or spec.loader is None:
            return None
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        config = getattr(module, "GAME_CONFIG", None)
        examples = getattr(module, "EXAMPLES", [])
        if config is None:
            return None
        # Inject examples if not in config
        if examples and "examples" not in config:
            config["examples"] = examples
        return create_game_type(config)

    def reload(self):
        """Reload every game from disk (call after adding/editing a game file)."""
        self._all_games.clear()
        self._id_map.clear()
        self._loaded = False
        self._load_all_games()
        self._loaded = True

    # ============== PUBLIC API ==============
    def get_game(self, game_type: str) -> Optional[GameType]:
        """Get a game by game_type (ACTIVE games only)."""
        game = self._all_games.get(game_type)
        return game if game and game.active else None

    def get_game_by_id(self, type_id: int) -> Optional[GameType]:
        """Get a game by type_id (ACTIVE games only)."""
        game_type = self._id_map.get(type_id)
        if game_type:
            return self.get_game(game_type)
        return None

    def get_game_type_by_id(self, type_id: int) -> Optional[str]:
        """Convert type_id -> game_type (None when unknown)."""
        return self._id_map.get(type_id)

    def get_id_by_game_type(self, game_type: str) -> int:
        """Convert game_type -> type_id (0 when unknown)."""
        game = self._all_games.get(game_type)
        return game.type_id if game else 0

    def get_all_games(self) -> Dict[str, GameType]:
        """All ACTIVE games."""
        return {k: v for k, v in self._all_games.items() if v.active}

    def get_all_games_including_inactive(self) -> Dict[str, GameType]:
        """All games, including inactive ones."""
        return self._all_games.copy()

    def get_game_types(self) -> List[str]:
        """List of ACTIVE game type keys."""
        return [k for k, v in self._all_games.items() if v.active]

    def get_type_ids(self) -> List[int]:
        """List of ACTIVE numeric type_ids (ids <= 0 are excluded)."""
        return [v.type_id for v in self._all_games.values() if v.active and v.type_id > 0]

    def get_analyzer_context(self) -> str:
        """Build the analyzer prompt context (from ACTIVE games only)."""
        context_parts = []
        for game_type, game in self._all_games.items():
            if not game.active:
                continue
            hints = game.analyzer_rules  # New field name
            if hints:
                hints_text = "\n - ".join(hints)
                context_parts.append(
                    f"**{game.display_name}** (id={game.type_id}):\n"
                    f" Description: {game.description}\n"
                    f" Suitable when:\n - {hints_text}"
                )
        return "\n\n".join(context_parts)

    def is_active(self, game_type: str) -> bool:
        """Whether the given game type exists and is active."""
        game = self._all_games.get(game_type)
        return game.active if game else False
# ============== GLOBAL FUNCTIONS ==============
# Module-level cache of the singleton instance (GameRegistry itself is also a
# singleton, so this just avoids repeated construction calls).
_registry: Optional[GameRegistry] = None

def get_registry() -> GameRegistry:
    """Return the shared GameRegistry, creating it on first use."""
    global _registry
    if _registry is None:
        _registry = GameRegistry()
    return _registry
def reload_games():
    """Reload all games (call after adding or editing a game file)."""
    registry = get_registry()
    registry.reload()
def get_game(game_type: str) -> Optional[GameType]:
    """Return the ACTIVE game definition for game_type, or None."""
    registry = get_registry()
    return registry.get_game(game_type)
def get_active_game_types() -> List[str]:
    """List the game_type keys of all ACTIVE games."""
    registry = get_registry()
    return registry.get_game_types()
def get_analyzer_context() -> str:
    """Build the analyzer prompt context from all ACTIVE games."""
    registry = get_registry()
    return registry.get_analyzer_context()
def list_all_games() -> None:
    """Print every game (active and inactive) with its status."""
    registry = get_registry()
    print("\n📋 DANH SÁCH GAMES:")
    print("-" * 50)
    catalogue = registry.get_all_games_including_inactive()
    for game_type, game in catalogue.items():
        status = "✅ ACTIVE" if game.active else "⏸️ INACTIVE"
        print(f" [{game.type_id}] {game.display_name} ({game_type}): {status}")
    print("-" * 50)
def get_game_by_id(type_id: int) -> Optional[GameType]:
    """Return the ACTIVE game with this numeric id, or None."""
    registry = get_registry()
    return registry.get_game_by_id(type_id)
def get_active_type_ids() -> List[int]:
    """List the numeric type_ids of all ACTIVE games."""
    registry = get_registry()
    return registry.get_type_ids()
def id_to_type(type_id: int) -> Optional[str]:
    """Convert a numeric type_id into its game_type string (None if unknown)."""
    registry = get_registry()
    return registry.get_game_type_by_id(type_id)
def type_to_id(game_type: str) -> int:
    """Convert a game_type string into its numeric type_id (0 if unknown)."""
    registry = get_registry()
    return registry.get_id_by_game_type(game_type)

9
src/games/__init__.py Normal file
View File

@@ -0,0 +1,9 @@
"""
games/ - Game type definitions
Mỗi game là 1 file với GAME_CONFIG dict.
Thêm game mới = thêm file mới.
"""
from .base import GameType, create_game_type
__all__ = ["GameType", "create_game_type"]

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

91
src/games/_template.py Normal file
View File

@@ -0,0 +1,91 @@
"""
games/_template.py - TEMPLATE CHO GAME MỚI
THÊM GAME MỚI CHỈ CẦN:
1. Copy file này
2. Rename thành <game_type>.py (ví dụ: matching.py)
3. Sửa nội dung bên trong
4. DONE! Hệ thống tự động nhận diện.
Không cần sửa bất kỳ file nào khác!
"""
from typing import List, Optional
from pydantic import BaseModel, Field
# ============== 1. SCHEMA ==============
# Defines the structure of one item in the game.
# REQUIRED fields: original_quote and explanation.
class YourGameItem(BaseModel):
    """Schema for one item of the game."""
    # Required fields (anti-hallucination guard)
    original_quote: str = Field(
        description="Trích dẫn NGUYÊN VĂN từ văn bản gốc"
    )
    explanation: str = Field(description="Giải thích")
    # Add the game's own fields here, e.g.:
    # question: str = Field(description="Câu hỏi")
    # answer: str = Field(description="Đáp án")
# ============== 2. CONFIG ==============
# Configuration for the game.
GAME_CONFIG = {
    # Unique key for the game (used in the API)
    "game_type": "your_game",
    # Display name
    "display_name": "Tên Game",
    # Short description
    "description": "Mô tả game của bạn",
    # Maximum number of items
    "max_items": 5,
    # Points to the schema class
    "schema": YourGameItem,
    # Prompt for the LLM
    "system_prompt": """Bạn là chuyên gia tạo [tên game].
NHIỆM VỤ: [Mô tả nhiệm vụ]
QUY TẮC:
1. original_quote PHẢI là trích dẫn NGUYÊN VĂN
2. [Quy tắc khác]
3. [Quy tắc khác]""",
}
# ============== 3. EXAMPLES ==============
# Sample input/output pairs used to:
# - teach the Analyzer when to suggest this game
# - serve as few-shot examples for the Generator
EXAMPLES = [
    {
        # Sample input text
        "input": "Văn bản mẫu ở đây...",
        # Expected output
        "output": {
            "items": [
                {
                    "original_quote": "Trích dẫn từ văn bản",
                    "explanation": "Giải thích",
                    # Other schema fields...
                }
            ]
        },
        # The Analyzer learns from this field
        "why_suitable": "Giải thích tại sao văn bản này phù hợp với game này"
    },
    # Add 1-2 more examples...
]

85
src/games/base.py Normal file
View File

@@ -0,0 +1,85 @@
"""
games/base.py - Base Game Type Definition
"""
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Type
from pydantic import BaseModel
from langchain_core.output_parsers import PydanticOutputParser
@dataclass
class GameType:
    """
    Canonical structure every Game must follow.
    The Core relies on this contract to process games automatically.
    """
    # --- REQUIRED FIELDS (No default value) ---
    type_id: int
    game_type: str  # e.g. "quiz"
    display_name: str  # e.g. "Multiple Choice Quiz"
    description: str  # e.g. "Create questions from text"
    schema: Type[BaseModel]  # Schema for one item (e.g. QuizItem)
    output_schema: Type[BaseModel]  # Schema for the full output (e.g. QuizOutput)
    generation_rules: List[str]  # Rules used to generate content
    analyzer_rules: List[str]  # Rules the analyzer uses to recognize this game
    # --- OPTIONAL FIELDS (Has default value) ---
    input_format_rules: List[str] = field(default_factory=list)  # Rules validating input format (Direct Mode)
    active: bool = True
    max_items: int = 10
    examples: List[Dict[str, Any]] = field(default_factory=list)
    output_parser: Optional[PydanticOutputParser] = None

    def __post_init__(self):
        # Build a parser from the output schema when none was supplied explicitly.
        if self.output_parser is None and self.output_schema:
            self.output_parser = PydanticOutputParser(pydantic_object=self.output_schema)

    @property
    def format_instructions(self) -> str:
        """JSON format instructions taken from the output parser ("" if no parser)."""
        if self.output_parser:
            return self.output_parser.get_format_instructions()
        return ""

    @property
    def generated_system_prompt(self) -> str:
        """System prompt assembled from the description and generation rules."""
        rules_txt = "\n".join([f"- {r}" for r in self.generation_rules])
        return f"""Game: {self.display_name}
Description: {self.description}
GENERATION RULES:
{rules_txt}
Always ensure output follows the JSON schema exactly."""
def create_game_type(config: Dict[str, Any]) -> GameType:
    """Factory: build a GameType from a plain config dict, with legacy-key support."""
    # Legacy fallback: an old-style "system_prompt" becomes the single generation rule.
    generation_rules = config.get("generation_rules", [])
    if not generation_rules and "system_prompt" in config:
        generation_rules = [config["system_prompt"]]
    # Legacy fallback: "analyzer_hints" was the old name for "analyzer_rules".
    analyzer_rules = config.get("analyzer_rules", []) or config.get("analyzer_hints", [])
    return GameType(
        type_id=config.get("type_id", 0),
        game_type=config["game_type"],
        display_name=config["display_name"],
        description=config["description"],
        input_format_rules=config.get("input_format_rules", []),
        active=config.get("active", True),
        max_items=config.get("max_items", 10),
        schema=config["schema"],
        output_schema=config["output_schema"],
        generation_rules=generation_rules,
        analyzer_rules=analyzer_rules,
        examples=config.get("examples", []),
        output_parser=config.get("output_parser")
    )

139
src/games/quiz.py Normal file
View File

@@ -0,0 +1,139 @@
"""
games/quiz.py - Quiz Game - Multiple choice questions
"""
from typing import List, Literal
import re
from pydantic import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser
# ============== SCHEMA ==============
class QuizItem(BaseModel):
    """One multiple-choice question extracted from the source text."""
    question: str = Field(description="The question based on source content")
    answers: str = Field(description="The correct answer")
    options: List[str] = Field(description="List of options including correct answer")
    original_quote: str = Field(description="EXACT quote from source text")
    image_description: str = Field(default="", description="Visual description for the question")
    # NOTE(review): default=[] relies on pydantic copying field defaults per
    # instance — confirm, and do not mimic this in plain Python classes.
    image_keywords: List[str] = Field(default=[], description="Keywords for image search")
    image_is_complex: bool = Field(default=False, description="True if image needs precise quantities, humans, or multiple detailed objects")
class QuizMetadata(BaseModel):
    """Metadata describing and grading the quiz content."""
    title: str = Field(
        description="Title for this content. Prefer title from source document if available and suitable, otherwise create a short descriptive title."
    )
    description: str = Field(
        description="Short description summarizing the content/topic of the quiz."
    )
    grade: int = Field(
        description="Estimated grade level 1-5 (1=easy/young, 5=advanced/older). Judge by vocabulary, concepts, required knowledge."
    )
    type: Literal["quiz"] = Field(default="quiz", description="Game type (always 'quiz')")
    difficulty: int = Field(
        description="Difficulty 1-5 for that grade (1=very easy, 5=very hard). Judge by question complexity, number of options, abstract concepts."
    )
class QuizOutput(BaseModel):
    """Top-level output wrapper: the generated quiz items plus their metadata."""
    items: List[QuizItem] = Field(description="List of quiz items generated from source text")
    metadata: QuizMetadata = Field(description="Metadata about the quiz content")
# Parser that turns raw LLM text into a validated QuizOutput instance.
output_parser = PydanticOutputParser(pydantic_object=QuizOutput)
# ============== CONFIG ==============
GAME_CONFIG = {
    "game_type": "quiz",
    "display_name": "Quiz",
    "description": "Multiple choice questions",
    "type_id": 1,
    "active": True,
    "max_items": 10,
    "schema": QuizItem,
    "output_schema": QuizOutput,
    "output_parser": output_parser,
    # Rules used to validate the input text (Direct Mode)
    "input_format_rules": [
        "Text should contain facts or questions suitable for a quiz.",
        "Prefer extracting existing multiple choice questions if available.",
        "Text MUST contain questions with multiple choice options",
    ],
    # 1. Recognition Rules (for Analyzer)
    "analyzer_rules": [
        "Text MUST contain questions with multiple choice options",
        "NOT suitable if text is just a list of words with no questions",
    ],
    # 2. Content generation rules (for the Generator)
    "generation_rules": [
        "KEEP ORIGINAL LANGUAGE - Do NOT translate",
        "original_quote = EXACT quote from source text (full question block)",
        "ALL content must come from source only - do NOT invent",
        "REMOVE unnecessary numbering: 'Question 1:', '(1)', '(2)', 'A.', 'B.' from question/options/answers",
        "STRICTLY CLEAN OUTPUT for 'answers': MUST contain ONLY the text content of the correct option.",
        "FORBIDDEN in 'answers': Prefixes like '(1)', '(2)', 'A.', 'B.', '1.' - REMOVE THEM.",
        "IMPORTANT: The 'answers' field MUST EXACTLY MATCH one of the 'options' values text-wise.",
        # VISUAL FIELD COMPULSORY
        "image_description: MUST be a visual description relevant to the question in ENGLISH.",
        "image_keywords: MUST provide 2-3 English keywords for search.",
        "image_is_complex: FALSE for simple/static objects, TRUE for quantities/humans/complex scenes",
        "NEVER leave image fields empty!",
    ],
    # FIX: the old entry `"examples": EXAMPLES if 'EXAMPLES' in globals() else []`
    # always produced [] because EXAMPLES is defined further down this module,
    # and the present-but-empty key also prevented the registry's loader (which
    # injects module-level EXAMPLES only when the key is absent) from supplying
    # the real examples. Omitting the key lets that injection happen.
}
def clean_prefix(text: str) -> str:
    """Remove list prefixes like (1), (A), 1., A., 1), A) from text.

    Generalized over the original: ')'-terminated prefixes ("1)", "a)") are
    now stripped as well, since options frequently appear in that form in
    source documents. Empty/None input is returned unchanged.
    """
    if not text:
        return text
    # Match: parenthesized number/letter, or number/letter followed by '.' or ')',
    # then any spaces; strip the match and surrounding whitespace.
    return re.sub(r'^(\(\d+\)|\([A-Za-z]\)|\d+[.)]|[A-Za-z][.)])\s*', '', text).strip()
def post_process_quiz(items: List[dict]) -> List[dict]:
    """Strip numbering prefixes from each item's answer and options, in place."""
    for entry in items:
        # Clean the correct-answer text
        answer = entry.get("answers")
        if answer:
            entry["answers"] = clean_prefix(answer)
        # Clean every option the same way
        options = entry.get("options")
        if options and isinstance(options, list):
            entry["options"] = [clean_prefix(choice) for choice in options]
    return items
# Register the cleanup hook so the pipeline can invoke it after generation.
GAME_CONFIG["post_process_handler"] = post_process_quiz
# ============== EXAMPLES ==============
# Few-shot input/output samples for this game.
# NOTE(review): GAME_CONFIG above sets an "examples" key before this list is
# defined — verify these examples actually reach the registry/generator.
EXAMPLES = [
    {
        "input": "The Sun is a star at the center of the Solar System.",
        "output": {
            "items": [{
                "question": "Where is the Sun located?",
                "answers": "At the center of the Solar System",
                "options": ["At the center of the Solar System", "At the edge of the Solar System", "Near the Moon", "Outside the universe"],
                "original_quote": "The Sun is a star at the center of the Solar System.",
                "image_description": "The sun in the middle of planets",
                "image_keywords": ["sun", "planets"],
                "image_is_complex": False
            }]
        },
        "why_suitable": "Has clear facts"
    }
]

173
src/games/sequence.py Normal file
View File

@@ -0,0 +1,173 @@
"""
games/sequence.py - Arrange Sequence Game (Sentences OR Words)
type_id = 2
LLM tự quyết định dựa vào ngữ nghĩa:
- "good morning", "apple", "happy" → WORD
- "Hi, I'm Lisa", "The sun rises" → SENTENCE
Output trả về đúng trường: word hoặc sentence
"""
from typing import List, Literal, Optional
from pydantic import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser
# ============== SCHEMA ==============
class SequenceItem(BaseModel):
    """One sequence item - the LLM fills word OR sentence, never both."""
    # Exactly one of `word` / `sentence` is populated, matching metadata.sub_type.
    word: Optional[str] = Field(default=None, description="Fill this if item is a WORD/PHRASE (not complete sentence)")
    sentence: Optional[str] = Field(default=None, description="Fill this if item is a COMPLETE SENTENCE")
    # Anti-hallucination anchor; checked by QuoteValidator against the source text.
    original_quote: str = Field(description="EXACT quote from source text")
    image_description: str = Field(default="", description="Visual description of the content")
    # default_factory=list is the idiomatic pydantic form for a mutable default
    # (avoids even the appearance of a shared list object across instances).
    image_keywords: List[str] = Field(default_factory=list, description="Keywords for image search")
    image_is_complex: bool = Field(default=False, description="True if image needs precise quantities, humans, or complex details")
class SequenceMetadata(BaseModel):
    """Metadata grading/describing the generated sequence content."""
    title: str = Field(
        description="Title for this content. Prefer title from source document if available."
    )
    description: str = Field(
        description="Short description summarizing the content/topic."
    )
    # Target audience level, 1-5 (not validated here - the LLM is trusted).
    grade: int = Field(
        description="Estimated grade level 1-5 (1=easy/young, 5=advanced/older)."
    )
    # Fixed discriminator so downstream consumers can dispatch on game type.
    type: Literal["sequence"] = Field(default="sequence", description="Game type")
    # All items in the output are expected to match this single sub_type.
    sub_type: Literal["sentence", "word"] = Field(
        description="LLM decides: 'word' for words/phrases, 'sentence' for complete sentences"
    )
    difficulty: int = Field(
        description="Difficulty 1-5 for that grade."
    )
class SequenceOutput(BaseModel):
    """Top-level LLM output: the ordered items plus content metadata."""
    items: List[SequenceItem] = Field(description="List of sequence items")
    metadata: SequenceMetadata = Field(description="Metadata about the content")
# Parser that turns the raw LLM response into a validated SequenceOutput.
output_parser = PydanticOutputParser(pydantic_object=SequenceOutput)
# ============== CONFIG ==============
GAME_CONFIG = {
    "game_type": "sequence",
    "display_name": "Arrange Sequence",
    "description": "Arrange sentences or words in order",
    "type_id": 2,
    "active": True,
    "max_items": 10,
    "schema": SequenceItem,
    "output_schema": SequenceOutput,
    "output_parser": output_parser,
    "input_format_rules": [
        "Text MUST be a list of items (words, phrases, sentences) to be ordered.",
        "Do NOT generate sequence from multiple choice questions (A/B/C/D).",
        "Do NOT generate sequence if the text is a quiz or test format.",
    ],
    # 1. Recognition rules (for the Analyzer)
    "analyzer_rules": [
        "Text is a list of words, phrases, or sentences suitable for ordering",
        "Items are separated by commas, semicolons, or newlines",
        "Example: 'apple, banana, orange' or 'Sentence 1; Sentence 2'",
        "NO questions required - just a list of items",
        "Text is NOT a long essay or complex dialogue",
    ],
    # 2. Content creation rules (for the Generator)
    "generation_rules": [
        "KEEP ORIGINAL LANGUAGE - Do NOT translate",
        "Analyze text semantically to extract meaningful items",
        "For each item, decide type: WORD/PHRASE or SENTENCE",
        "- If item is a WORD/PHRASE (label, noun, greeting) -> Fill 'word' field",
        "- If item is a COMPLETE SENTENCE (subject+verb) -> Fill 'sentence' field",
        "NEVER fill both fields for the same item",
        "Set metadata.sub_type = 'word' or 'sentence' (all items should match sub_type)",
        "Clean up OCR noise, numbering (e.g. '1. Apple' -> 'Apple')",
        # CONSISTENCY RULES
        "CRITICAL: All extracted items MUST be of the SAME type.",
        "Choose ONE type for the whole list: either ALL 'word' OR ALL 'sentence'.",
        "If input has mixed types, pick the MAJORITY type and ignore the others.",
        # VISUAL FIELD COMPULSORY
        "image_description: MUST be a visual description of the item in ENGLISH. Example: 'A red apple', 'Two people shaking hands'",
        "image_keywords: MUST provide 2-3 English keywords for search. Example: ['apple', 'fruit', 'red']",
    ],
    # EXAMPLES is defined later in this module, so the original build-time
    # guard ("EXAMPLES if 'EXAMPLES' in globals() else []") always produced [].
    "examples": []
}
# ============== EXAMPLES ==============
EXAMPLES = [
    {
        "input": "apple; banana;\norange; grape;\ncat; dog;",
        "output": {
            "items": [
                {"word": "apple", "sentence": None, "original_quote": "apple", "image_description": "A red apple", "image_keywords": ["apple"], "image_is_complex": False},
                {"word": "banana", "sentence": None, "original_quote": "banana", "image_description": "A yellow banana", "image_keywords": ["banana"], "image_is_complex": False},
                {"word": "orange", "sentence": None, "original_quote": "orange", "image_description": "An orange fruit", "image_keywords": ["orange"], "image_is_complex": False},
                {"word": "grape", "sentence": None, "original_quote": "grape", "image_description": "Purple grapes", "image_keywords": ["grape"], "image_is_complex": False},
                {"word": "cat", "sentence": None, "original_quote": "cat", "image_description": "A cat", "image_keywords": ["cat"], "image_is_complex": False},
                {"word": "dog", "sentence": None, "original_quote": "dog", "image_description": "A dog", "image_keywords": ["dog"], "image_is_complex": False}
            ],
            "metadata": {
                "title": "Animals and Fruits",
                "description": "Common animals and fruits",
                "grade": 1,
                "type": "sequence",
                "sub_type": "word",
                "difficulty": 1
            }
        },
        "why": "Items are single words → use 'word' field"
    },
    {
        "input": "Hi, I'm Lisa; Nice to meet you; How are you?",
        "output": {
            "items": [
                {"word": None, "sentence": "Hi, I'm Lisa", "original_quote": "Hi, I'm Lisa", "image_description": "A girl introducing herself", "image_keywords": ["girl", "greeting"], "image_is_complex": True},
                {"word": None, "sentence": "Nice to meet you", "original_quote": "Nice to meet you", "image_description": "Two people shaking hands", "image_keywords": ["handshake", "greeting"], "image_is_complex": True},
                {"word": None, "sentence": "How are you?", "original_quote": "How are you?", "image_description": "Person asking a question", "image_keywords": ["question", "greeting"], "image_is_complex": True}
            ],
            "metadata": {
                "title": "English Greetings",
                "description": "Common greeting sentences",
                "grade": 2,
                "type": "sequence",
                "sub_type": "sentence",
                "difficulty": 2
            }
        },
        "why": "Items are complete sentences → use 'sentence' field"
    },
    {
        "input": "good morning; good afternoon; good evening; good night",
        "output": {
            "items": [
                {"word": "good morning", "sentence": None, "original_quote": "good morning", "image_description": "Morning sunrise", "image_keywords": ["morning", "sun"], "image_is_complex": False},
                {"word": "good afternoon", "sentence": None, "original_quote": "good afternoon", "image_description": "Afternoon sun", "image_keywords": ["afternoon"], "image_is_complex": False},
                {"word": "good evening", "sentence": None, "original_quote": "good evening", "image_description": "Evening sunset", "image_keywords": ["evening", "sunset"], "image_is_complex": False},
                {"word": "good night", "sentence": None, "original_quote": "good night", "image_description": "Night sky with moon", "image_keywords": ["night", "moon"], "image_is_complex": False}
            ],
            "metadata": {
                "title": "Time Greetings",
                "description": "Greetings for different times of day",
                "grade": 1,
                "type": "sequence",
                "sub_type": "word",
                "difficulty": 1
            }
        },
        "why": "These are PHRASES/GREETINGS, not complete sentences → use 'word' field"
    }
]
# FIX: EXAMPLES did not exist when GAME_CONFIG was built (it is defined below
# the config), so the config's "examples" entry was always []. Attach it here.
GAME_CONFIG["examples"] = EXAMPLES

191
src/llm_config.py Normal file
View File

@@ -0,0 +1,191 @@
"""
llm_config.py - Cấu hình LLM linh hoạt
Hỗ trợ:
- Ollama (local)
- Google Gemini
- OpenAI
Sử dụng:
from llm_config import ModelConfig, get_llm
config = ModelConfig(provider="ollama", model_name="qwen2.5:14b")
llm = get_llm(config)
"""
import os
from typing import Optional
from pydantic import BaseModel, Field
from langchain_core.language_models.chat_models import BaseChatModel
class ModelConfig(BaseModel):
    """Configuration for one LLM backend: provider, model, credentials, sampling."""
    provider: str = Field(
        default="gemini",
        description="Provider: ollama, gemini, openai"
    )
    model_name: str = Field(
        default="gemini-2.0-flash-lite",
        description="Tên model"
    )
    # When None, get_llm() falls back to the provider's env variable
    # (GOOGLE_API_KEY / OPENAI_API_KEY).
    api_key: Optional[str] = Field(
        default=None,
        description="API key (nếu None, lấy từ env)"
    )
    temperature: float = Field(
        default=0.1,
        description="Độ sáng tạo (0.0 - 1.0)"
    )
    # Only used by the Ollama provider; None means OLLAMA_BASE_URL env / localhost.
    base_url: Optional[str] = Field(
        default=None,
        description="Base URL cho Ollama"
    )
    class Config:
        # Allow extra keys so instances can be built from loosely-typed dicts.
        extra = "allow"
# ============== DEFAULT CONFIGS ==============
# Named presets; "*_light" variants use a smaller/cheaper model or temperature 0.
DEFAULT_CONFIGS = {
    "ollama": ModelConfig(
        provider="ollama",
        model_name="qwen2.5:14b",
        temperature=0.1,
        base_url=None  # resolved from OLLAMA_BASE_URL env at call time
    ),
    "ollama_light": ModelConfig(
        provider="ollama",
        model_name="qwen2.5:7b",
        temperature=0.0,
        base_url=None  # resolved from OLLAMA_BASE_URL env at call time
    ),
    "gemini": ModelConfig(
        provider="gemini",
        model_name="gemini-2.0-flash-lite",
        temperature=0.1
    ),
    "gemini_light": ModelConfig(
        provider="gemini",
        model_name="gemini-2.0-flash-lite",
        temperature=0.0
    ),
    "openai": ModelConfig(
        provider="openai",
        model_name="gpt-4o-mini",
        temperature=0.1
    ),
    "openai_light": ModelConfig(
        provider="openai",
        model_name="gpt-4o-mini",
        temperature=0.0
    ),
}
def get_default_config(name: str = "gemini") -> ModelConfig:
    """Return a preset ModelConfig by name, falling back to the 'gemini' preset."""
    try:
        return DEFAULT_CONFIGS[name]
    except KeyError:
        return DEFAULT_CONFIGS["gemini"]
# ============== LLM FACTORY ==============
def get_llm(config: ModelConfig) -> BaseChatModel:
    """
    Build a chat-model instance for the configured provider.

    Args:
        config: ModelConfig selecting provider, model name, key, temperature.

    Returns:
        BaseChatModel for the requested provider.

    Raises:
        ValueError: unknown provider, or missing API key for gemini/openai.
    """
    provider = config.provider.lower()

    if provider == "ollama":
        # Imported lazily so the dependency is only required when used.
        from langchain_ollama import ChatOllama
        endpoint = config.base_url or os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
        return ChatOllama(
            model=config.model_name,
            temperature=config.temperature,
            base_url=endpoint
        )

    if provider == "gemini":
        from langchain_google_genai import ChatGoogleGenerativeAI
        key = config.api_key or os.getenv("GOOGLE_API_KEY")
        if not key:
            raise ValueError("GOOGLE_API_KEY required for Gemini. Set via env or config.api_key")
        return ChatGoogleGenerativeAI(
            model=config.model_name,
            temperature=config.temperature,
            google_api_key=key
        )

    if provider == "openai":
        from langchain_openai import ChatOpenAI
        key = config.api_key or os.getenv("OPENAI_API_KEY")
        if not key:
            raise ValueError("OPENAI_API_KEY required for OpenAI. Set via env or config.api_key")
        return ChatOpenAI(
            model=config.model_name,
            temperature=config.temperature,
            api_key=key
        )

    raise ValueError(f"Provider '{provider}' không được hỗ trợ. Chọn: ollama, gemini, openai")
def get_completion_model(config: ModelConfig):
    """
    Return a plain-completion (non-chat) model when the provider offers one.

    Only Ollama exposes a dedicated completion model; every other provider
    falls back to its chat interface via get_llm().
    """
    if config.provider.lower() != "ollama":
        return get_llm(config)
    from langchain_ollama.llms import OllamaLLM
    endpoint = config.base_url or os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
    return OllamaLLM(
        model=config.model_name,
        temperature=config.temperature,
        base_url=endpoint
    )
# ============== HELPER ==============
def create_config(
    provider: str = "gemini",
    model_name: Optional[str] = None,
    api_key: Optional[str] = None,
    temperature: float = 0.1,
    base_url: Optional[str] = None
) -> ModelConfig:
    """
    Build a ModelConfig, picking the provider's default model when
    model_name is not supplied (unknown providers default to Gemini's model).
    """
    if model_name is None:
        model_name = {
            "ollama": "qwen2.5:14b",
            "gemini": "gemini-2.0-flash-lite",
            "openai": "gpt-4o-mini"
        }.get(provider, "gemini-2.0-flash-lite")
    return ModelConfig(
        provider=provider,
        model_name=model_name,
        api_key=api_key,
        temperature=temperature,
        base_url=base_url
    )

37
src/logger.py Normal file
View File

@@ -0,0 +1,37 @@
import logging
import sys
import os
from logging.handlers import RotatingFileHandler
def setup_logger(name: str = "sena_gen"):
    """Create (or return the already-configured) named application logger.

    Always logs to stdout at INFO level; additionally attaches a rotating
    file handler (10 MB x 5 backups) unless LOG_FILE is set to an empty
    string. Default log file: logs/gen_game.log.
    """
    logger = logging.getLogger(name)
    if logger.handlers:
        # Already configured (module re-import / repeated call) - do not
        # attach duplicate handlers.
        return logger
    logger.setLevel(logging.INFO)
    # Shared format for console and file output
    formatter = logging.Formatter(
        '[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] - %(message)s'
    )
    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
    # File handler (optional - set LOG_FILE="" to disable)
    log_file = os.getenv("LOG_FILE", "logs/gen_game.log")
    if log_file:
        log_dir = os.path.dirname(log_file)
        if log_dir:
            # FIX: a bare filename ("app.log") has no directory part and
            # os.makedirs("") raises FileNotFoundError - only create the
            # directory when there is one.
            os.makedirs(log_dir, exist_ok=True)
        file_handler = RotatingFileHandler(
            log_file, maxBytes=10*1024*1024, backupCount=5, encoding='utf-8'
        )
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    return logger
# Module-level singleton; import this instead of calling setup_logger() again.
logger = setup_logger()

204
src/validator.py Normal file
View File

@@ -0,0 +1,204 @@
"""
validator.py - Hallucination Guardrail
Kiểm tra original_quote có thực sự nằm trong văn bản gốc không (Python-based, 0 API calls)
"""
import re
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from difflib import SequenceMatcher
import unicodedata
@dataclass
class ValidationResult:
    """Validation outcome for one generated item."""
    item_index: int  # position of the item in the checked list
    is_valid: bool  # final verdict after all matching strategies
    original_quote: str  # the quote that was checked (never None)
    match_found: bool  # True if exact or fuzzy matching located the quote
    match_score: float  # 1.0 for an exact match, otherwise best fuzzy ratio
    error_message: Optional[str] = None  # set only when invalid


@dataclass
class ValidatedGameOutput:
    """Aggregate result after validating every item of one game output."""
    game_type: str
    valid_items: List[Dict[str, Any]]
    invalid_items: List[Dict[str, Any]]
    validation_results: List[ValidationResult]

    @property
    def all_valid(self) -> bool:
        """True when no item failed validation."""
        return len(self.invalid_items) == 0

    @property
    def validity_rate(self) -> float:
        """Fraction of items that passed; 0.0 when there are no items."""
        total = len(self.valid_items) + len(self.invalid_items)
        return len(self.valid_items) / total if total > 0 else 0.0


class QuoteValidator:
    """
    Checks that each original_quote actually occurs in the source text.

    Strategies, in order: exact normalized-substring match, then fuzzy
    sliding-window matching via difflib.SequenceMatcher.
    Pure Python - does NOT call any API.
    """
    def __init__(
        self,
        fuzzy_threshold: float = 0.85,
        min_quote_length: int = 10,
        normalize_whitespace: bool = True
    ):
        self.fuzzy_threshold = fuzzy_threshold  # minimum similarity for fuzzy acceptance
        self.min_quote_length = min_quote_length  # quotes shorter than this are rejected
        self.normalize_whitespace = normalize_whitespace  # collapse whitespace runs before comparing

    def _normalize_text(self, text: str) -> str:
        """Normalize for comparison: NFC, lowercase, optionally collapsed whitespace."""
        if not text:
            return ""
        text = unicodedata.normalize('NFC', text)
        text = text.lower()
        if self.normalize_whitespace:
            text = re.sub(r'\s+', ' ', text).strip()
        return text

    def _exact_match(self, quote: str, source: str) -> bool:
        """True if the normalized quote is a substring of the normalized source."""
        return self._normalize_text(quote) in self._normalize_text(source)

    def _fuzzy_match(self, quote: str, source: str) -> float:
        """Best SequenceMatcher ratio between the quote and any window of source.

        Windows at 90%, 100% and 110% of the quote length tolerate small
        insertions/deletions in the generated quote.
        """
        norm_quote = self._normalize_text(quote)
        norm_source = self._normalize_text(source)
        if not norm_quote or not norm_source:
            return 0.0
        if len(norm_quote) > len(norm_source):
            # A quote longer than the whole source cannot be contained in it.
            return 0.0
        best_score = 0.0
        quote_len = len(norm_quote)
        window_sizes = [
            quote_len,
            int(quote_len * 0.9),
            int(quote_len * 1.1),
        ]
        for window_size in window_sizes:
            if window_size <= 0 or window_size > len(norm_source):
                continue
            for i in range(len(norm_source) - window_size + 1):
                window = norm_source[i:i + window_size]
                score = SequenceMatcher(None, norm_quote, window).ratio()
                best_score = max(best_score, score)
            if best_score >= self.fuzzy_threshold:
                # Early exit: already good enough to validate.
                return best_score
        return best_score

    def validate_quote(
        self,
        original_quote: str,
        source_text: str,
        item_index: int = 0
    ) -> ValidationResult:
        """Validate one original_quote against source_text."""
        # FIX: whitespace-only quotes used to slip through - they survived the
        # emptiness and length checks, then normalized to "" and `"" in source`
        # is always True, so they counted as exact matches. Reject them here.
        if not original_quote or not original_quote.strip():
            return ValidationResult(
                item_index=item_index,
                is_valid=False,
                original_quote=original_quote or "",
                match_found=False,
                match_score=0.0,
                error_message="original_quote is empty"
            )
        if len(original_quote) < self.min_quote_length:
            return ValidationResult(
                item_index=item_index,
                is_valid=False,
                original_quote=original_quote,
                match_found=False,
                match_score=0.0,
                error_message=f"quote too short (min: {self.min_quote_length})"
            )
        # Strategy 1: Exact match
        if self._exact_match(original_quote, source_text):
            return ValidationResult(
                item_index=item_index,
                is_valid=True,
                original_quote=original_quote,
                match_found=True,
                match_score=1.0,
                error_message=None
            )
        # Strategy 2: Fuzzy match
        fuzzy_score = self._fuzzy_match(original_quote, source_text)
        if fuzzy_score >= self.fuzzy_threshold:
            return ValidationResult(
                item_index=item_index,
                is_valid=True,
                original_quote=original_quote,
                match_found=True,
                match_score=fuzzy_score,
                error_message=None
            )
        return ValidationResult(
            item_index=item_index,
            is_valid=False,
            original_quote=original_quote,
            match_found=False,
            match_score=fuzzy_score,
            error_message=f"Quote not found. Score: {fuzzy_score:.2f}"
        )

    def validate_game_output(
        self,
        game_type: str,
        items: List[Dict[str, Any]],
        source_text: str
    ) -> ValidatedGameOutput:
        """Validate every item of a game output, splitting valid from invalid."""
        valid_items = []
        invalid_items = []
        validation_results = []
        for i, item in enumerate(items):
            original_quote = item.get("original_quote", "")
            result = self.validate_quote(original_quote, source_text, i)
            validation_results.append(result)
            if result.is_valid:
                valid_items.append(item)
            else:
                # Annotate the rejected item so callers can surface the reason.
                item["_validation_error"] = result.error_message
                invalid_items.append(item)
        return ValidatedGameOutput(
            game_type=game_type,
            valid_items=valid_items,
            invalid_items=invalid_items,
            validation_results=validation_results
        )
def quick_validate(original_quote: str, source_text: str, threshold: float = 0.85) -> bool:
    """Convenience one-shot check: is the quote present in the source text?"""
    return QuoteValidator(fuzzy_threshold=threshold).validate_quote(
        original_quote, source_text
    ).is_valid