|
|
"""debug_downloaded_data.py - Inspect the downloaded conversation format"""
|
|
|
|
|
|
import json
|
|
|
|
|
|
_DEFAULT_DATA_PATH = "data/conversation_raw/OpenAssistant_oasst1_raw.jsonl"


def _summarize_value(value):
    """Return a compact, printable stand-in for a top-level field value.

    Strings longer than 100 characters are truncated, dicts are replaced by a
    listing of their keys, and lists by their length. Anything else is
    returned unchanged.
    """
    if isinstance(value, str) and len(value) > 100:
        return value[:100] + "..."
    if isinstance(value, dict):
        return f"Dict with keys: {list(value.keys())}"
    if isinstance(value, list):
        return f"List with {len(value)} items"
    return value


def _explore_nested(key, nested):
    """Print a one-level-deep look into a nested dict or list field."""
    print(f"\n Exploring {key}:")
    if isinstance(nested, dict):
        print(f" Keys: {list(nested.keys())}")
        # Only the first three entries are shown to keep the output short.
        for nkey, nvalue in list(nested.items())[:3]:
            if isinstance(nvalue, str) and len(nvalue) > 50:
                nvalue = nvalue[:50] + "..."
            print(f" {nkey}: {nvalue}")
    elif nested:
        # Non-empty list: describe its first element. The keys must be read
        # from the element itself, not from the enclosing list.
        first_item = nested[0]
        print(f" First item type: {type(first_item)}")
        if isinstance(first_item, dict):
            print(f" First item keys: {list(first_item.keys())}")


def inspect_downloaded_data(data_path=_DEFAULT_DATA_PATH, max_lines=5):
    """Inspect the first few records of a JSONL file to understand its format.

    Reads at most ``max_lines`` lines from ``data_path``, parses each
    non-blank line as JSON and prints its top-level keys, a shortened view of
    every value, and a closer look at the ``prompt``, ``conversation`` and
    ``messages`` fields when they hold a dict or a list.

    Args:
        data_path: Path of the JSONL file to read.
        max_lines: Maximum number of lines to read. Blank lines count towards
            this limit but are not printed.

    Returns:
        None. All results are written to standard output.

    File-access and decoding/parsing errors (``OSError``, ``ValueError``,
    which includes ``json.JSONDecodeError``) are reported on standard output
    instead of being raised.
    """
    # NOTE(review): the leading glyph looks like a mis-encoded emoji; it is
    # kept as-is because the intended character cannot be determined here.
    print("๐ Inspecting downloaded OpenAssistant data...")
    print("=" * 50)

    try:
        with open(data_path, 'r', encoding='utf-8') as f:
            # zip() with the range first stops after max_lines lines without
            # consuming an extra line from the file.
            for line_no, raw_line in zip(range(1, max_lines + 1), f):
                line = raw_line.strip()
                if not line:
                    continue

                record = json.loads(line)
                print(f"\nRecord {line_no}:")

                if not isinstance(record, dict):
                    # Valid JSON but not an object: there are no keys to list.
                    print(f"Top-level value is a {type(record).__name__}, not an object")
                    continue

                print(f"Top-level keys: {list(record.keys())}")

                for key, value in record.items():
                    print(f" {key}: {_summarize_value(value)}")

                for key in ('prompt', 'conversation', 'messages'):
                    nested = record.get(key)
                    if isinstance(nested, (dict, list)):
                        _explore_nested(key, nested)
    except (OSError, ValueError) as e:
        print(f"Error reading file: {e}")
|
|
|
|
|
|
# Run the inspection only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    inspect_downloaded_data()
|
|
|
|