# Map-NEO / debug_downloaded_data.py
# Austin207's picture
# Upload folder using huggingface_hub
# a683148 verified
"""debug_downloaded_data.py - Inspect the downloaded conversation format"""
import json
def _summarize_value(value, max_len=100):
    """Return a short printable stand-in for a top-level field value.

    Long strings are cut to ``max_len`` characters plus ``"..."``; dicts and
    lists are replaced by a one-line description. Anything else is returned
    unchanged.
    """
    if isinstance(value, str) and len(value) > max_len:
        return value[:max_len] + "..."
    if isinstance(value, dict):
        return f"Dict with keys: {list(value.keys())}"
    if isinstance(value, list):
        return f"List with {len(value)} items"
    return value


def _explore_nested(key, nested):
    """Print one extra level of detail for a nested dict or list field."""
    print(f"\n Exploring {key}:")
    if isinstance(nested, dict):
        print(f" Keys: {list(nested.keys())}")
        # Only the first three entries are shown to keep the output short.
        for nkey, nvalue in list(nested.items())[:3]:
            if isinstance(nvalue, str) and len(nvalue) > 50:
                nvalue = nvalue[:50] + "..."
            print(f" {nkey}: {nvalue}")
    elif isinstance(nested, list) and nested:
        first = nested[0]
        print(f" First item type: {type(first)}")
        # Inspect the first *element*; the list itself is never a dict, so
        # testing `nested` here would make this branch unreachable.
        if isinstance(first, dict):
            print(f" First item keys: {list(first.keys())}")


def inspect_downloaded_data(
    data_path="data/conversation_raw/OpenAssistant_oasst1_raw.jsonl",
    max_records=5,
):
    """Inspect the first few records of a JSONL file to understand its format.

    Reads up to ``max_records`` lines from ``data_path``, parses each
    non-blank line as JSON and prints the top-level keys, a shortened preview
    of every value, and one extra level of detail for the ``prompt``,
    ``conversation`` and ``messages`` fields when they hold a dict or list.

    Args:
        data_path: Path of the UTF-8 JSONL file to inspect. Defaults to the
            raw OpenAssistant download location used by this script.
        max_records: Number of lines to read from the start of the file.
            Blank lines count towards this limit but are not printed.

    Returns:
        None. All results are written to standard output. File-access and
        parse errors are reported as ``Error reading file: ...`` instead of
        being raised.
    """
    # "\U0001F50D" is the magnifying-glass emoji; written as an escape so the
    # source stays ASCII and cannot be mangled by a wrong file encoding.
    print("\U0001F50D Inspecting downloaded OpenAssistant data...")
    print("=" * 50)

    try:
        with open(data_path, "r", encoding="utf-8") as f:
            for i in range(max_records):
                raw_line = f.readline()
                if not raw_line:
                    # readline() returns "" only at end of file.
                    break
                line = raw_line.strip()
                if not line:
                    continue

                record = json.loads(line)
                print(f"\nRecord {i+1}:")
                if not isinstance(record, dict):
                    # Valid JSON that is not an object has no keys to list.
                    print(
                        " Skipping: expected a JSON object, got "
                        f"{type(record).__name__}"
                    )
                    continue

                print(f"Top-level keys: {list(record.keys())}")

                # Show sample content for each key
                for key, value in record.items():
                    print(f" {key}: {_summarize_value(value)}")

                # If there's a nested structure, explore it
                for key in ("prompt", "conversation", "messages"):
                    nested = record.get(key)
                    if isinstance(nested, (dict, list)):
                        _explore_nested(key, nested)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading file: {e}")
if __name__ == "__main__":
    # Run the inspection only when executed as a script, not when imported.
    inspect_downloaded_data()