Dataset / app.py
teszenofficial's picture
Create app.py
14d6cbc verified
import gradio as gr
import os
import json
import pyarrow.parquet as pq
from datasets import load_dataset
# Path of the JSONL file that the converter writes and then returns to the UI
# so it can be offered for download.
OUTPUT_FILE = "wiki40b_es_train.jsonl"
def convert_parquet_to_jsonl(progress=gr.Progress()):
    """Export the Spanish ``google/wiki40b`` train split to a JSONL file.

    The split is loaded with ``load_dataset``, dumped to a temporary Parquet
    file, and read back in batches; every row is written to ``OUTPUT_FILE``
    as one JSON object per line. The temporary Parquet file is removed even
    when the conversion fails.

    Args:
        progress: Gradio progress tracker, called with a completion fraction
            and a description for each stage.

    Returns:
        The path of the generated JSONL file (``OUTPUT_FILE``).
    """
    batch_size = 1000
    parquet_path = "temp.parquet"

    progress(0, desc="Cargando dataset google/wiki40b (es)...")
    # NOTE: this loads the whole "train" split of the "es" configuration,
    # not an individual shard.
    dataset = load_dataset(
        "google/wiki40b",
        "es",
        split="train",
        streaming=False,
    )

    try:
        # Write a clean Parquet export that can be streamed back batch by batch.
        progress(0.2, desc="Exportando dataset a Parquet temporal...")
        dataset.to_parquet(parquet_path)

        progress(0.4, desc="Convirtiendo Parquet a JSONL...")
        # The context manager closes the Parquet handle before the file is
        # deleted in the ``finally`` clause below.
        with pq.ParquetFile(parquet_path) as parquet_file:
            total_rows = parquet_file.metadata.num_rows
            rows_written = 0
            with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
                batches = parquet_file.iter_batches(batch_size=batch_size)
                for i, batch in enumerate(batches):
                    f.writelines(
                        json.dumps(record, ensure_ascii=False) + "\n"
                        for record in batch.to_pylist()
                    )
                    rows_written += batch.num_rows
                    # Progress is measured in rows: the number of batches is
                    # unrelated to the number of row groups, so a row-group
                    # based ratio would run past 100 %.
                    done = rows_written / total_rows if total_rows else 1.0
                    progress(
                        0.4 + 0.6 * done,
                        desc=(
                            f"Procesando lote {i+1} "
                            f"({rows_written}/{total_rows} filas)"
                        ),
                    )
    finally:
        try:
            os.remove(parquet_path)
        except FileNotFoundError:
            # The export failed before the temporary file was created;
            # there is nothing to clean up.
            pass

    progress(1.0, desc="Conversión completada ✅")
    return OUTPUT_FILE
# Gradio front end: a button triggers the conversion and a file component
# exposes the resulting JSONL for download.
with gr.Blocks(title="Wiki40B ES → JSONL Converter") as app:
    # NOTE(review): the text below names a single shard, while the callback
    # requests split="train" from load_dataset — confirm which one is intended.
    gr.Markdown(
        """
# 🧠 Wiki40B (ES) → JSONL
Convierte el archivo
`es/train-00000-of-00006.parquet`
del dataset **google/wiki40b** a **JSONL**.
👉 Cuando termine, podrás **descargar el archivo a tu PC**.
"""
    )

    convert_btn = gr.Button("🚀 Convertir a JSONL")
    output_file = gr.File(label="📥 Descargar JSONL")

    # The callback takes no UI inputs; its return value (a file path) feeds
    # the download component.
    convert_btn.click(convert_parquet_to_jsonl, inputs=None, outputs=output_file)

# Start the web server for the app.
app.launch()