import gradio as gr
import os
import json
import pyarrow.parquet as pq
from datasets import load_dataset

OUTPUT_FILE = "wiki40b_es_train.jsonl"


def convert_parquet_to_jsonl(progress=gr.Progress()):
    progress(0, desc="Loading google/wiki40b (es) dataset...")

    # Download the full Spanish train split (cached locally by `datasets`)
    dataset = load_dataset(
        "google/wiki40b",
        "es",
        split="train",
        streaming=False
    )

    # Hugging Face already stores the data as Arrow/Parquet internally,
    # but here we force a clean export to a single Parquet file
    progress(0.2, desc="Exporting dataset to a temporary Parquet file...")
    parquet_path = "temp.parquet"
    dataset.to_parquet(parquet_path)

    progress(0.4, desc="Converting Parquet to JSONL...")
    parquet_file = pq.ParquetFile(parquet_path)
    # iter_batches() yields fixed-size record batches, so the batch count
    # comes from the total row count, not from num_row_groups
    batch_size = 1000
    total_batches = -(-parquet_file.metadata.num_rows // batch_size)  # ceil

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for i, batch in enumerate(parquet_file.iter_batches(batch_size=batch_size)):
            batch_dict = batch.to_pydict()
            keys = list(batch_dict.keys())
            rows = zip(*batch_dict.values())
            for row in rows:
                record = dict(zip(keys, row))
                f.write(json.dumps(record, ensure_ascii=False) + "\n")
            progress(0.4 + 0.6 * ((i + 1) / total_batches),
                     desc=f"Processing batch {i + 1}/{total_batches}")

    os.remove(parquet_path)
    progress(1.0, desc="Conversion completed ✅")
    return OUTPUT_FILE
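

# A shorter path, for reference: `datasets` can also write JSON Lines
# directly, skipping the temporary Parquet round-trip. This is a minimal
# sketch, not wired into the UI below; Dataset.to_json forwards extra
# kwargs to pandas, so treat `force_ascii=False` as an assumption to verify.
def convert_direct_to_jsonl():
    dataset = load_dataset("google/wiki40b", "es", split="train")
    dataset.to_json(OUTPUT_FILE, lines=True, force_ascii=False)
    return OUTPUT_FILE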


with gr.Blocks(title="Wiki40B ES → JSONL Converter") as app:
    gr.Markdown(
        """
        # 🧠 Wiki40B (ES) → JSONL

        Converts the Spanish `train` split of the **google/wiki40b** dataset
        (stored upstream as Parquet shards such as
        `es/train-00000-of-00006.parquet`) to **JSONL**.

        👉 When it finishes, you can **download the file to your PC**.
        """
    )

    convert_btn = gr.Button("🚀 Convert to JSONL")
    output_file = gr.File(label="📥 Download JSONL")

    convert_btn.click(
        fn=convert_parquet_to_jsonl,
        outputs=output_file
    )
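
    # gr.Progress updates travel through Gradio's queue; recent versions
    # enable it by default, but on older Gradio you may need: app.queue()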

app.launch()
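
# Usage note: run `python app.py` and open the local URL Gradio prints
# (http://127.0.0.1:7860 by default). Passing share=True to app.launch()
# creates a temporary public link; an optional tweak, not part of the
# original app.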