Spaces:
Sleeping
Sleeping
| #!/usr/bin/python3 | |
| # -*- coding: utf-8 -*- | |
| import argparse | |
| from datetime import datetime | |
| from pathlib import Path | |
| import pandas as pd | |
| import requests | |
| from tqdm import tqdm | |
| from project_settings import project_path | |
def get_args():
    """Parse the command-line options of this download script.

    Every option is a string and has a default:

    * ``--excel_file_dir`` -- a directory path (default is under
      ``project_path``).
    * ``--start_date`` / ``--end_date`` -- date-time strings written like
      ``"2022-04-10 00:00:00"``.
    * ``--output_dir`` -- a directory path (default is under
      ``project_path``).

    Returns:
        argparse.Namespace: the parsed options, exposed as the attributes
        ``excel_file_dir``, ``start_date``, ``end_date`` and ``output_dir``.
    """
    # (flag, default) pairs, registered in this order.
    option_defaults = (
        ("--excel_file_dir", (project_path / "examples/download_wav").as_posix()),
        ("--start_date", "2022-04-10 00:00:00"),
        ("--end_date", "2026-04-21 00:00:00"),
        ("--output_dir", (project_path / "data/calling/63/wav_2ch").as_posix()),
    )

    parser = argparse.ArgumentParser()
    for flag, default_value in option_defaults:
        parser.add_argument(flag, default=default_value, type=str)
    return parser.parse_args()
# Newline-separated names of the record listing files to process.  main()
# splits this text on "\n", strips each entry and ignores the empty ones, so
# the leading and trailing empty entries below are harmless.
excel_file_str = "\n".join([
    "",
    "record_1110.csv",
    "record_1104.csv",
    "",
])
def _fetch_record(record_url, timeout=(10, 60)):
    """Download one recording and return its bytes, or None to skip it.

    Args:
        record_url: URL of the recording.
        timeout: ``(connect, read)`` timeout in seconds handed to
            ``requests.get`` so a stalled server cannot hang the whole run.

    Returns:
        The response body as ``bytes``, or ``None`` when the request failed
        or the server answered 404.

    Raises:
        AssertionError: the server answered with a status other than
            200 or 404.
    """
    try:
        resp = requests.get(url=record_url, timeout=timeout)
    except (TimeoutError, requests.exceptions.ConnectionError):
        # Best effort: connection problems just skip this recording.
        return None
    except Exception as e:
        print(e)
        return None

    if resp.status_code == 404:
        return None
    if resp.status_code != 200:
        raise AssertionError("status_code: {}; text: {}".format(resp.status_code, resp.text))
    return resp.content


def main():
    """Download the recordings listed in the CSV files named by ``excel_file_str``.

    Each CSV is read from ``--excel_file_dir`` and must have a
    ``record_file`` column holding the recording URL.  The file name of the
    URL is used as the output name inside ``--output_dir``; its stem is the
    call id.  Call ids that already have a ``*.wav`` file in the output
    directory, or that were already seen during this run, are skipped, so
    the script can be re-run to resume.

    Raises:
        AssertionError: a download answered with a status other than
            200 or 404.
    """
    args = get_args()

    format_str = "%Y-%m-%d %H:%M:%S"
    start_date = datetime.strptime(args.start_date, format_str)
    end_date = datetime.strptime(args.end_date, format_str)

    excel_file_dir = Path(args.excel_file_dir)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"start_date: {start_date}")
    print(f"end_date: {end_date}")

    # No per-row call date is read from the CSV: every row gets this fixed
    # date, so the window test has the same outcome for all rows and is
    # evaluated once instead of re-parsing the constant per row.
    call_date = datetime.strptime("2025-10-12 00:00:00", format_str)
    in_window = start_date < call_date < end_date

    # Call ids that already have a recording on disk.
    finished = {wav_file.stem for wav_file in output_dir.glob("*.wav")}

    for line in excel_file_str.split("\n"):
        name = line.strip()
        if not name:
            continue

        df = pd.read_csv((excel_file_dir / name).as_posix())

        for _, record in tqdm(df.iterrows(), total=len(df)):
            record_url = record["record_file"]
            # Check for a missing cell BEFORE building a Path from it:
            # Path(nan) raises TypeError.
            if pd.isna(record_url):
                continue

            call_id = Path(record_url).stem
            if call_id in finished:
                continue
            # Marked before the download so duplicates (and failures) are
            # not attempted again during this run.
            finished.add(call_id)

            if not in_window:
                continue

            content = _fetch_record(record_url)
            if content is None:
                continue

            filename = output_dir / Path(record_url).name
            # Write to a side file and rename it afterwards, so an
            # interrupted run never leaves a truncated recording that the
            # next run would treat as finished.
            partial = filename.with_name(filename.name + ".part")
            with open(partial.as_posix(), "wb") as f:
                f.write(content)
            partial.replace(filename)

    return


if __name__ == "__main__":
    main()