#!/usr/bin/env python3
"""
Convert parquet file to CSV with only HR field

Usage:
    python <this_script> [PARQUET_FILE [OUTPUT_CSV]]

Both arguments are optional. Without arguments the script converts
``history_20250610_165414.parquet`` into ``history_20250610_165414_HR.csv``.
When only PARQUET_FILE is given, the output name is derived from it by
replacing the extension with ``_HR.csv``.
"""

import sys
import traceback
from pathlib import Path

import pandas as pd
import pyarrow as pa  # noqa: F401  (not used below; kept from the original import block)
import pyarrow.parquet as pq

# Input file used when the script is run without arguments.
DEFAULT_PARQUET_FILE = "history_20250610_165414.parquet"


def _flatten_hr_rows(rows):
    """Concatenate the per-row HR sequences in *rows* into one flat list.

    Rows that are ``None`` or empty are skipped.
    """
    hr_values = []
    for row in rows:
        # len() rather than truthiness: a row may be a numpy array, whose
        # truth value is ambiguous when it holds more than one element.
        if row is not None and len(row) > 0:
            hr_values.extend(row)
    return hr_values


def _write_hr_csv(hr_values, output_file, empty_message):
    """Write *hr_values* as a single-column ``HR`` CSV.

    Prints *empty_message* and returns False when there is nothing to write;
    otherwise writes the file and returns True.
    """
    if not hr_values:
        print(empty_message)
        return False

    hr_df = pd.DataFrame({"HR": hr_values})
    print(f"\nExtracted {len(hr_values)} HR values")
    print("Sample HR values:")
    print(hr_df.head(10))

    hr_df.to_csv(output_file, index=False)
    print(f"\nSaved HR data to {output_file}")
    return True


def convert_parquet_to_csv(parquet_file, output_file):
    """Convert parquet file to CSV extracting only HR data.

    Reads the ``hr_data`` column of *parquet_file*, flattens its per-row
    lists into one sequence and writes that to *output_file* as a CSV with a
    single ``HR`` column. Progress and diagnostics are printed to stdout.

    Args:
        parquet_file: Path of the parquet file to read.
        output_file: Path of the CSV file to create.

    Returns:
        True when HR values were found and written. False when the column is
        missing, holds no values, or any error occurred; errors are printed
        with a traceback instead of being raised.
    """
    try:
        print(f"Reading {parquet_file}...")
        pf = pq.ParquetFile(parquet_file)

        print("Schema:")
        print(pf.schema)

        try:
            # Preferred path: load only the needed column and let pandas
            # handle the nested structure.
            table = pf.read(columns=["hr_data"])
            print("Successfully read hr_data column")

            df = table.to_pandas()
            print("Converted to pandas DataFrame")
        except Exception as e:
            # Best-effort fallback: work on the raw pyarrow table instead.
            print(f"Error reading with pandas conversion: {e}")

            table = pf.read()
            print("Read raw table successfully")

            hr_data_column = table.column("hr_data")
            print(f"HR data column type: {hr_data_column.type}")

            # to_pylist() walks every chunk of the ChunkedArray. Indexing
            # chunk(0) with row numbers of the whole column would fail (or
            # drop data) as soon as the column has more than one chunk, and
            # it would re-enter the pandas conversion that just failed.
            hr_data_values = _flatten_hr_rows(hr_data_column.to_pylist())
            return _write_hr_csv(hr_data_values, output_file, "No HR data found")

        # If we got here, the pandas conversion worked.
        print("Columns available:")
        print(df.columns.tolist())
        print("\nData shape:", df.shape)

        if "hr_data" not in df.columns:
            print("Error: 'hr_data' column not found in the data")
            print("Available columns:", df.columns.tolist())
            return False

        hr_values = _flatten_hr_rows(df["hr_data"])
        return _write_hr_csv(hr_values, output_file, "No HR values found in the data")

    except Exception as e:
        # Top-level boundary of the conversion: report and signal failure.
        print(f"Error: {e}")
        traceback.print_exc()
        return False


def main(argv=None):
    """Run the conversion from the command line and return an exit status.

    Args:
        argv: Optional argument list ``[PARQUET_FILE [OUTPUT_CSV]]``;
            defaults to ``sys.argv[1:]``.

    Returns:
        0 on success, 1 when the conversion failed.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    parquet_file = args[0] if args else DEFAULT_PARQUET_FILE
    if len(args) > 1:
        output_file = args[1]
    else:
        # "<name>.parquet" -> "<name>_HR.csv" next to the input file.
        source = Path(parquet_file)
        output_file = str(source.with_name(f"{source.stem}_HR.csv"))

    if not convert_parquet_to_csv(parquet_file, output_file):
        print("Conversion failed!")
        return 1

    print("\nConversion completed successfully!")
    print(f"Input: {parquet_file}")
    print(f"Output: {output_file}")
    return 0


if __name__ == "__main__":
    sys.exit(main())