#!/usr/bin/env python3
|
|
"""
|
|
Convert parquet file to CSV with only HR field
|
|
"""
|
|
|
|
import pyarrow.parquet as pq
|
|
import pyarrow as pa
|
|
import pandas as pd
|
|
import sys
|
|
|
|
|
|
def convert_parquet_to_csv(parquet_file, output_file):
    """Extract the ``hr_data`` column from a parquet file and write it as CSV.

    The output CSV has a single ``HR`` column containing every HR sample
    flattened out of the (list-valued) ``hr_data`` column.

    Args:
        parquet_file: Path to the input parquet file.
        output_file: Path where the HR-only CSV is written.

    Returns:
        True when at least one HR value was extracted and written,
        False otherwise (missing column, empty data, or any read error).
    """
    try:
        print(f"Reading {parquet_file}...")
        pf = pq.ParquetFile(parquet_file)

        print("Schema:")
        print(pf.schema)

        try:
            # Preferred path: read only the column we need and let
            # pyarrow convert it to pandas.
            table = pf.read(columns=["hr_data"])
            print("Successfully read hr_data column")

            df = table.to_pandas()
            print("Converted to pandas DataFrame")
        except Exception as e:
            # Fallback: the nested list structure of hr_data can break the
            # pandas conversion; read the raw table and flatten the column
            # directly with pyarrow instead.
            print(f"Error reading with pandas conversion: {e}")
            table = pf.read()
            print("Read raw table successfully")

            hr_data_column = table.column("hr_data")
            print(f"HR data column type: {hr_data_column.type}")

            # Flatten all (possibly multi-chunk) list values in one pass.
            # NOTE: the previous implementation sliced chunk(0) with indices
            # spanning the whole column — wrong for multi-chunk columns and
            # O(n^2); to_pylist() handles chunking correctly.
            hr_data_values = []
            for row in hr_data_column.to_pylist():
                if row:  # skip None / empty lists
                    hr_data_values.extend(row)

            return _save_hr_csv(hr_data_values, output_file)

        # Pandas conversion succeeded.
        print("Columns available:")
        print(df.columns.tolist())
        print("\nData shape:", df.shape)

        if "hr_data" not in df.columns:
            print("Error: 'hr_data' column not found in the data")
            print("Available columns:", df.columns.tolist())
            return False

        # Each cell is expected to hold a list/array of HR samples
        # (presumably numeric — TODO confirm against the writer's schema).
        hr_values = []
        for hr_data in df["hr_data"]:
            if hr_data is not None and len(hr_data) > 0:
                hr_values.extend(hr_data)

        return _save_hr_csv(hr_values, output_file)

    except Exception as e:
        # Script-level boundary: report the failure and signal it to the
        # caller instead of crashing.
        print(f"Error: {e}")
        import traceback

        traceback.print_exc()
        return False


def _save_hr_csv(hr_values, output_file):
    """Write flattened HR values to *output_file*; return True on success."""
    if not hr_values:
        print("No HR values found in the data")
        return False

    hr_df = pd.DataFrame({"HR": hr_values})

    print(f"\nExtracted {len(hr_values)} HR values")
    print("Sample HR values:")
    print(hr_df.head(10))

    hr_df.to_csv(output_file, index=False)
    print(f"\nSaved HR data to {output_file}")
    return True
|
|
|
|
|
|
if __name__ == "__main__":
    # Optional CLI overrides: script.py [input.parquet [output.csv]].
    # With no arguments the original hard-coded filenames are used, so
    # existing invocations behave exactly as before.
    parquet_file = sys.argv[1] if len(sys.argv) > 1 else "history_20250610_165414.parquet"
    output_file = sys.argv[2] if len(sys.argv) > 2 else "history_20250610_165414_HR.csv"

    success = convert_parquet_to_csv(parquet_file, output_file)

    if success:
        print("\nConversion completed successfully!")
        print(f"Input: {parquet_file}")
        print(f"Output: {output_file}")
    else:
        # Non-zero exit status so shell pipelines can detect the failure.
        print("Conversion failed!")
        sys.exit(1)