feat: Add HR data visualization and CSV conversion scripts

2025-06-10 17:09:14 +08:00
parent 884a575d7d
commit 1d1170f19c
10 changed files with 1323 additions and 38 deletions
--- a/convert_parquet_to_csv.py
+++ b/convert_parquet_to_csv.py
@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+"""
+Convert parquet file to CSV with only HR field
+"""
+
+import pyarrow.parquet as pq
+import pyarrow as pa
+import pandas as pd
+import sys
+
+
+def _flatten_hr_rows(rows):
+    """Concatenate the per-row HR lists in *rows* into a single flat list.
+
+    Rows that are ``None`` or empty are skipped. Each remaining row only
+    needs ``len()`` and iteration, so Python lists and array-like row
+    values are both accepted.
+    """
+    values = []
+    for row in rows:
+        if row is not None and len(row) > 0:
+            values.extend(row)
+    return values
+
+
+def convert_parquet_to_csv(parquet_file, output_file):
+    """Extract the ``hr_data`` column of a parquet file into a one-column CSV.
+
+    Every row of ``hr_data`` is expected to hold a list of HR samples. The
+    lists are concatenated in row order and written to *output_file* under a
+    single ``HR`` header, without an index column.
+
+    Args:
+        parquet_file: Path of the parquet file to read.
+        output_file: Path of the CSV file to create.
+
+    Returns:
+        True if at least one HR value was written; False if the column is
+        missing, holds no values, or any error occurred. Errors are printed
+        with a traceback rather than raised.
+    """
+    try:
+        print(f"Reading {parquet_file}...")
+        pf = pq.ParquetFile(parquet_file)
+
+        print("Schema:")
+        print(pf.schema)
+
+        try:
+            # Preferred path: load only the column we need and go via pandas.
+            table = pf.read(columns=["hr_data"])
+            print("Successfully read hr_data column")
+
+            df = table.to_pandas()
+            print("Converted to pandas DataFrame")
+        except Exception as e:
+            print(f"Error reading with pandas conversion: {e}")
+            # Fallback: read the whole table and stay in pyarrow.
+            table = pf.read()
+            print("Read raw table successfully")
+
+            hr_data_column = table.column("hr_data")
+            print(f"HR data column type: {hr_data_column.type}")
+
+            # to_pylist() walks every chunk of the chunked column; indexing
+            # a single chunk by the column-wide row position would fail as
+            # soon as the column is stored in more than one chunk.
+            hr_values = _flatten_hr_rows(hr_data_column.to_pylist())
+            empty_message = "No HR data found"
+        else:
+            # Runs only when the pandas read succeeded; errors raised here go
+            # to the outer handler instead of triggering the fallback.
+            print("Columns available:")
+            print(df.columns.tolist())
+            print("\nData shape:", df.shape)
+
+            if "hr_data" not in df.columns:
+                print("Error: 'hr_data' column not found in the data")
+                print("Available columns:", df.columns.tolist())
+                return False
+
+            hr_values = _flatten_hr_rows(df["hr_data"])
+            empty_message = "No HR values found in the data"
+
+        if not hr_values:
+            print(empty_message)
+            return False
+
+        hr_df = pd.DataFrame({"HR": hr_values})
+
+        print(f"\nExtracted {len(hr_values)} HR values")
+        print("Sample HR values:")
+        print(hr_df.head(10))
+
+        hr_df.to_csv(output_file, index=False)
+        print(f"\nSaved HR data to {output_file}")
+        return True
+
+    except Exception as e:
+        # Top-level boundary of the script: report the failure, signal False.
+        print(f"Error: {e}")
+        import traceback
+
+        traceback.print_exc()
+        return False
+
+
+if __name__ == "__main__":
+    import os  # needed only by this entry point
+    # Usage: script [input.parquet [output.csv]]; output defaults to "<input stem>_HR.csv".
+    args = sys.argv[1:]
+    parquet_file = args[0] if args else "history_20250610_165414.parquet"
+    output_file = args[1] if len(args) > 1 else os.path.splitext(parquet_file)[0] + "_HR.csv"
+    if convert_parquet_to_csv(parquet_file, output_file):
+        print("\nConversion completed successfully!")
+        print(f"Input: {parquet_file}")
+        print(f"Output: {output_file}")
+    else:
+        print("Conversion failed!")
+        sys.exit(1)