feat: Add HR data visualization and CSV conversion scripts

This commit is contained in:
2025-06-10 17:09:14 +08:00
parent 884a575d7d
commit 1d1170f19c
10 changed files with 1323 additions and 38 deletions

120
convert_parquet_to_csv.py Normal file
View File

@ -0,0 +1,120 @@
#!/usr/bin/env python3
"""
Convert parquet file to CSV with only HR field
"""
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import sys
def _flatten_hr_rows(rows):
    """Concatenate per-row HR sequences into one flat list.

    Rows that are None or empty are skipped.
    """
    flat = []
    for row in rows:
        # len() rather than truthiness: rows may be numpy arrays, whose
        # truth value is ambiguous for more than one element.
        if row is not None and len(row) > 0:
            flat.extend(row)
    return flat


def _save_hr_values(hr_values, output_file, empty_message):
    """Write hr_values to output_file as a one-column "HR" CSV.

    Prints empty_message and returns False when there is nothing to write;
    returns True after the CSV has been written.
    """
    if not hr_values:
        print(empty_message)
        return False

    hr_df = pd.DataFrame({"HR": hr_values})
    print(f"\nExtracted {len(hr_values)} HR values")
    print("Sample HR values:")
    print(hr_df.head(10))
    hr_df.to_csv(output_file, index=False)
    print(f"\nSaved HR data to {output_file}")
    return True


def convert_parquet_to_csv(parquet_file, output_file):
    """Convert a parquet file to a CSV containing only the HR data.

    The "hr_data" column is read, its per-row sequences are concatenated
    into one flat sequence, and the result is written to output_file as a
    single "HR" column (no index column).

    Args:
        parquet_file: Path of the parquet file to read.
        output_file: Path of the CSV file to write.

    Returns:
        True if HR values were found and written; False if the column is
        missing, holds no values, or any error occurred. Errors are printed
        with a traceback rather than raised.
    """
    try:
        print(f"Reading {parquet_file}...")
        pf = pq.ParquetFile(parquet_file)
        print("Schema:")
        print(pf.schema)

        try:
            # Preferred path: read only the column we need and let pandas
            # materialise the nested values.
            table = pf.read(columns=["hr_data"])
            print("Successfully read hr_data column")
            df = table.to_pandas()
            print("Converted to pandas DataFrame")
        except Exception as e:
            print(f"Error reading with pandas conversion: {e}")
            # Fallback: bypass pandas and pull the values straight out of
            # the pyarrow column.
            table = pf.read()
            print("Read raw table successfully")
            hr_data_column = table.column("hr_data")
            print(f"HR data column type: {hr_data_column.type}")
            # to_pylist() covers every chunk of the chunked column; slicing
            # only chunk(0) by a whole-column row index breaks as soon as
            # the column is stored in more than one chunk.
            hr_data_values = _flatten_hr_rows(hr_data_column.to_pylist())
            return _save_hr_values(
                hr_data_values, output_file, "No HR data found"
            )

        # If we got here, the pandas conversion worked.
        print("Columns available:")
        print(df.columns.tolist())
        print("\nData shape:", df.shape)

        # NOTE(review): this branch presumably exists because a column
        # missing from the file may be skipped silently by the selective
        # read above instead of raising -- confirm against pyarrow docs.
        if "hr_data" not in df.columns:
            print("Error: 'hr_data' column not found in the data")
            print("Available columns:", df.columns.tolist())
            return False

        hr_values = _flatten_hr_rows(df["hr_data"])
        return _save_hr_values(
            hr_values, output_file, "No HR values found in the data"
        )
    except Exception as e:
        # Top-level boundary for this script: report and signal failure
        # through the return value instead of propagating.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return False
if __name__ == "__main__":
    # Usage: convert_parquet_to_csv.py [INPUT.parquet [OUTPUT.csv]]
    # With no arguments the original hard-coded file names are used.
    import os

    cli_args = sys.argv[1:]
    parquet_file = cli_args[0] if cli_args else "history_20250610_165414.parquet"
    if len(cli_args) > 1:
        output_file = cli_args[1]
    else:
        # Derive "<input without extension>_HR.csv"; for the default input
        # this yields "history_20250610_165414_HR.csv".
        output_file = os.path.splitext(parquet_file)[0] + "_HR.csv"

    success = convert_parquet_to_csv(parquet_file, output_file)
    if success:
        print("\nConversion completed successfully!")
        print(f"Input: {parquet_file}")
        print(f"Output: {output_file}")
    else:
        print("Conversion failed!")
        # Non-zero exit status so callers and shell scripts can detect failure.
        sys.exit(1)