feat: Add HR data visualization and CSV conversion scripts
This commit is contained in:
120
convert_parquet_to_csv.py
Normal file
120
convert_parquet_to_csv.py
Normal file
@ -0,0 +1,120 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert parquet file to CSV with only HR field
|
||||
"""
|
||||
|
||||
import pyarrow.parquet as pq
|
||||
import pyarrow as pa
|
||||
import pandas as pd
|
||||
import sys
|
||||
|
||||
|
||||
def _write_hr_csv(hr_values, output_file, empty_message):
    """Write flattened HR samples to a one-column CSV named ``HR``.

    Prints a short preview of the data, saves it, and reports status.

    Args:
        hr_values: Flat list of HR samples (may be empty).
        output_file: Path of the CSV file to write.
        empty_message: Message to print when there is nothing to write.

    Returns:
        True when a CSV was written, False when ``hr_values`` was empty.
    """
    if not hr_values:
        print(empty_message)
        return False

    hr_df = pd.DataFrame({"HR": hr_values})

    print(f"\nExtracted {len(hr_values)} HR values")
    print("Sample HR values:")
    print(hr_df.head(10))

    hr_df.to_csv(output_file, index=False)
    print(f"\nSaved HR data to {output_file}")
    return True


def convert_parquet_to_csv(parquet_file, output_file):
    """Convert a parquet file to CSV, extracting only the HR data.

    Reads ``parquet_file``, pulls the nested list-valued ``hr_data``
    column, flattens every per-row list of HR samples into one flat
    sequence, and writes it to ``output_file`` as a single ``HR`` column.

    Args:
        parquet_file: Path to the input parquet file.
        output_file: Path of the CSV file to write.

    Returns:
        True on success; False when no HR data was found or on any error
        (errors are printed with a traceback rather than raised).
    """
    try:
        print(f"Reading {parquet_file}...")
        pf = pq.ParquetFile(parquet_file)

        print("Schema:")
        print(pf.schema)

        # Fast path: read only the hr_data column and convert via pandas.
        try:
            table = pf.read(columns=["hr_data"])
            print("Successfully read hr_data column")

            df = table.to_pandas()
            print("Converted to pandas DataFrame")
        except Exception as e:
            # Fallback: the nested structure can break the pandas
            # conversion, so work on the raw pyarrow column instead.
            print(f"Error reading with pandas conversion: {e}")
            table = pf.read()
            print("Read raw table successfully")

            hr_data_column = table.column("hr_data")
            print(f"HR data column type: {hr_data_column.type}")

            # Flatten the list-valued rows chunk by chunk. (The previous
            # version always sliced chunk 0 with a column-wide index,
            # which mis-reads any column stored in more than one chunk.)
            hr_data_values = []
            for chunk in hr_data_column.chunks:
                for row in chunk.to_pandas():
                    if row is not None and len(row) > 0:
                        hr_data_values.extend(row)

            return _write_hr_csv(
                hr_data_values, output_file, "No HR data found"
            )

        # If we got here, the pandas conversion worked.
        print("Columns available:")
        print(df.columns.tolist())
        print("\nData shape:", df.shape)

        if "hr_data" not in df.columns:
            print("Error: 'hr_data' column not found in the data")
            print("Available columns:", df.columns.tolist())
            return False

        # Each row holds a list of HR samples; flatten them all.
        hr_values = []
        for hr_data in df["hr_data"]:
            if hr_data is not None and len(hr_data) > 0:
                hr_values.extend(hr_data)

        return _write_hr_csv(
            hr_values, output_file, "No HR values found in the data"
        )

    except Exception as e:
        # Broad catch is deliberate: this is a best-effort CLI tool that
        # reports failure via its return value / exit code.
        print(f"Error: {e}")
        import traceback

        traceback.print_exc()
        return False
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Hard-coded input/output pair for this one-off conversion script.
    parquet_file = "history_20250610_165414.parquet"
    output_file = "history_20250610_165414_HR.csv"

    success = convert_parquet_to_csv(parquet_file, output_file)

    if success:
        # Plain string: the original used an f-string with no placeholders.
        print("\nConversion completed successfully!")
        print(f"Input: {parquet_file}")
        print(f"Output: {output_file}")
    else:
        print("Conversion failed!")
        sys.exit(1)
|
||||
Reference in New Issue
Block a user