#!/usr/bin/env python3
|
|
"""
|
|
Convert parquet file to CSV with only HR field
|
|
"""
|
|
|
|
import pyarrow.parquet as pq
|
|
import pyarrow as pa
|
|
import pandas as pd
|
|
import sys
|
|
|
|
|
|
def convert_parquet_to_csv(parquet_file, output_file):
    """Extract the ``hr_data`` column from a parquet file and write it as CSV.

    The output CSV has a single ``HR`` column containing every HR sample
    flattened out of the (list-valued) ``hr_data`` column.

    Args:
        parquet_file: Path to the input parquet file.
        output_file: Path where the HR-only CSV is written.

    Returns:
        True when at least one HR value was extracted and written,
        False otherwise (missing column, empty data, or any read error).
    """
    try:
        print(f"Reading {parquet_file}...")
        pf = pq.ParquetFile(parquet_file)

        print("Schema:")
        print(pf.schema)

        try:
            # Preferred path: read only the column we need and let
            # pyarrow convert it to pandas.
            table = pf.read(columns=["hr_data"])
            print("Successfully read hr_data column")

            df = table.to_pandas()
            print("Converted to pandas DataFrame")
        except Exception as e:
            # Fallback: the nested list structure of hr_data can break the
            # pandas conversion; read the raw table and flatten the column
            # directly with pyarrow instead.
            print(f"Error reading with pandas conversion: {e}")
            table = pf.read()
            print("Read raw table successfully")

            hr_data_column = table.column("hr_data")
            print(f"HR data column type: {hr_data_column.type}")

            # Flatten all (possibly multi-chunk) list values in one pass.
            # NOTE: the previous implementation sliced chunk(0) with indices
            # spanning the whole column — wrong for multi-chunk columns and
            # O(n^2); to_pylist() handles chunking correctly.
            hr_data_values = []
            for row in hr_data_column.to_pylist():
                if row:  # skip None / empty lists
                    hr_data_values.extend(row)

            return _save_hr_csv(hr_data_values, output_file)

        # Pandas conversion succeeded.
        print("Columns available:")
        print(df.columns.tolist())
        print("\nData shape:", df.shape)

        if "hr_data" not in df.columns:
            print("Error: 'hr_data' column not found in the data")
            print("Available columns:", df.columns.tolist())
            return False

        # Each cell is expected to hold a list/array of HR samples
        # (presumably numeric — TODO confirm against the writer's schema).
        hr_values = []
        for hr_data in df["hr_data"]:
            if hr_data is not None and len(hr_data) > 0:
                hr_values.extend(hr_data)

        return _save_hr_csv(hr_values, output_file)

    except Exception as e:
        # Script-level boundary: report the failure and signal it to the
        # caller instead of crashing.
        print(f"Error: {e}")
        import traceback

        traceback.print_exc()
        return False


def _save_hr_csv(hr_values, output_file):
    """Write flattened HR values to *output_file*; return True on success."""
    if not hr_values:
        print("No HR values found in the data")
        return False

    hr_df = pd.DataFrame({"HR": hr_values})

    print(f"\nExtracted {len(hr_values)} HR values")
    print("Sample HR values:")
    print(hr_df.head(10))

    hr_df.to_csv(output_file, index=False)
    print(f"\nSaved HR data to {output_file}")
    return True
|
|
|
|
|
|
if __name__ == "__main__":
    # Optional CLI overrides: script.py [input.parquet [output.csv]].
    # With no arguments the original hard-coded filenames are used, so
    # existing invocations behave exactly as before.
    parquet_file = sys.argv[1] if len(sys.argv) > 1 else "history_20250610_165414.parquet"
    output_file = sys.argv[2] if len(sys.argv) > 2 else "history_20250610_165414_HR.csv"

    success = convert_parquet_to_csv(parquet_file, output_file)

    if success:
        print("\nConversion completed successfully!")
        print(f"Input: {parquet_file}")
        print(f"Output: {output_file}")
    else:
        # Non-zero exit status so shell pipelines can detect the failure.
        print("Conversion failed!")
        sys.exit(1)