Imported libraries
Description to be added later.
from pathlib import Path
import json
from math import floor, ceil
# third-party packages
import numpy as np
import pandas as pd
import h5py
from scipy.stats import norm
Build a histogram from a single column of data
Description to be added later.
def 直方图(col: pd.Series, bin_width: float):
    """Return (heights, centres, probability_density, origin_data), metadata."""
    # Sort so that equal inputs always produce identical output
    # (keeps the function deterministic and the stored origin data reproducible).
    series = np.sort(col.to_numpy())
    # ======== Binning ========
    # bin_min and bin_max are the centres of the smallest and largest bins;
    # centres are kept at integer multiples of the bin width.
    series_min, series_max = series.min(), series.max()
    # floor for the minimum and ceil for the maximum so the outermost edges
    # always cover series_min and series_max; the other way round, boundary
    # samples can fall outside the edges and np.histogram silently drops them.
    bin_min = floor(series_min / bin_width) * bin_width
    bin_max = ceil(series_max / bin_width) * bin_width
    # edge_min and edge_max are the left/right x-axis limits for the plotting software
    edge_min, edge_max = bin_min - bin_width / 2, bin_max + bin_width / 2
    # round() rather than int() avoids an off-by-one when the floating-point
    # division yields e.g. 4.999999... instead of 5
    edges = np.linspace(edge_min, edge_max, round((edge_max - edge_min) / bin_width) + 1)
    centres = edges[:-1] + bin_width / 2
    # ======== Full data set ========
num_count = series.size
    # np.histogram offers density= but no per-bin fraction option, so weight
    # every sample by 1/N to get fractions instead of raw counts
heights, _ = np.histogram(
series, edges, density=False, weights=np.ones((num_count,)) / num_count
)
    # ======== Metadata ========
    bin_count = edges.size - 1
    mean, std = series.mean(), series.std()  # NumPy default: population std (ddof=0)
    # norm.cdf is the cumulative distribution function of a normal distribution
    # fitted to the sample mean/std; differencing it at the bin edges gives the
    # expected probability mass per bin.
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.norm.html
    probability_density = norm.cdf(edges, loc=mean, scale=std)
    probability_density = probability_density[1:] - probability_density[:-1]
metadata = {
"num_count": int(num_count),
"min": int(series_min),
"max": int(series_max),
"mean": round(mean, 1),
"std": round(std, 1),
"edge_min": round(edge_min, 1),
"edge_max": round(edge_max, 1),
"bin_min": int(bin_min),
"bin_max": int(bin_max),
"bin_count": int(bin_count),
"bin_width": int(bin_width),
}
return (heights, centres, probability_density, series), metadata
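Usage example, as a minimal sketch: the synthetic Series, the seed, and bin_width=5 below are illustrative assumptions, not part of the original module.
rng = np.random.default_rng(seed=0)  # fixed seed keeps the demo reproducible
demo = pd.Series(rng.normal(loc=50, scale=10, size=1000).round(), name="demo")
(heights, centres, probability_density, origin), meta = 直方图(demo, bin_width=5)
# With weights=1/N and edges that cover every sample, the bar heights are
# fractions and therefore sum to 1.
assert np.isclose(heights.sum(), 1.0)
print(json.dumps(meta, ensure_ascii=False, indent=2))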
Write the histograms to HDF5
Description to be added later.
def 输出数据直方图(df: pd.DataFrame, bin_width: float, metadata_path: Path, hdf5_path: Path):
    总log = {}  # per-column metadata, written out as JSON at the end
with h5py.File(hdf5_path, "a") as output:
for col_name in df.columns:
result, log = 直方图(df[col_name], bin_width)
总log[col_name] = log
            # https://docs.h5py.org/en/stable/high/group.html#h5py.Group.create_dataset
            # (create_group is documented on the same page)
            # delete any existing group first so reruns overwrite instead of failing
            if col_name in output:
                del output[col_name]
            gp1 = output.create_group(col_name)
            # each dataset gets a vsz_name attribute (presumably the dataset name
            # used when importing into Veusz, judging by the "vsz" prefix)
gp1.create_dataset("heights", data=result[0]).attrs.update(
{"vsz_name": f"{col_name}/heights"}
)
gp1.create_dataset("centres", data=result[1]).attrs.update(
{"vsz_name": f"{col_name}/centres"}
)
gp1.create_dataset("pd", data=result[2]).attrs.update(
{"vsz_name": f"{col_name}/pd"}
)
gp1.create_dataset("origin-data", data=result[3]).attrs.update(
{"vsz_name": f"{col_name}/origin-data"}
)
Path(metadata_path).write_text(
json.dumps(总log, indent=2, ensure_ascii=False), encoding="utf-8"
)
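End-to-end usage sketch: the file names, column names, and random DataFrame are illustrative assumptions; the read-back at the end only shows what the resulting HDF5 layout looks like.
rng = np.random.default_rng(seed=0)
df = pd.DataFrame({
    "height": rng.normal(loc=170, scale=8, size=500).round(),
    "weight": rng.normal(loc=65, scale=12, size=500).round(),
})
输出数据直方图(df, bin_width=5, metadata_path=Path("histograms.json"), hdf5_path=Path("histograms.h5"))
with h5py.File("histograms.h5", "r") as f:
    print(list(f["height"].keys()))               # ['centres', 'heights', 'origin-data', 'pd']
    print(f["height/heights"].attrs["vsz_name"])  # height/heights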