# 引用库 — library imports
#
# 说明以后补 (detailed description to be added later)

from pathlib import Path
import json
from math import floor, ceil

#
import numpy as np
import pandas as pd
import h5py
from scipy.stats import norm

# 根据单列数据绘制直方图 — build a histogram from a single data column
#
# 说明以后补 (detailed description to be added later)

def 直方图(col: pd.Series, bin_width):
    """Compute a fraction-normalised histogram of a single numeric column.

    Bin centres are aligned to integer multiples of ``bin_width`` so that
    repeated runs over similar data produce comparable axes.

    Parameters
    ----------
    col : pd.Series
        Numeric data column.
    bin_width : int or float
        Width of each histogram bin; must be > 0.

    Returns
    -------
    (heights, centres, probability_density, series), metadata
        heights: fraction of samples per bin (sums to 1).
        centres: bin-centre x positions, one per bin.
        probability_density: per-bin probability mass of a normal
            distribution fitted with the sample mean/std.
        series: the sorted input values.
        metadata: dict of summary statistics. NOTE: min/max/bin values are
            cast to ``int`` — assumes integer-valued data; TODO confirm.
    """
    # Sorting guarantees deterministic output for identical input data.
    series = np.sort(col.to_numpy())
    # ======== bin layout ========
    # bin_min / bin_max are the centres of the first and last bins; every
    # centre is an integer multiple of bin_width.
    series_min, series_max = series.min(), series.max()
    bin_min = ceil(series_min / bin_width) * bin_width
    bin_max = floor(series_max / bin_width) * bin_width
    # Bug fix: for non-integer data the ceil/floor rounding above can place
    # the outer edges *inside* the data range, and np.histogram silently
    # drops out-of-range samples (fractions then no longer sum to 1).
    # Widen by one bin on the affected side when that happens.
    if bin_min - bin_width / 2 > series_min:
        bin_min -= bin_width
    if bin_max + bin_width / 2 < series_max:
        bin_max += bin_width
    # Degenerate case (all data between two adjacent centres, or a single
    # value on a centre boundary): fall back to a single bin.
    if bin_max < bin_min:
        bin_max = bin_min
    # edge_min / edge_max are the x-axis limits for the plotting software.
    edge_min, edge_max = bin_min - bin_width / 2, bin_max + bin_width / 2
    # round() instead of int() guards against float truncation,
    # e.g. 2.9999999 -> 2 would silently lose a bin.
    bin_count = round((bin_max - bin_min) / bin_width) + 1
    edges = np.linspace(edge_min, edge_max, bin_count + 1)
    centres = edges[:-1] + bin_width / 2
    # ======== histogram over the whole data set ========
    num_count = series.size
    # np.histogram only offers density normalisation, not fractions;
    # uniform weights of 1/num_count turn raw counts into fractions.
    heights, _ = np.histogram(
        series, edges, density=False, weights=np.ones((num_count,)) / num_count
    )
    # ======== summary log ========
    mean, std = series.mean(), series.std()  # population std (ddof=0)
    # Per-bin probability mass of the fitted normal distribution: the
    # difference of the CDF evaluated at consecutive bin edges.
    probability_density = norm.cdf(edges, loc=mean, scale=std)
    probability_density = probability_density[1:] - probability_density[:-1]
    metadata = {
        "num_count": int(num_count),
        "min": int(series_min),
        "max": int(series_max),
        "mean": round(mean, 1),
        "std": round(std, 1),
        "edge_min": round(edge_min, 1),
        "edge_max": round(edge_max, 1),
        "bin_min": int(bin_min),
        "bin_max": int(bin_max),
        "bin_count": int(bin_count),
        "bin_width": int(bin_width),
    }
    return (heights, centres, probability_density, series), metadata

# 输出直方图到hdf5 — write histograms out to HDF5
#
# 说明以后补 (detailed description to be added later)

def 输出数据直方图(df: pd.DataFrame, bin_width, metadata_path: Path, hdf5_path: Path):
    """Write a histogram for every column of *df* into an HDF5 file and dump
    the per-column metadata as JSON.

    Each column becomes one HDF5 group holding four datasets (heights,
    centres, pd, origin-data); an existing group with the same name is
    replaced. Every dataset carries a ``vsz_name`` attribute identifying it
    for the plotting software.
    """
    all_metadata = {}
    dataset_names = ("heights", "centres", "pd", "origin-data")
    with h5py.File(hdf5_path, "a") as output:
        for col_name in df.columns:
            arrays, col_metadata = 直方图(df[col_name], bin_width)
            all_metadata[col_name] = col_metadata
            # https://docs.h5py.org/en/stable/high/group.html#h5py.Group.create_dataset
            # (see create_group on the same page)
            # Drop any stale group first so re-running overwrites cleanly.
            if col_name in output.keys():
                del output[col_name]
            group = output.create_group(col_name)
            for ds_name, data in zip(dataset_names, arrays):
                ds = group.create_dataset(ds_name, data=data)
                ds.attrs.update({"vsz_name": f"{col_name}/{ds_name}"})
    Path(metadata_path).write_text(
        json.dumps(all_metadata, indent=2, ensure_ascii=False), encoding="utf-8"
    )