mirror of
https://github.com/pocketpy/pocketpy
synced 2025-10-20 19:40:18 +00:00
615 lines
21 KiB
C++
615 lines
21 KiB
C++
/***************************************************************************
|
|
* Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht *
|
|
* Copyright (c) QuantStack *
|
|
* *
|
|
* Distributed under the terms of the BSD 3-Clause License. *
|
|
* *
|
|
* The full license is in the file LICENSE, distributed with this software. *
|
|
****************************************************************************/
|
|
|
|
/**
|
|
* @brief construct histogram
|
|
*/
|
|
|
|
#ifndef XTENSOR_HISTOGRAM_HPP
|
|
#define XTENSOR_HISTOGRAM_HPP
|
|
|
|
#include "xset_operation.hpp"
|
|
#include "xsort.hpp"
|
|
#include "xtensor.hpp"
|
|
#include "xview.hpp"
|
|
|
|
using namespace xt::placeholders;
|
|
|
|
namespace xt
|
|
{
|
|
/**
|
|
* @ingroup digitize
|
|
* @brief Return the indices of the bins to which each value in input array belongs.
|
|
*
|
|
* @param data The data.
|
|
* @param bin_edges The bin-edges. It has to be 1-dimensional and monotonic.
|
|
* @param right Indicating whether the intervals include the right or the left bin edge.
|
|
* @return Output array of indices, of same shape as x.
|
|
*/
|
|
template <class E1, class E2>
|
|
inline auto digitize(E1&& data, E2&& bin_edges, bool right = false)
|
|
{
|
|
XTENSOR_ASSERT(bin_edges.dimension() == 1);
|
|
XTENSOR_ASSERT(bin_edges.size() >= 2);
|
|
XTENSOR_ASSERT(std::is_sorted(bin_edges.cbegin(), bin_edges.cend()));
|
|
XTENSOR_ASSERT(xt::amin(data)[0] >= bin_edges[0]);
|
|
XTENSOR_ASSERT(xt::amax(data)[0] <= bin_edges[bin_edges.size() - 1]);
|
|
|
|
return xt::searchsorted(std::forward<E2>(bin_edges), std::forward<E1>(data), right);
|
|
}
|
|
|
|
namespace detail
|
|
{
|
|
template <class R = double, class E1, class E2, class E3>
|
|
inline auto histogram_imp(E1&& data, E2&& bin_edges, E3&& weights, bool density, bool equal_bins)
|
|
{
|
|
using size_type = common_size_type_t<std::decay_t<E1>, std::decay_t<E2>, std::decay_t<E3>>;
|
|
using value_type = typename std::decay_t<E3>::value_type;
|
|
|
|
XTENSOR_ASSERT(data.dimension() == 1);
|
|
XTENSOR_ASSERT(weights.dimension() == 1);
|
|
XTENSOR_ASSERT(bin_edges.dimension() == 1);
|
|
XTENSOR_ASSERT(weights.size() == data.size());
|
|
XTENSOR_ASSERT(bin_edges.size() >= 2);
|
|
XTENSOR_ASSERT(std::is_sorted(bin_edges.cbegin(), bin_edges.cend()));
|
|
|
|
size_t n_bins = bin_edges.size() - 1;
|
|
xt::xtensor<value_type, 1> count = xt::zeros<value_type>({n_bins});
|
|
|
|
if (equal_bins)
|
|
{
|
|
std::array<typename std::decay_t<E2>::value_type, 2> bounds = xt::minmax(bin_edges)();
|
|
auto left = static_cast<double>(bounds[0]);
|
|
auto right = static_cast<double>(bounds[1]);
|
|
double norm = 1. / (right - left);
|
|
for (size_t i = 0; i < data.size(); ++i)
|
|
{
|
|
auto v = static_cast<double>(data(i));
|
|
// left and right are not bounds of data
|
|
if (v >= left && v < right)
|
|
{
|
|
auto i_bin = static_cast<size_t>(static_cast<double>(n_bins) * (v - left) * norm);
|
|
count(i_bin) += weights(i);
|
|
}
|
|
else if (v == right)
|
|
{
|
|
count(n_bins - 1) += weights(i);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
auto sorter = xt::argsort(data);
|
|
|
|
size_type ibin = 0;
|
|
|
|
for (auto& idx : sorter)
|
|
{
|
|
auto item = data[idx];
|
|
|
|
if (item < bin_edges[0])
|
|
{
|
|
continue;
|
|
}
|
|
|
|
if (item > bin_edges[n_bins])
|
|
{
|
|
break;
|
|
}
|
|
|
|
while (item >= bin_edges[ibin + 1] && ibin < n_bins - 1)
|
|
{
|
|
++ibin;
|
|
}
|
|
|
|
count[ibin] += weights[idx];
|
|
}
|
|
}
|
|
|
|
xt::xtensor<R, 1> prob = xt::cast<R>(count);
|
|
|
|
if (density)
|
|
{
|
|
R n = static_cast<R>(data.size());
|
|
for (size_type i = 0; i < bin_edges.size() - 1; ++i)
|
|
{
|
|
prob[i] /= (static_cast<R>(bin_edges[i + 1] - bin_edges[i]) * n);
|
|
}
|
|
}
|
|
|
|
return prob;
|
|
}
|
|
|
|
} // detail
|
|
|
|
/**
|
|
* @ingroup histogram
|
|
* @brief Compute the histogram of a set of data.
|
|
*
|
|
* @param data The data.
|
|
* @param bin_edges The bin-edges. It has to be 1-dimensional and monotonic.
|
|
* @param weights Weight factors corresponding to each data-point.
|
|
* @param density If true the resulting integral is normalized to 1. [default: false]
|
|
* @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
|
|
*/
|
|
template <class R = double, class E1, class E2, class E3>
|
|
inline auto histogram(E1&& data, E2&& bin_edges, E3&& weights, bool density = false)
|
|
{
|
|
return detail::histogram_imp<R>(
|
|
std::forward<E1>(data),
|
|
std::forward<E2>(bin_edges),
|
|
std::forward<E3>(weights),
|
|
density,
|
|
false
|
|
);
|
|
}
|
|
|
|
/**
|
|
* @ingroup histogram
|
|
* @brief Compute the histogram of a set of data.
|
|
*
|
|
* @param data The data.
|
|
* @param bin_edges The bin-edges.
|
|
* @param density If true the resulting integral is normalized to 1. [default: false]
|
|
* @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
|
|
*/
|
|
template <class R = double, class E1, class E2>
|
|
inline auto histogram(E1&& data, E2&& bin_edges, bool density = false)
|
|
{
|
|
using value_type = typename std::decay_t<E1>::value_type;
|
|
|
|
auto n = data.size();
|
|
|
|
return detail::histogram_imp<R>(
|
|
std::forward<E1>(data),
|
|
std::forward<E2>(bin_edges),
|
|
xt::ones<value_type>({n}),
|
|
density,
|
|
false
|
|
);
|
|
}
|
|
|
|
/**
|
|
* @ingroup histogram
|
|
* @brief Compute the histogram of a set of data.
|
|
*
|
|
* @param data The data.
|
|
* @param bins The number of bins. [default: 10]
|
|
* @param density If true the resulting integral is normalized to 1. [default: false]
|
|
* @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
|
|
*/
|
|
template <class R = double, class E1>
|
|
inline auto histogram(E1&& data, std::size_t bins = 10, bool density = false)
|
|
{
|
|
using value_type = typename std::decay_t<E1>::value_type;
|
|
|
|
auto n = data.size();
|
|
|
|
return detail::histogram_imp<R>(
|
|
std::forward<E1>(data),
|
|
histogram_bin_edges(data, xt::ones<value_type>({n}), bins),
|
|
xt::ones<value_type>({n}),
|
|
density,
|
|
true
|
|
);
|
|
}
|
|
|
|
/**
|
|
* @ingroup histogram
|
|
* @brief Compute the histogram of a set of data.
|
|
*
|
|
* @param data The data.
|
|
* @param bins The number of bins.
|
|
* @param left The lower-most edge.
|
|
* @param right The upper-most edge.
|
|
* @param density If true the resulting integral is normalized to 1. [default: false]
|
|
* @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
|
|
*/
|
|
template <class R = double, class E1, class E2>
|
|
inline auto histogram(E1&& data, std::size_t bins, E2 left, E2 right, bool density = false)
|
|
{
|
|
using value_type = typename std::decay_t<E1>::value_type;
|
|
|
|
auto n = data.size();
|
|
|
|
return detail::histogram_imp<R>(
|
|
std::forward<E1>(data),
|
|
histogram_bin_edges(data, left, right, bins),
|
|
xt::ones<value_type>({n}),
|
|
density,
|
|
true
|
|
);
|
|
}
|
|
|
|
/**
|
|
* @ingroup histogram
|
|
* @brief Compute the histogram of a set of data.
|
|
*
|
|
* @param data The data.
|
|
* @param bins The number of bins.
|
|
* @param weights Weight factors corresponding to each data-point.
|
|
* @param density If true the resulting integral is normalized to 1. [default: false]
|
|
* @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
|
|
*/
|
|
template <class R = double, class E1, class E2>
|
|
inline auto histogram(E1&& data, std::size_t bins, E2&& weights, bool density = false)
|
|
{
|
|
return detail::histogram_imp<R>(
|
|
std::forward<E1>(data),
|
|
histogram_bin_edges(data, weights, bins),
|
|
std::forward<E2>(weights),
|
|
density,
|
|
true
|
|
);
|
|
}
|
|
|
|
/**
|
|
* @ingroup histogram
|
|
* @brief Compute the histogram of a set of data.
|
|
*
|
|
* @param data The data.
|
|
* @param bins The number of bins.
|
|
* @param left The lower-most edge.
|
|
* @param right The upper-most edge.
|
|
* @param weights Weight factors corresponding to each data-point.
|
|
* @param density If true the resulting integral is normalized to 1. [default: false]
|
|
* @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
|
|
*/
|
|
template <class R = double, class E1, class E2, class E3>
|
|
inline auto histogram(E1&& data, std::size_t bins, E2&& weights, E3 left, E3 right, bool density = false)
|
|
{
|
|
return detail::histogram_imp<R>(
|
|
std::forward<E1>(data),
|
|
histogram_bin_edges(data, weights, left, right, bins),
|
|
std::forward<E2>(weights),
|
|
density,
|
|
true
|
|
);
|
|
}
|
|
|
|
/**
|
|
* @ingroup histogram
|
|
* @brief Defines different algorithms to be used in "histogram_bin_edges"
|
|
*/
|
|
enum class histogram_algorithm
|
|
{
|
|
automatic,
|
|
linspace,
|
|
logspace,
|
|
uniform
|
|
};
|
|
|
|
/**
|
|
* @ingroup histogram
|
|
* @brief Compute the bin-edges of a histogram of a set of data using different algorithms.
|
|
*
|
|
* @param data The data.
|
|
* @param weights Weight factors corresponding to each data-point.
|
|
* @param left The lower-most edge.
|
|
* @param right The upper-most edge.
|
|
* @param bins The number of bins. [default: 10]
|
|
* @param mode The type of algorithm to use. [default: "auto"]
|
|
* @return An one-dimensional xarray<double>, length: bins+1.
|
|
*/
|
|
template <class E1, class E2, class E3>
|
|
inline auto histogram_bin_edges(
|
|
E1&& data,
|
|
E2&& weights,
|
|
E3 left,
|
|
E3 right,
|
|
std::size_t bins = 10,
|
|
histogram_algorithm mode = histogram_algorithm::automatic
|
|
)
|
|
{
|
|
// counter and return type
|
|
using size_type = common_size_type_t<std::decay_t<E1>, std::decay_t<E2>>;
|
|
using value_type = typename std::decay_t<E1>::value_type;
|
|
using weights_type = typename std::decay_t<E2>::value_type;
|
|
|
|
// basic checks
|
|
// - rank
|
|
XTENSOR_ASSERT(data.dimension() == 1);
|
|
XTENSOR_ASSERT(weights.dimension() == 1);
|
|
// - size
|
|
XTENSOR_ASSERT(weights.size() == data.size());
|
|
// - bounds
|
|
XTENSOR_ASSERT(left <= right);
|
|
// - non-empty
|
|
XTENSOR_ASSERT(bins > std::size_t(0));
|
|
|
|
// act on different modes
|
|
switch (mode)
|
|
{
|
|
// bins of equal width
|
|
case histogram_algorithm::automatic:
|
|
{
|
|
xt::xtensor<value_type, 1> bin_edges = xt::linspace<value_type>(left, right, bins + 1);
|
|
return bin_edges;
|
|
}
|
|
|
|
// bins of equal width
|
|
case histogram_algorithm::linspace:
|
|
{
|
|
xt::xtensor<value_type, 1> bin_edges = xt::linspace<value_type>(left, right, bins + 1);
|
|
return bin_edges;
|
|
}
|
|
|
|
// bins of logarithmically increasing size
|
|
case histogram_algorithm::logspace:
|
|
{
|
|
using rhs_value_type = std::conditional_t<xtl::is_integral<value_type>::value, double, value_type>;
|
|
|
|
xtensor<value_type, 1> bin_edges = xt::cast<value_type>(
|
|
xt::logspace<rhs_value_type>(std::log10(left), std::log10(right), bins + 1)
|
|
);
|
|
|
|
// TODO: replace above with below after 'xsimd' fix
|
|
// xt::xtensor<value_type,1> bin_edges = xt::logspace<value_type>(
|
|
// std::log10(left), std::log10(right), bins+1);
|
|
|
|
return bin_edges;
|
|
}
|
|
|
|
// same amount of data in each bin
|
|
case histogram_algorithm::uniform:
|
|
{
|
|
// indices that sort "data"
|
|
auto sorter = xt::argsort(data);
|
|
|
|
// histogram: all of equal 'height'
|
|
// - height
|
|
weights_type w = xt::sum<weights_type>(weights)[0] / static_cast<weights_type>(bins);
|
|
// - apply to all bins
|
|
xt::xtensor<weights_type, 1> count = w * xt::ones<weights_type>({bins});
|
|
|
|
// take cumulative sum, to act as easy look-up
|
|
count = xt::cumsum(count);
|
|
|
|
// edges
|
|
// - allocate
|
|
std::vector<size_t> shape = {bins + 1};
|
|
xt::xtensor<value_type, 1> bin_edges = xtensor<value_type, 1>::from_shape(shape);
|
|
// - first/last edge
|
|
bin_edges[0] = left;
|
|
bin_edges[bins] = right;
|
|
// - cumulative weight
|
|
weights_type cum_weight = static_cast<weights_type>(0);
|
|
// - current bin
|
|
size_type ibin = 0;
|
|
// - loop to find interior bin-edges
|
|
for (size_type i = 0; i < weights.size(); ++i)
|
|
{
|
|
if (cum_weight >= count[ibin])
|
|
{
|
|
bin_edges[ibin + 1] = data[sorter[i]];
|
|
++ibin;
|
|
}
|
|
cum_weight += weights[sorter[i]];
|
|
}
|
|
return bin_edges;
|
|
}
|
|
|
|
// bins of equal width
|
|
default:
|
|
{
|
|
xt::xtensor<value_type, 1> bin_edges = xt::linspace<value_type>(left, right, bins + 1);
|
|
return bin_edges;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* @ingroup histogram
|
|
* @brief Compute the bin-edges of a histogram of a set of data using different algorithms.
|
|
*
|
|
* @param data The data.
|
|
* @param weights Weight factors corresponding to each data-point.
|
|
* @param bins The number of bins. [default: 10]
|
|
* @param mode The type of algorithm to use. [default: "auto"]
|
|
* @return An one-dimensional xarray<double>, length: bins+1.
|
|
*/
|
|
template <class E1, class E2>
|
|
inline auto histogram_bin_edges(
|
|
E1&& data,
|
|
E2&& weights,
|
|
std::size_t bins = 10,
|
|
histogram_algorithm mode = histogram_algorithm::automatic
|
|
)
|
|
{
|
|
using value_type = typename std::decay_t<E1>::value_type;
|
|
std::array<value_type, 2> left_right;
|
|
left_right = xt::minmax(data)();
|
|
|
|
return histogram_bin_edges(
|
|
std::forward<E1>(data),
|
|
std::forward<E2>(weights),
|
|
left_right[0],
|
|
left_right[1],
|
|
bins,
|
|
mode
|
|
);
|
|
}
|
|
|
|
/**
|
|
* @ingroup histogram
|
|
* @brief Compute the bin-edges of a histogram of a set of data using different algorithms.
|
|
*
|
|
* @param data The data.
|
|
* @param bins The number of bins. [default: 10]
|
|
* @param mode The type of algorithm to use. [default: "auto"]
|
|
* @return An one-dimensional xarray<double>, length: bins+1.
|
|
*/
|
|
template <class E1>
|
|
inline auto
|
|
histogram_bin_edges(E1&& data, std::size_t bins = 10, histogram_algorithm mode = histogram_algorithm::automatic)
|
|
{
|
|
using value_type = typename std::decay_t<E1>::value_type;
|
|
|
|
auto n = data.size();
|
|
std::array<value_type, 2> left_right;
|
|
left_right = xt::minmax(data)();
|
|
|
|
return histogram_bin_edges(
|
|
std::forward<E1>(data),
|
|
xt::ones<value_type>({n}),
|
|
left_right[0],
|
|
left_right[1],
|
|
bins,
|
|
mode
|
|
);
|
|
}
|
|
|
|
/**
|
|
* @ingroup histogram
|
|
* @brief Compute the bin-edges of a histogram of a set of data using different algorithms.
|
|
*
|
|
* @param data The data.
|
|
* @param left The lower-most edge.
|
|
* @param right The upper-most edge.
|
|
* @param bins The number of bins. [default: 10]
|
|
* @param mode The type of algorithm to use. [default: "auto"]
|
|
* @return An one-dimensional xarray<double>, length: bins+1.
|
|
*/
|
|
template <class E1, class E2>
|
|
inline auto histogram_bin_edges(
|
|
E1&& data,
|
|
E2 left,
|
|
E2 right,
|
|
std::size_t bins = 10,
|
|
histogram_algorithm mode = histogram_algorithm::automatic
|
|
)
|
|
{
|
|
using value_type = typename std::decay_t<E1>::value_type;
|
|
|
|
auto n = data.size();
|
|
|
|
return histogram_bin_edges(std::forward<E1>(data), xt::ones<value_type>({n}), left, right, bins, mode);
|
|
}
|
|
|
|
/**
|
|
* Count number of occurrences of each value in array of non-negative ints.
|
|
*
|
|
* The number of bins (of size 1) is one larger than the largest value in x.
|
|
* If minlength is specified, there will be at least this number of bins in
|
|
* the output array (though it will be longer if necessary, depending on the
|
|
* contents of x). Each bin gives the number of occurrences of its index
|
|
* value in x. If weights is specified the input array is weighted by it,
|
|
* i.e. if a value ``n`` is found at position ``i``, ``out[n] += weight[i]``
|
|
* instead of ``out[n] += 1``.
|
|
*
|
|
* @param data the 1D container with integers to count into bins
|
|
* @param weights a 1D container with the same number of elements as ``data``
|
|
* @param minlength The minlength
|
|
*
|
|
* @return 1D container with the bincount
|
|
*/
|
|
template <class E1, class E2, XTL_REQUIRES(is_xexpression<std::decay_t<E2>>)>
|
|
inline auto bincount(E1&& data, E2&& weights, std::size_t minlength = 0)
|
|
{
|
|
using result_value_type = typename std::decay_t<E2>::value_type;
|
|
using input_value_type = typename std::decay_t<E1>::value_type;
|
|
using size_type = typename std::decay_t<E1>::size_type;
|
|
|
|
static_assert(
|
|
xtl::is_integral<typename std::decay_t<E1>::value_type>::value,
|
|
"Bincount data has to be integral type."
|
|
);
|
|
XTENSOR_ASSERT(data.dimension() == 1);
|
|
XTENSOR_ASSERT(weights.dimension() == 1);
|
|
|
|
std::array<input_value_type, 2> left_right;
|
|
left_right = xt::minmax(data)();
|
|
|
|
if (left_right[0] < input_value_type(0))
|
|
{
|
|
XTENSOR_THROW(std::runtime_error, "Data argument for bincount can only contain positive integers!");
|
|
}
|
|
|
|
xt::xtensor<result_value_type, 1> res = xt::zeros<result_value_type>(
|
|
{(std::max)(minlength, std::size_t(left_right[1] + 1))}
|
|
);
|
|
|
|
for (size_type i = 0; i < data.size(); ++i)
|
|
{
|
|
res(data(i)) += weights(i);
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
template <class E1>
|
|
inline auto bincount(E1&& data, std::size_t minlength = 0)
|
|
{
|
|
return bincount(
|
|
std::forward<E1>(data),
|
|
xt::ones<typename std::decay_t<E1>::value_type>(data.shape()),
|
|
minlength
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Get the number of items in each bin, given the fraction of items per bin.
|
|
* The output is such that the total number of items of all bins is exactly "N".
|
|
*
|
|
* @param N the number of items to distribute
|
|
* @param weights fraction of items per bin: a 1D container whose size is the number of bins
|
|
*
|
|
* @return 1D container with the number of items per bin
|
|
*/
|
|
template <class E>
|
|
inline xt::xtensor<size_t, 1> bin_items(size_t N, E&& weights)
|
|
{
|
|
if (weights.size() <= std::size_t(1))
|
|
{
|
|
xt::xtensor<size_t, 1> n = N * xt::ones<size_t>({1});
|
|
return n;
|
|
}
|
|
|
|
#ifdef XTENSOR_ENABLE_ASSERT
|
|
using value_type = typename std::decay_t<E>::value_type;
|
|
|
|
XTENSOR_ASSERT(xt::all(weights >= static_cast<value_type>(0)));
|
|
XTENSOR_ASSERT(xt::sum(weights)() > static_cast<value_type>(0));
|
|
#endif
|
|
|
|
xt::xtensor<double, 1> P = xt::cast<double>(weights) / static_cast<double>(xt::sum(weights)());
|
|
xt::xtensor<size_t, 1> n = xt::ceil(static_cast<double>(N) * P);
|
|
|
|
if (xt::sum(n)() == N)
|
|
{
|
|
return n;
|
|
}
|
|
|
|
xt::xtensor<size_t, 1> d = xt::zeros<size_t>(P.shape());
|
|
xt::xtensor<size_t, 1> sorter = xt::argsort(P);
|
|
sorter = xt::view(sorter, xt::range(P.size(), _, -1));
|
|
sorter = xt::view(sorter, xt::range(0, xt::sum(n)(0) - N));
|
|
xt::view(d, xt::keep(sorter)) = 1;
|
|
n -= d;
|
|
|
|
return n;
|
|
}
|
|
|
|
/**
|
|
* Get the number of items in each bin, with each bin having approximately the same number of
|
|
* items in it,under the constraint that the total number of items of all bins is exactly "N".
|
|
*
|
|
* @param N the number of items to distribute
|
|
* @param bins the number of bins
|
|
*
|
|
* @return 1D container with the number of items per bin
|
|
*/
|
|
inline xt::xtensor<size_t, 1> bin_items(size_t N, size_t bins)
|
|
{
|
|
return bin_items(N, xt::ones<double>({bins}));
|
|
}
|
|
}
|
|
|
|
#endif
|