/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * * Copyright (c) QuantStack * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ /** * @brief construct histogram */ #ifndef XTENSOR_HISTOGRAM_HPP #define XTENSOR_HISTOGRAM_HPP #include "xset_operation.hpp" #include "xsort.hpp" #include "xtensor.hpp" #include "xview.hpp" using namespace xt::placeholders; namespace xt { /** * @ingroup digitize * @brief Return the indices of the bins to which each value in input array belongs. * * @param data The data. * @param bin_edges The bin-edges. It has to be 1-dimensional and monotonic. * @param right Indicating whether the intervals include the right or the left bin edge. * @return Output array of indices, of same shape as x. */ template inline auto digitize(E1&& data, E2&& bin_edges, bool right = false) { XTENSOR_ASSERT(bin_edges.dimension() == 1); XTENSOR_ASSERT(bin_edges.size() >= 2); XTENSOR_ASSERT(std::is_sorted(bin_edges.cbegin(), bin_edges.cend())); XTENSOR_ASSERT(xt::amin(data)[0] >= bin_edges[0]); XTENSOR_ASSERT(xt::amax(data)[0] <= bin_edges[bin_edges.size() - 1]); return xt::searchsorted(std::forward(bin_edges), std::forward(data), right); } namespace detail { template inline auto histogram_imp(E1&& data, E2&& bin_edges, E3&& weights, bool density, bool equal_bins) { using size_type = common_size_type_t, std::decay_t, std::decay_t>; using value_type = typename std::decay_t::value_type; XTENSOR_ASSERT(data.dimension() == 1); XTENSOR_ASSERT(weights.dimension() == 1); XTENSOR_ASSERT(bin_edges.dimension() == 1); XTENSOR_ASSERT(weights.size() == data.size()); XTENSOR_ASSERT(bin_edges.size() >= 2); XTENSOR_ASSERT(std::is_sorted(bin_edges.cbegin(), bin_edges.cend())); size_t n_bins = bin_edges.size() - 1; xt::xtensor count = xt::zeros({n_bins}); if (equal_bins) { std::array::value_type, 2> bounds = xt::minmax(bin_edges)(); auto left = static_cast(bounds[0]); auto right = static_cast(bounds[1]); double norm = 1. / (right - left); for (size_t i = 0; i < data.size(); ++i) { auto v = static_cast(data(i)); // left and right are not bounds of data if (v >= left && v < right) { auto i_bin = static_cast(static_cast(n_bins) * (v - left) * norm); count(i_bin) += weights(i); } else if (v == right) { count(n_bins - 1) += weights(i); } } } else { auto sorter = xt::argsort(data); size_type ibin = 0; for (auto& idx : sorter) { auto item = data[idx]; if (item < bin_edges[0]) { continue; } if (item > bin_edges[n_bins]) { break; } while (item >= bin_edges[ibin + 1] && ibin < n_bins - 1) { ++ibin; } count[ibin] += weights[idx]; } } xt::xtensor prob = xt::cast(count); if (density) { R n = static_cast(data.size()); for (size_type i = 0; i < bin_edges.size() - 1; ++i) { prob[i] /= (static_cast(bin_edges[i + 1] - bin_edges[i]) * n); } } return prob; } } // detail /** * @ingroup histogram * @brief Compute the histogram of a set of data. * * @param data The data. * @param bin_edges The bin-edges. It has to be 1-dimensional and monotonic. * @param weights Weight factors corresponding to each data-point. * @param density If true the resulting integral is normalized to 1. [default: false] * @return An one-dimensional xarray, length: bin_edges.size()-1. */ template inline auto histogram(E1&& data, E2&& bin_edges, E3&& weights, bool density = false) { return detail::histogram_imp( std::forward(data), std::forward(bin_edges), std::forward(weights), density, false ); } /** * @ingroup histogram * @brief Compute the histogram of a set of data. * * @param data The data. * @param bin_edges The bin-edges. * @param density If true the resulting integral is normalized to 1. [default: false] * @return An one-dimensional xarray, length: bin_edges.size()-1. */ template inline auto histogram(E1&& data, E2&& bin_edges, bool density = false) { using value_type = typename std::decay_t::value_type; auto n = data.size(); return detail::histogram_imp( std::forward(data), std::forward(bin_edges), xt::ones({n}), density, false ); } /** * @ingroup histogram * @brief Compute the histogram of a set of data. * * @param data The data. * @param bins The number of bins. [default: 10] * @param density If true the resulting integral is normalized to 1. [default: false] * @return An one-dimensional xarray, length: bin_edges.size()-1. */ template inline auto histogram(E1&& data, std::size_t bins = 10, bool density = false) { using value_type = typename std::decay_t::value_type; auto n = data.size(); return detail::histogram_imp( std::forward(data), histogram_bin_edges(data, xt::ones({n}), bins), xt::ones({n}), density, true ); } /** * @ingroup histogram * @brief Compute the histogram of a set of data. * * @param data The data. * @param bins The number of bins. * @param left The lower-most edge. * @param right The upper-most edge. * @param density If true the resulting integral is normalized to 1. [default: false] * @return An one-dimensional xarray, length: bin_edges.size()-1. */ template inline auto histogram(E1&& data, std::size_t bins, E2 left, E2 right, bool density = false) { using value_type = typename std::decay_t::value_type; auto n = data.size(); return detail::histogram_imp( std::forward(data), histogram_bin_edges(data, left, right, bins), xt::ones({n}), density, true ); } /** * @ingroup histogram * @brief Compute the histogram of a set of data. * * @param data The data. * @param bins The number of bins. * @param weights Weight factors corresponding to each data-point. * @param density If true the resulting integral is normalized to 1. [default: false] * @return An one-dimensional xarray, length: bin_edges.size()-1. */ template inline auto histogram(E1&& data, std::size_t bins, E2&& weights, bool density = false) { return detail::histogram_imp( std::forward(data), histogram_bin_edges(data, weights, bins), std::forward(weights), density, true ); } /** * @ingroup histogram * @brief Compute the histogram of a set of data. * * @param data The data. * @param bins The number of bins. * @param left The lower-most edge. * @param right The upper-most edge. * @param weights Weight factors corresponding to each data-point. * @param density If true the resulting integral is normalized to 1. [default: false] * @return An one-dimensional xarray, length: bin_edges.size()-1. */ template inline auto histogram(E1&& data, std::size_t bins, E2&& weights, E3 left, E3 right, bool density = false) { return detail::histogram_imp( std::forward(data), histogram_bin_edges(data, weights, left, right, bins), std::forward(weights), density, true ); } /** * @ingroup histogram * @brief Defines different algorithms to be used in "histogram_bin_edges" */ enum class histogram_algorithm { automatic, linspace, logspace, uniform }; /** * @ingroup histogram * @brief Compute the bin-edges of a histogram of a set of data using different algorithms. * * @param data The data. * @param weights Weight factors corresponding to each data-point. * @param left The lower-most edge. * @param right The upper-most edge. * @param bins The number of bins. [default: 10] * @param mode The type of algorithm to use. [default: "auto"] * @return An one-dimensional xarray, length: bins+1. */ template inline auto histogram_bin_edges( E1&& data, E2&& weights, E3 left, E3 right, std::size_t bins = 10, histogram_algorithm mode = histogram_algorithm::automatic ) { // counter and return type using size_type = common_size_type_t, std::decay_t>; using value_type = typename std::decay_t::value_type; using weights_type = typename std::decay_t::value_type; // basic checks // - rank XTENSOR_ASSERT(data.dimension() == 1); XTENSOR_ASSERT(weights.dimension() == 1); // - size XTENSOR_ASSERT(weights.size() == data.size()); // - bounds XTENSOR_ASSERT(left <= right); // - non-empty XTENSOR_ASSERT(bins > std::size_t(0)); // act on different modes switch (mode) { // bins of equal width case histogram_algorithm::automatic: { xt::xtensor bin_edges = xt::linspace(left, right, bins + 1); return bin_edges; } // bins of equal width case histogram_algorithm::linspace: { xt::xtensor bin_edges = xt::linspace(left, right, bins + 1); return bin_edges; } // bins of logarithmically increasing size case histogram_algorithm::logspace: { using rhs_value_type = std::conditional_t::value, double, value_type>; xtensor bin_edges = xt::cast( xt::logspace(std::log10(left), std::log10(right), bins + 1) ); // TODO: replace above with below after 'xsimd' fix // xt::xtensor bin_edges = xt::logspace( // std::log10(left), std::log10(right), bins+1); return bin_edges; } // same amount of data in each bin case histogram_algorithm::uniform: { // indices that sort "data" auto sorter = xt::argsort(data); // histogram: all of equal 'height' // - height weights_type w = xt::sum(weights)[0] / static_cast(bins); // - apply to all bins xt::xtensor count = w * xt::ones({bins}); // take cumulative sum, to act as easy look-up count = xt::cumsum(count); // edges // - allocate std::vector shape = {bins + 1}; xt::xtensor bin_edges = xtensor::from_shape(shape); // - first/last edge bin_edges[0] = left; bin_edges[bins] = right; // - cumulative weight weights_type cum_weight = static_cast(0); // - current bin size_type ibin = 0; // - loop to find interior bin-edges for (size_type i = 0; i < weights.size(); ++i) { if (cum_weight >= count[ibin]) { bin_edges[ibin + 1] = data[sorter[i]]; ++ibin; } cum_weight += weights[sorter[i]]; } return bin_edges; } // bins of equal width default: { xt::xtensor bin_edges = xt::linspace(left, right, bins + 1); return bin_edges; } } } /** * @ingroup histogram * @brief Compute the bin-edges of a histogram of a set of data using different algorithms. * * @param data The data. * @param weights Weight factors corresponding to each data-point. * @param bins The number of bins. [default: 10] * @param mode The type of algorithm to use. [default: "auto"] * @return An one-dimensional xarray, length: bins+1. */ template inline auto histogram_bin_edges( E1&& data, E2&& weights, std::size_t bins = 10, histogram_algorithm mode = histogram_algorithm::automatic ) { using value_type = typename std::decay_t::value_type; std::array left_right; left_right = xt::minmax(data)(); return histogram_bin_edges( std::forward(data), std::forward(weights), left_right[0], left_right[1], bins, mode ); } /** * @ingroup histogram * @brief Compute the bin-edges of a histogram of a set of data using different algorithms. * * @param data The data. * @param bins The number of bins. [default: 10] * @param mode The type of algorithm to use. [default: "auto"] * @return An one-dimensional xarray, length: bins+1. */ template inline auto histogram_bin_edges(E1&& data, std::size_t bins = 10, histogram_algorithm mode = histogram_algorithm::automatic) { using value_type = typename std::decay_t::value_type; auto n = data.size(); std::array left_right; left_right = xt::minmax(data)(); return histogram_bin_edges( std::forward(data), xt::ones({n}), left_right[0], left_right[1], bins, mode ); } /** * @ingroup histogram * @brief Compute the bin-edges of a histogram of a set of data using different algorithms. * * @param data The data. * @param left The lower-most edge. * @param right The upper-most edge. * @param bins The number of bins. [default: 10] * @param mode The type of algorithm to use. [default: "auto"] * @return An one-dimensional xarray, length: bins+1. */ template inline auto histogram_bin_edges( E1&& data, E2 left, E2 right, std::size_t bins = 10, histogram_algorithm mode = histogram_algorithm::automatic ) { using value_type = typename std::decay_t::value_type; auto n = data.size(); return histogram_bin_edges(std::forward(data), xt::ones({n}), left, right, bins, mode); } /** * Count number of occurrences of each value in array of non-negative ints. * * The number of bins (of size 1) is one larger than the largest value in x. * If minlength is specified, there will be at least this number of bins in * the output array (though it will be longer if necessary, depending on the * contents of x). Each bin gives the number of occurrences of its index * value in x. If weights is specified the input array is weighted by it, * i.e. if a value ``n`` is found at position ``i``, ``out[n] += weight[i]`` * instead of ``out[n] += 1``. * * @param data the 1D container with integers to count into bins * @param weights a 1D container with the same number of elements as ``data`` * @param minlength The minlength * * @return 1D container with the bincount */ template >)> inline auto bincount(E1&& data, E2&& weights, std::size_t minlength = 0) { using result_value_type = typename std::decay_t::value_type; using input_value_type = typename std::decay_t::value_type; using size_type = typename std::decay_t::size_type; static_assert( xtl::is_integral::value_type>::value, "Bincount data has to be integral type." ); XTENSOR_ASSERT(data.dimension() == 1); XTENSOR_ASSERT(weights.dimension() == 1); std::array left_right; left_right = xt::minmax(data)(); if (left_right[0] < input_value_type(0)) { XTENSOR_THROW(std::runtime_error, "Data argument for bincount can only contain positive integers!"); } xt::xtensor res = xt::zeros( {(std::max)(minlength, std::size_t(left_right[1] + 1))} ); for (size_type i = 0; i < data.size(); ++i) { res(data(i)) += weights(i); } return res; } template inline auto bincount(E1&& data, std::size_t minlength = 0) { return bincount( std::forward(data), xt::ones::value_type>(data.shape()), minlength ); } /** * Get the number of items in each bin, given the fraction of items per bin. * The output is such that the total number of items of all bins is exactly "N". * * @param N the number of items to distribute * @param weights fraction of items per bin: a 1D container whose size is the number of bins * * @return 1D container with the number of items per bin */ template inline xt::xtensor bin_items(size_t N, E&& weights) { if (weights.size() <= std::size_t(1)) { xt::xtensor n = N * xt::ones({1}); return n; } #ifdef XTENSOR_ENABLE_ASSERT using value_type = typename std::decay_t::value_type; XTENSOR_ASSERT(xt::all(weights >= static_cast(0))); XTENSOR_ASSERT(xt::sum(weights)() > static_cast(0)); #endif xt::xtensor P = xt::cast(weights) / static_cast(xt::sum(weights)()); xt::xtensor n = xt::ceil(static_cast(N) * P); if (xt::sum(n)() == N) { return n; } xt::xtensor d = xt::zeros(P.shape()); xt::xtensor sorter = xt::argsort(P); sorter = xt::view(sorter, xt::range(P.size(), _, -1)); sorter = xt::view(sorter, xt::range(0, xt::sum(n)(0) - N)); xt::view(d, xt::keep(sorter)) = 1; n -= d; return n; } /** * Get the number of items in each bin, with each bin having approximately the same number of * items in it,under the constraint that the total number of items of all bins is exactly "N". * * @param N the number of items to distribute * @param bins the number of bins * * @return 1D container with the number of items per bin */ inline xt::xtensor bin_items(size_t N, size_t bins) { return bin_items(N, xt::ones({bins})); } } #endif