Aggregate Histogram
From PostgreSQL wiki
Jump to navigationJump to searchA histogram represents the distribution of a set of values.
histogram()
Aggregate Histogram
Works with PostgreSQL
Any version
Written in
PLPGSQL
Depends on
Nothing
We write a function which runs width_bucket on each value to be aggregated and uses that to increment the corresponding bucket. The state is stored as an array of integers. The array has an element with index zero holding the count of values < MIN and an element with index (nbuckets+1) holding the count of values >= MAX.
This aggregate should work for PostgreSQL > 9.0, but it was only tested with PostgreSQL 9.6.
CREATE OR REPLACE FUNCTION hist_sfunc (state INTEGER[], val DOUBLE PRECISION,
MIN DOUBLE PRECISION, MAX DOUBLE PRECISION, nbuckets INTEGER) RETURNS INTEGER[] AS $$
DECLARE
bucket INTEGER;
i INTEGER;
BEGIN
-- Do nothing if val is NULL
IF val IS NULL THEN
RETURN state;
END IF;
-- This will put values in buckets with a 0 bucket for <MIN and a (nbuckets+1) bucket for >=MAX
bucket := width_bucket(val, MIN, MAX, nbuckets);
-- Init the array with the correct number of 0's so the caller doesn't see NULLs
IF state[0] IS NULL THEN
state := array_fill(0,ARRAY[nbuckets+2],ARRAY[0]);
END IF;
state[bucket] := state[bucket] + 1;
RETURN state;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- Tell Postgres how to use the new function
DROP AGGREGATE IF EXISTS histogram (DOUBLE PRECISION, DOUBLE PRECISION, DOUBLE PRECISION, INTEGER);
CREATE AGGREGATE histogram (val DOUBLE PRECISION, min DOUBLE PRECISION, max DOUBLE PRECISION, nbuckets INTEGER) (
SFUNC = hist_sfunc,
STYPE = INTEGER[],
PARALLEL = SAFE -- Remove line for compatibility with Postgresql < 9.6
);
Helper functions
We can also define some helper functions that give the midpoints, breaks and ranges of the buckets in the histogram. These functions require PostgreSQL >= 9.5.
CREATE OR REPLACE FUNCTION histogram_ranges(MIN DOUBLE PRECISION, MAX DOUBLE PRECISION, nbuckets INTEGER)
RETURNS numrange[] AS
$$
DECLARE
res numrange[];
BEGIN
res := array_agg(numrange(l,u,'[)')) FROM
(SELECT generate_series(MIN::numeric,(MAX-(MAX-MIN)/nbuckets)::numeric,((MAX-MIN)/nbuckets)::numeric) AS l,
generate_series((MIN+(MAX-MIN)/nbuckets)::numeric,MAX::numeric,((MAX-MIN)/nbuckets)::numeric) AS u) t;
res[0] := numrange(NULL,MIN::numeric,'[)');
res[nbuckets+1] := numrange(MAX::numeric,NULL,'[)');
RETURN res;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
CREATE OR REPLACE FUNCTION histogram_breaks(MIN DOUBLE PRECISION, MAX DOUBLE PRECISION, nbuckets INTEGER)
RETURNS DOUBLE PRECISION[] AS
$$
SELECT array(SELECT generate_series(MIN::numeric,MAX::numeric,((MAX-MIN)/nbuckets)::numeric)::DOUBLE PRECISION)
;
$$ LANGUAGE sql IMMUTABLE;
CREATE OR REPLACE FUNCTION histogram_mids(MIN DOUBLE PRECISION, MAX DOUBLE PRECISION, nbuckets INTEGER)
RETURNS DOUBLE PRECISION[] AS
$$
SELECT array(SELECT generate_series((MIN + 0.5*((MAX-MIN)/nbuckets))::numeric,
MAX::numeric,
((MAX-MIN)/nbuckets)::numeric)::DOUBLE PRECISION);
$$ LANGUAGE sql IMMUTABLE;
Example usage
WITH a AS (
SELECT generate_series(-2,5,0.5) AS i
)
SELECT array_agg(i) AS values,
histogram_mids(0,3,3) AS mids,
histogram_breaks(0,3,3) AS breaks,
histogram_ranges(0,3,3) AS ranges,
histogram(i,0,3,3) AS counts,
(histogram_ranges(0,3,3))[1:3] AS ranges_in_limits,
(histogram(i,0,3,3))[1:3] AS counts_in_limits
FROM a;
Caution
Returns NULL if the argument is a column of NULL values.