DevDocsDev Docs
SQL Fundamentals

Aggregations & GROUP BY

Summarize data with aggregate functions, GROUP BY clauses, and HAVING filters

Aggregations transform multiple rows into summary values. They are essential for reporting, analytics, and data analysis.

How Aggregation Works

Aggregate Functions

-- Count all rows
SELECT COUNT(*) AS total_orders FROM orders;

-- Count non-NULL values
SELECT COUNT(phone) AS customers_with_phone FROM customers;

-- Count distinct values
SELECT COUNT(DISTINCT category_id) AS unique_categories FROM products;

-- Count with condition
SELECT COUNT(*) FILTER (WHERE status = 'active') AS active_count -- PostgreSQL
FROM users;

SELECT COUNT(CASE WHEN status = 'active' THEN 1 END) AS active_count -- All DBs
FROM users;
FunctionDescriptionNULL Handling
COUNT(*)Count all rowsIncludes NULLs
COUNT(column)Count non-NULL valuesExcludes NULLs
COUNT(DISTINCT column)Count unique valuesExcludes NULLs
-- Total of a column
SELECT SUM(total) AS total_revenue FROM orders;

-- Sum with condition
SELECT 
    SUM(total) AS total_revenue,
    SUM(CASE WHEN status = 'completed' THEN total ELSE 0 END) AS completed_revenue
FROM orders;

-- Sum of expression
SELECT SUM(quantity * price) AS total_value FROM order_items;

NULL in SUM

SUM ignores NULL values. If all values are NULL, the result is NULL (not 0). Use COALESCE(SUM(column), 0) to default to 0.

-- Average value
SELECT AVG(price) AS avg_price FROM products;

-- Average with precision
SELECT ROUND(AVG(price), 2) AS avg_price FROM products;

-- Weighted average
SELECT 
    SUM(price * quantity) / SUM(quantity) AS weighted_avg_price
FROM order_items;

AVG and Integers

In some databases, AVG of integers returns an integer. Cast to decimal for precision: AVG(CAST(quantity AS DECIMAL)) or AVG(quantity::DECIMAL) in PostgreSQL.

-- Minimum and maximum
SELECT 
    MIN(price) AS cheapest,
    MAX(price) AS most_expensive
FROM products;

-- For dates
SELECT 
    MIN(created_at) AS first_order,
    MAX(created_at) AS last_order
FROM orders;

-- For strings (alphabetical)
SELECT 
    MIN(name) AS first_alphabetically,
    MAX(name) AS last_alphabetically
FROM products;
-- PostgreSQL: STRING_AGG
SELECT 
    customer_id,
    STRING_AGG(product_name, ', ') AS products_ordered
FROM order_items
GROUP BY customer_id;

-- MySQL: GROUP_CONCAT
SELECT 
    customer_id,
    GROUP_CONCAT(product_name SEPARATOR ', ') AS products_ordered
FROM order_items
GROUP BY customer_id;

-- SQL Server: STRING_AGG (2017+)
SELECT 
    customer_id,
    STRING_AGG(product_name, ', ') AS products_ordered
FROM order_items
GROUP BY customer_id;

-- With ordering
SELECT 
    customer_id,
    STRING_AGG(product_name, ', ' ORDER BY product_name) AS products_ordered -- PostgreSQL
FROM order_items
GROUP BY customer_id;

GROUP BY

GROUP BY divides rows into groups and applies aggregate functions to each group.

-- Count orders per customer
SELECT 
    customer_id,
    COUNT(*) AS order_count,
    SUM(total) AS total_spent
FROM orders
GROUP BY customer_id;
customer_idorder_counttotal_spent
151500.00
23800.00
382200.00
-- Sales by category and year
SELECT 
    category_id,
    EXTRACT(YEAR FROM created_at) AS year,
    COUNT(*) AS order_count,
    SUM(total) AS revenue
FROM orders o
JOIN order_items oi ON o.id = oi.order_id
GROUP BY category_id, EXTRACT(YEAR FROM created_at)
ORDER BY category_id, year;
category_idyearorder_countrevenue
1202315045000.00
1202420062000.00
220238024000.00
2202412038000.00
-- Group by expression
SELECT 
    CASE 
        WHEN price < 100 THEN 'Budget'
        WHEN price < 500 THEN 'Mid-range'
        ELSE 'Premium'
    END AS price_tier,
    COUNT(*) AS product_count,
    ROUND(AVG(price), 2) AS avg_price
FROM products
GROUP BY 
    CASE 
        WHEN price < 100 THEN 'Budget'
        WHEN price < 500 THEN 'Mid-range'
        ELSE 'Premium'
    END;
-- Monthly revenue
SELECT 
    DATE_TRUNC('month', created_at) AS month,  -- PostgreSQL
    -- DATE_FORMAT(created_at, '%Y-%m-01') AS month,  -- MySQL
    COUNT(*) AS order_count,
    SUM(total) AS revenue
FROM orders
GROUP BY DATE_TRUNC('month', created_at)
ORDER BY month;

-- Daily, weekly, monthly options
SELECT 
    DATE_TRUNC('day', created_at) AS day,
    DATE_TRUNC('week', created_at) AS week,
    DATE_TRUNC('month', created_at) AS month,
    COUNT(*)
FROM orders
GROUP BY 
    DATE_TRUNC('day', created_at),
    DATE_TRUNC('week', created_at),
    DATE_TRUNC('month', created_at);

HAVING vs WHERE

ClausePurposeOperates OnAggregates
WHEREFilter rows before groupingIndividual rows❌ Cannot use
HAVINGFilter groups after groupingGroups✅ Can use
-- ❌ ERROR: Can't use aggregate in WHERE
SELECT customer_id, COUNT(*)
FROM orders
WHERE COUNT(*) > 5  -- ERROR!
GROUP BY customer_id;

-- ✅ CORRECT: Use HAVING for aggregate conditions
SELECT customer_id, COUNT(*)
FROM orders
GROUP BY customer_id
HAVING COUNT(*) > 5;
-- WHERE filters BEFORE grouping
SELECT 
    customer_id,
    COUNT(*) AS order_count,
    SUM(total) AS total_spent
FROM orders
WHERE status = 'completed'  -- Only count completed orders
GROUP BY customer_id;

Filter applied: Only rows with status = 'completed' are included in groups.

-- HAVING filters AFTER grouping
SELECT 
    customer_id,
    COUNT(*) AS order_count,
    SUM(total) AS total_spent
FROM orders
GROUP BY customer_id
HAVING COUNT(*) >= 5  -- Only show customers with 5+ orders
   AND SUM(total) > 1000;  -- AND spent over $1000

Filter applied: Groups (customers) are removed if they don't meet the criteria.

-- Use both for complex filtering
SELECT 
    category_id,
    COUNT(*) AS product_count,
    AVG(price) AS avg_price
FROM products
WHERE stock > 0          -- Only in-stock products (row filter)
GROUP BY category_id
HAVING COUNT(*) >= 5     -- Categories with 5+ products (group filter)
   AND AVG(price) > 100  -- And average price over $100
ORDER BY avg_price DESC;

Advanced Aggregation

ROLLUP, CUBE, and GROUPING SETS

-- ROLLUP: Hierarchical subtotals
SELECT 
    COALESCE(region, 'ALL REGIONS') AS region,
    COALESCE(category, 'ALL CATEGORIES') AS category,
    SUM(sales) AS total_sales
FROM sales
GROUP BY ROLLUP(region, category);
regioncategorytotal_sales
NorthElectronics10000
NorthClothing5000
NorthALL CATEGORIES15000
SouthElectronics8000
SouthClothing6000
SouthALL CATEGORIES14000
ALL REGIONSALL CATEGORIES29000
-- CUBE: All possible combinations
SELECT 
    region,
    category,
    SUM(sales) AS total_sales
FROM sales
GROUP BY CUBE(region, category);

CUBE produces subtotals for:

  • Each region + category combination
  • Each region (all categories)
  • Each category (all regions)
  • Grand total

More combinations than ROLLUP!

-- GROUPING SETS: Specific combinations
SELECT 
    region,
    category,
    SUM(sales) AS total_sales
FROM sales
GROUP BY GROUPING SETS (
    (region, category),  -- Combination
    (region),            -- By region only
    (category),          -- By category only
    ()                   -- Grand total
);

-- Equivalent to CUBE(region, category)

Conditional Aggregation

-- Count by condition
SELECT 
    COUNT(*) AS total_orders,
    COUNT(*) FILTER (WHERE status = 'pending') AS pending_orders,    -- PostgreSQL
    COUNT(*) FILTER (WHERE status = 'completed') AS completed_orders,
    COUNT(*) FILTER (WHERE status = 'cancelled') AS cancelled_orders
FROM orders;

-- Works in all databases
SELECT 
    COUNT(*) AS total_orders,
    SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) AS pending_orders,
    SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) AS completed_orders,
    SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END) AS cancelled_orders
FROM orders;

-- Revenue by payment method
SELECT 
    SUM(CASE WHEN payment_method = 'credit' THEN total ELSE 0 END) AS credit_revenue,
    SUM(CASE WHEN payment_method = 'debit' THEN total ELSE 0 END) AS debit_revenue,
    SUM(CASE WHEN payment_method = 'cash' THEN total ELSE 0 END) AS cash_revenue
FROM orders;

Percentages and Ratios

-- Percentage of total
SELECT 
    category_id,
    SUM(price) AS category_total,
    ROUND(
        SUM(price) * 100.0 / (SELECT SUM(price) FROM products),
        2
    ) AS percentage_of_total
FROM products
GROUP BY category_id;

-- Using window function (more efficient)
SELECT 
    category_id,
    SUM(price) AS category_total,
    ROUND(
        SUM(price) * 100.0 / SUM(SUM(price)) OVER (),
        2
    ) AS percentage_of_total
FROM products
GROUP BY category_id;

-- Status distribution
SELECT 
    status,
    COUNT(*) AS count,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) AS percentage
FROM orders
GROUP BY status;

Common Patterns

Best Practices

Include Non-Aggregated Columns in GROUP BY

All columns in SELECT that are not in aggregate functions must be in GROUP BY.

-- ❌ ERROR: 'name' not in GROUP BY
SELECT customer_id, name, COUNT(*)
FROM orders
GROUP BY customer_id;

-- ✅ CORRECT
SELECT customer_id, name, COUNT(*)
FROM orders
GROUP BY customer_id, name;

Use HAVING Only for Aggregates

Don't use HAVING for conditions that could go in WHERE.

-- ❌ Inefficient: Filters after grouping
SELECT category_id, AVG(price)
FROM products
GROUP BY category_id
HAVING category_id = 1;

-- ✅ Efficient: Filters before grouping
SELECT category_id, AVG(price)
FROM products
WHERE category_id = 1
GROUP BY category_id;

Handle NULL Values Explicitly

Aggregates ignore NULLs, which can lead to unexpected results.

-- COALESCE for NULL handling
SELECT 
    COALESCE(category_id, 0) AS category_id,
    COUNT(*) AS product_count
FROM products
GROUP BY COALESCE(category_id, 0);

Next Steps

On this page