Aggregations & GROUP BY
Summarize data with aggregate functions, GROUP BY clauses, and HAVING filters
Aggregations transform multiple rows into summary values. They are essential for reporting, analytics, and data analysis.
How Aggregation Works
Aggregate Functions
-- Count all rows
SELECT COUNT(*) AS total_orders FROM orders;
-- Count non-NULL values
SELECT COUNT(phone) AS customers_with_phone FROM customers;
-- Count distinct values
SELECT COUNT(DISTINCT category_id) AS unique_categories FROM products;
-- Count with condition
SELECT COUNT(*) FILTER (WHERE status = 'active') AS active_count -- PostgreSQL
FROM users;
SELECT COUNT(CASE WHEN status = 'active' THEN 1 END) AS active_count -- All DBs
FROM users;| Function | Description | NULL Handling |
|---|---|---|
COUNT(*) | Count all rows | Includes NULLs |
COUNT(column) | Count non-NULL values | Excludes NULLs |
COUNT(DISTINCT column) | Count unique values | Excludes NULLs |
-- Total of a column
SELECT SUM(total) AS total_revenue FROM orders;
-- Sum with condition
SELECT
SUM(total) AS total_revenue,
SUM(CASE WHEN status = 'completed' THEN total ELSE 0 END) AS completed_revenue
FROM orders;
-- Sum of expression
SELECT SUM(quantity * price) AS total_value FROM order_items;NULL in SUM
SUM ignores NULL values. If all values are NULL, the result is NULL (not 0).
Use COALESCE(SUM(column), 0) to default to 0.
-- Average value
SELECT AVG(price) AS avg_price FROM products;
-- Average with precision
SELECT ROUND(AVG(price), 2) AS avg_price FROM products;
-- Weighted average
SELECT
SUM(price * quantity) / SUM(quantity) AS weighted_avg_price
FROM order_items;AVG and Integers
In some databases, AVG of integers returns an integer. Cast to decimal for precision:
AVG(CAST(quantity AS DECIMAL)) or AVG(quantity::DECIMAL) in PostgreSQL.
-- Minimum and maximum
SELECT
MIN(price) AS cheapest,
MAX(price) AS most_expensive
FROM products;
-- For dates
SELECT
MIN(created_at) AS first_order,
MAX(created_at) AS last_order
FROM orders;
-- For strings (alphabetical)
SELECT
MIN(name) AS first_alphabetically,
MAX(name) AS last_alphabetically
FROM products;-- PostgreSQL: STRING_AGG
SELECT
customer_id,
STRING_AGG(product_name, ', ') AS products_ordered
FROM order_items
GROUP BY customer_id;
-- MySQL: GROUP_CONCAT
SELECT
customer_id,
GROUP_CONCAT(product_name SEPARATOR ', ') AS products_ordered
FROM order_items
GROUP BY customer_id;
-- SQL Server: STRING_AGG (2017+)
SELECT
customer_id,
STRING_AGG(product_name, ', ') AS products_ordered
FROM order_items
GROUP BY customer_id;
-- With ordering
SELECT
customer_id,
STRING_AGG(product_name, ', ' ORDER BY product_name) AS products_ordered -- PostgreSQL
FROM order_items
GROUP BY customer_id;GROUP BY
GROUP BY divides rows into groups and applies aggregate functions to each group.
-- Count orders per customer
SELECT
customer_id,
COUNT(*) AS order_count,
SUM(total) AS total_spent
FROM orders
GROUP BY customer_id;| customer_id | order_count | total_spent |
|---|---|---|
| 1 | 5 | 1500.00 |
| 2 | 3 | 800.00 |
| 3 | 8 | 2200.00 |
-- Sales by category and year
SELECT
category_id,
EXTRACT(YEAR FROM created_at) AS year,
COUNT(*) AS order_count,
SUM(total) AS revenue
FROM orders o
JOIN order_items oi ON o.id = oi.order_id
GROUP BY category_id, EXTRACT(YEAR FROM created_at)
ORDER BY category_id, year;| category_id | year | order_count | revenue |
|---|---|---|---|
| 1 | 2023 | 150 | 45000.00 |
| 1 | 2024 | 200 | 62000.00 |
| 2 | 2023 | 80 | 24000.00 |
| 2 | 2024 | 120 | 38000.00 |
-- Group by expression
SELECT
CASE
WHEN price < 100 THEN 'Budget'
WHEN price < 500 THEN 'Mid-range'
ELSE 'Premium'
END AS price_tier,
COUNT(*) AS product_count,
ROUND(AVG(price), 2) AS avg_price
FROM products
GROUP BY
CASE
WHEN price < 100 THEN 'Budget'
WHEN price < 500 THEN 'Mid-range'
ELSE 'Premium'
END;-- Monthly revenue
SELECT
DATE_TRUNC('month', created_at) AS month, -- PostgreSQL
-- DATE_FORMAT(created_at, '%Y-%m-01') AS month, -- MySQL
COUNT(*) AS order_count,
SUM(total) AS revenue
FROM orders
GROUP BY DATE_TRUNC('month', created_at)
ORDER BY month;
-- Daily, weekly, monthly options
SELECT
DATE_TRUNC('day', created_at) AS day,
DATE_TRUNC('week', created_at) AS week,
DATE_TRUNC('month', created_at) AS month,
COUNT(*)
FROM orders
GROUP BY
DATE_TRUNC('day', created_at),
DATE_TRUNC('week', created_at),
DATE_TRUNC('month', created_at);HAVING vs WHERE
| Clause | Purpose | Operates On | Aggregates |
|---|---|---|---|
WHERE | Filter rows before grouping | Individual rows | ❌ Cannot use |
HAVING | Filter groups after grouping | Groups | ✅ Can use |
-- ❌ ERROR: Can't use aggregate in WHERE
SELECT customer_id, COUNT(*)
FROM orders
WHERE COUNT(*) > 5 -- ERROR!
GROUP BY customer_id;
-- ✅ CORRECT: Use HAVING for aggregate conditions
SELECT customer_id, COUNT(*)
FROM orders
GROUP BY customer_id
HAVING COUNT(*) > 5;-- WHERE filters BEFORE grouping
SELECT
customer_id,
COUNT(*) AS order_count,
SUM(total) AS total_spent
FROM orders
WHERE status = 'completed' -- Only count completed orders
GROUP BY customer_id;Filter applied: Only rows with status = 'completed' are included in groups.
-- HAVING filters AFTER grouping
SELECT
customer_id,
COUNT(*) AS order_count,
SUM(total) AS total_spent
FROM orders
GROUP BY customer_id
HAVING COUNT(*) >= 5 -- Only show customers with 5+ orders
AND SUM(total) > 1000; -- AND spent over $1000Filter applied: Groups (customers) are removed if they don't meet the criteria.
-- Use both for complex filtering
SELECT
category_id,
COUNT(*) AS product_count,
AVG(price) AS avg_price
FROM products
WHERE stock > 0 -- Only in-stock products (row filter)
GROUP BY category_id
HAVING COUNT(*) >= 5 -- Categories with 5+ products (group filter)
AND AVG(price) > 100 -- And average price over $100
ORDER BY avg_price DESC;Advanced Aggregation
ROLLUP, CUBE, and GROUPING SETS
-- ROLLUP: Hierarchical subtotals
SELECT
COALESCE(region, 'ALL REGIONS') AS region,
COALESCE(category, 'ALL CATEGORIES') AS category,
SUM(sales) AS total_sales
FROM sales
GROUP BY ROLLUP(region, category);| region | category | total_sales |
|---|---|---|
| North | Electronics | 10000 |
| North | Clothing | 5000 |
| North | ALL CATEGORIES | 15000 |
| South | Electronics | 8000 |
| South | Clothing | 6000 |
| South | ALL CATEGORIES | 14000 |
| ALL REGIONS | ALL CATEGORIES | 29000 |
-- CUBE: All possible combinations
SELECT
region,
category,
SUM(sales) AS total_sales
FROM sales
GROUP BY CUBE(region, category);CUBE produces subtotals for:
- Each region + category combination
- Each region (all categories)
- Each category (all regions)
- Grand total
More combinations than ROLLUP!
-- GROUPING SETS: Specific combinations
SELECT
region,
category,
SUM(sales) AS total_sales
FROM sales
GROUP BY GROUPING SETS (
(region, category), -- Combination
(region), -- By region only
(category), -- By category only
() -- Grand total
);
-- Equivalent to CUBE(region, category)Conditional Aggregation
-- Count by condition
SELECT
COUNT(*) AS total_orders,
COUNT(*) FILTER (WHERE status = 'pending') AS pending_orders, -- PostgreSQL
COUNT(*) FILTER (WHERE status = 'completed') AS completed_orders,
COUNT(*) FILTER (WHERE status = 'cancelled') AS cancelled_orders
FROM orders;
-- Works in all databases
SELECT
COUNT(*) AS total_orders,
SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) AS pending_orders,
SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) AS completed_orders,
SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END) AS cancelled_orders
FROM orders;
-- Revenue by payment method
SELECT
SUM(CASE WHEN payment_method = 'credit' THEN total ELSE 0 END) AS credit_revenue,
SUM(CASE WHEN payment_method = 'debit' THEN total ELSE 0 END) AS debit_revenue,
SUM(CASE WHEN payment_method = 'cash' THEN total ELSE 0 END) AS cash_revenue
FROM orders;Percentages and Ratios
-- Percentage of total
SELECT
category_id,
SUM(price) AS category_total,
ROUND(
SUM(price) * 100.0 / (SELECT SUM(price) FROM products),
2
) AS percentage_of_total
FROM products
GROUP BY category_id;
-- Using window function (more efficient)
SELECT
category_id,
SUM(price) AS category_total,
ROUND(
SUM(price) * 100.0 / SUM(SUM(price)) OVER (),
2
) AS percentage_of_total
FROM products
GROUP BY category_id;
-- Status distribution
SELECT
status,
COUNT(*) AS count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) AS percentage
FROM orders
GROUP BY status;Common Patterns
Best Practices
Include Non-Aggregated Columns in GROUP BY
All columns in SELECT that are not in aggregate functions must be in GROUP BY.
-- ❌ ERROR: 'name' not in GROUP BY
SELECT customer_id, name, COUNT(*)
FROM orders
GROUP BY customer_id;
-- ✅ CORRECT
SELECT customer_id, name, COUNT(*)
FROM orders
GROUP BY customer_id, name;Use HAVING Only for Aggregates
Don't use HAVING for conditions that could go in WHERE.
-- ❌ Inefficient: Filters after grouping
SELECT category_id, AVG(price)
FROM products
GROUP BY category_id
HAVING category_id = 1;
-- ✅ Efficient: Filters before grouping
SELECT category_id, AVG(price)
FROM products
WHERE category_id = 1
GROUP BY category_id;Handle NULL Values Explicitly
Aggregates ignore NULLs, which can lead to unexpected results.
-- COALESCE for NULL handling
SELECT
COALESCE(category_id, 0) AS category_id,
COUNT(*) AS product_count
FROM products
GROUP BY COALESCE(category_id, 0);