Database Performance Case Study
An in-depth analysis of a PostgreSQL-backed analytics application that achieved a 47x improvement in average query latency and a 78% reduction in database infrastructure costs through systematic optimization: connection pooling, query and index tuning, layered caching, and schema redesign.
Executive Summary
Challenge: A data analytics platform was experiencing severe database performance issues with query times exceeding 30 seconds and frequent timeouts under moderate load.
Solution: Comprehensive database optimization including connection pooling, query optimization, intelligent caching, and schema redesign resulted in transformational performance improvements.
Impact:
- 47x faster query execution (avg 12.3s → 260ms)
- 81% reduction in database CPU usage (94% → 18% average)
- 78% reduction in infrastructure costs ($19,625 → $4,333 per month)
- 99.97% query success rate (vs 78% before)
Problem Analysis
Initial Symptoms
The analytics platform serving business intelligence dashboards was failing under production load:
-- Typical slow query taking 30+ seconds
SELECT
u.user_id,
u.username,
COUNT(t.transaction_id) as transaction_count,
SUM(t.amount) as total_amount,
AVG(t.amount) as avg_amount,
MAX(t.created_at) as last_transaction
FROM users u
LEFT JOIN transactions t ON u.user_id = t.user_id
WHERE t.created_at >= NOW() - INTERVAL '30 days'
GROUP BY u.user_id, u.username
ORDER BY total_amount DESC
LIMIT 1000;
-- Execution time: 34,567ms
-- Rows examined: 15M+ rows
-- Temporary tables: 2.1GB
Database Performance Metrics:
# PostgreSQL performance stats before optimization
Active connections: 485/500 (97% utilization)
Average query time: 12.3 seconds
95th percentile: 45.7 seconds
Failed queries: 22% (timeout/connection exhaustion)
CPU usage: 94% average
Memory usage: 28GB/32GB (87.5%)
Disk I/O wait: 45% of query time
Performance Investigation
Query Analysis with pg_stat_statements:
-- Top problematic queries identified
SELECT
calls,
total_time,
mean_time,
rows,
query
FROM pg_stat_statements
ORDER BY mean_time DESC
LIMIT 10;
/*
Results showed:
1. User analytics query: 34.2s avg (called 1,200x/day)
2. Transaction reports: 28.9s avg (called 800x/day)
3. Revenue calculations: 22.1s avg (called 2,400x/day)
4. User segmentation: 18.7s avg (called 600x/day)
5. Fraud detection: 15.3s avg (called 4,800x/day)
*/
Connection Pool Analysis:
-- Connection monitoring revealed pool exhaustion
SELECT
state,
COUNT(*) as connection_count,
AVG(EXTRACT(EPOCH FROM (now() - state_change))) as avg_duration
FROM pg_stat_activity
WHERE pid <> pg_backend_pid()
GROUP BY state;
/*
Results:
- active: 245 connections (avg 23.4s duration)
- idle in transaction: 156 connections (avg 8.9s)
- idle: 84 connections
- waiting: 15 connections
*/
Index Analysis:
-- Tables with heavy sequential scans (missing-index candidates)
SELECT
schemaname,
tablename,
seq_scan,
seq_tup_read,
seq_tup_read / seq_scan AS avg_tup_per_scan
FROM pg_stat_user_tables
WHERE seq_scan > 0
ORDER BY seq_tup_read DESC;
/*
Critical findings:
- transactions table: 2.4M sequential scans
- users table: 890K sequential scans
- audit_logs table: 1.6M sequential scans
- user_sessions table: 450K sequential scans
*/
Optimization Strategy
1. Connection Pool Optimization
Problem: Connection exhaustion and long-lived idle connections.
Solution: Implemented PgBouncer with optimized pooling strategy:
// Advanced connection pooling configuration
type DatabaseConfig struct {
// PgBouncer configuration
PoolMode string `yaml:"pool_mode"` // transaction
MaxClientConns int `yaml:"max_client_conns"` // 1000
DefaultPoolSize int `yaml:"default_pool_size"` // 25
MinPoolSize int `yaml:"min_pool_size"` // 5
ReservePoolSize int `yaml:"reserve_pool_size"` // 5
MaxDbConnections int `yaml:"max_db_connections"` // 100
// Connection lifecycle
ServerLifetime int `yaml:"server_lifetime"` // 3600s
ServerIdleTimeout int `yaml:"server_idle_timeout"` // 600s
ClientIdleTimeout int `yaml:"client_idle_timeout"` // 300s
// Application-level pooling
AppMaxOpenConns int `yaml:"app_max_open_conns"` // 50
AppMaxIdleConns int `yaml:"app_max_idle_conns"` // 25
ConnMaxLifetime time.Duration `yaml:"conn_max_lifetime"` // 30m
ConnMaxIdleTime time.Duration `yaml:"conn_max_idle_time"` // 5m
}
// Optimized database connection manager
type ConnectionManager struct {
db *sql.DB
pgBouncer *PgBouncerClient
healthChecker *HealthChecker
metrics *ConnectionMetrics
circuitBreaker *CircuitBreaker
}
func NewConnectionManager(config DatabaseConfig) (*ConnectionManager, error) {
// Configure PgBouncer
pgBouncer, err := NewPgBouncerClient(config)
if err != nil {
return nil, err
}
// Application connection pool
db, err := sql.Open("postgres", config.GetDSN())
if err != nil {
return nil, err
}
// Optimized pool settings
db.SetMaxOpenConns(config.AppMaxOpenConns)
db.SetMaxIdleConns(config.AppMaxIdleConns)
db.SetConnMaxLifetime(config.ConnMaxLifetime)
db.SetConnMaxIdleTime(config.ConnMaxIdleTime)
cm := &ConnectionManager{
db: db,
pgBouncer: pgBouncer,
metrics: NewConnectionMetrics(),
circuitBreaker: NewCircuitBreaker(CircuitBreakerConfig{
MaxFailures: 5,
Timeout: 30 * time.Second,
}),
}
// Start health checking
cm.healthChecker = NewHealthChecker(cm)
go cm.healthChecker.Start()
return cm, nil
}
func (cm *ConnectionManager) ExecuteQuery(ctx context.Context, query string, args ...interface{}) (*sql.Rows, error) {
// Circuit breaker check
if !cm.circuitBreaker.CanProceed() {
return nil, fmt.Errorf("circuit breaker open")
}
start := time.Now()
// Execute against the pooled *sql.DB. Do not check out a dedicated
// connection here: the returned *sql.Rows keeps its underlying connection
// until the caller closes it, so releasing the connection in this function
// would invalidate the rows.
rows, err := cm.db.QueryContext(ctx, query, args...)
duration := time.Since(start)
if err != nil {
cm.metrics.RecordQueryError(duration)
cm.circuitBreaker.RecordFailure()
return nil, err
}
cm.metrics.RecordQuerySuccess(duration)
cm.circuitBreaker.RecordSuccess()
return rows, nil
}
// PgBouncer configuration template
const pgBouncerConfig = `
[databases]
analytics = host=postgres-primary.local port=5432 dbname=analytics_db pool_size=25 reserve_pool=5
[pgbouncer]
pool_mode = transaction
listen_port = 6432
listen_addr = 0.0.0.0
auth_type = md5
auth_file = /etc/pgbouncer/userlist.txt
max_client_conn = 1000
default_pool_size = 25
min_pool_size = 5
reserve_pool_size = 5
reserve_pool_timeout = 5
server_lifetime = 3600
server_idle_timeout = 600
server_connect_timeout = 15
server_login_retry = 15
client_idle_timeout = 300
client_login_timeout = 60
ignore_startup_parameters = extra_float_digits
log_connections = 1
log_disconnections = 1
log_pooler_errors = 1
`
// Results: Connection utilization dropped from 97% to 23%
// Query queue time reduced from 8.9s to 45ms
// Connection errors reduced by 94%
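The ConnectionManager above relies on HealthChecker and CircuitBreaker helpers that are not shown (PgBouncerClient and ConnectionMetrics are likewise omitted). A minimal sketch of what those two helpers could look like, following the field and constructor names used above; the thresholds and ping interval are assumptions, not the production implementation:
// Sketch of the circuit breaker used by ConnectionManager (assumed implementation).
type CircuitBreakerConfig struct {
    MaxFailures int
    Timeout     time.Duration
}

type CircuitBreaker struct {
    mu        sync.Mutex
    cfg       CircuitBreakerConfig
    failures  int
    openUntil time.Time
}

func NewCircuitBreaker(cfg CircuitBreakerConfig) *CircuitBreaker {
    return &CircuitBreaker{cfg: cfg}
}

// CanProceed reports whether calls are currently allowed (breaker closed or cooled down).
func (cb *CircuitBreaker) CanProceed() bool {
    cb.mu.Lock()
    defer cb.mu.Unlock()
    return time.Now().After(cb.openUntil)
}

// RecordFailure opens the breaker for cfg.Timeout once MaxFailures is reached.
func (cb *CircuitBreaker) RecordFailure() {
    cb.mu.Lock()
    defer cb.mu.Unlock()
    cb.failures++
    if cb.failures >= cb.cfg.MaxFailures {
        cb.openUntil = time.Now().Add(cb.cfg.Timeout)
        cb.failures = 0
    }
}

// RecordSuccess resets the failure counter.
func (cb *CircuitBreaker) RecordSuccess() {
    cb.mu.Lock()
    defer cb.mu.Unlock()
    cb.failures = 0
}

// Sketch of the health checker: ping the pool periodically and feed the breaker.
type HealthChecker struct {
    cm       *ConnectionManager
    interval time.Duration
}

func NewHealthChecker(cm *ConnectionManager) *HealthChecker {
    return &HealthChecker{cm: cm, interval: 10 * time.Second}
}

func (hc *HealthChecker) Start() {
    ticker := time.NewTicker(hc.interval)
    defer ticker.Stop()
    for range ticker.C {
        ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
        if err := hc.cm.db.PingContext(ctx); err != nil {
            hc.cm.circuitBreaker.RecordFailure()
        } else {
            hc.cm.circuitBreaker.RecordSuccess()
        }
        cancel()
    }
}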
2. Query Optimization and Indexing
Problem: Queries performing full table scans on millions of rows.
Solution: Comprehensive indexing strategy and query rewriting:
-- Critical index creation for performance
-- Compound index for user transaction analytics
CREATE INDEX CONCURRENTLY idx_transactions_user_created_amount
ON transactions(user_id, created_at DESC, amount)
WHERE deleted_at IS NULL;
-- Partial index for recent transactions (90% of queries)
-- NOW() is not IMMUTABLE and cannot be used in an index predicate, so the
-- cutoff is a fixed date that a scheduled job advances when it rebuilds the index
CREATE INDEX CONCURRENTLY idx_transactions_recent
ON transactions(created_at DESC, user_id, amount)
WHERE created_at >= DATE '2024-01-01';
-- Covering index for user analytics
CREATE INDEX CONCURRENTLY idx_users_analytics_covering
ON users(user_id)
INCLUDE (username, email, created_at, status);
-- Functional index for case-insensitive searches
CREATE INDEX CONCURRENTLY idx_users_username_lower
ON users(LOWER(username));
-- Partial index for active users only
CREATE INDEX CONCURRENTLY idx_users_active
ON users(created_at DESC, user_id)
WHERE status = 'active' AND deleted_at IS NULL;
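It is worth confirming with EXPLAIN (ANALYZE, BUFFERS) that the planner actually uses the new indexes for the production query shapes before a change ships. A small Go sketch of that check (illustrative helper, not part of the case study's codebase):
// verifyIndexUsage prints the execution plan for a query so sequential scans
// can be spotted before rollout. Note: ANALYZE executes the query, so use
// realistic but safe parameters. Illustrative helper only.
func verifyIndexUsage(ctx context.Context, db *sql.DB, query string) error {
    rows, err := db.QueryContext(ctx, "EXPLAIN (ANALYZE, BUFFERS) "+query)
    if err != nil {
        return err
    }
    defer rows.Close()

    for rows.Next() {
        var line string
        if err := rows.Scan(&line); err != nil {
            return err
        }
        fmt.Println(line)
    }
    return rows.Err()
}

// Example: confirm a per-user aggregate hits idx_transactions_user_created_amount.
// err := verifyIndexUsage(ctx, db, `
//     SELECT user_id, SUM(amount)
//     FROM transactions
//     WHERE user_id = '00000000-0000-0000-0000-000000000000'
//       AND created_at >= NOW() - INTERVAL '30 days'
//       AND deleted_at IS NULL
//     GROUP BY user_id`)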
Optimized Query Implementation:
// Query optimization service
type QueryOptimizer struct {
db *sql.DB
cache *QueryCache
analyzer *QueryAnalyzer
rewriter *QueryRewriter
metrics *QueryMetrics
}
type OptimizedQuery struct {
SQL string
Args []interface{}
CacheKey string
TTL time.Duration
Explanation *QueryPlan
}
// Before: Slow user analytics query
const slowUserAnalyticsQuery = `
SELECT
u.user_id,
u.username,
COUNT(t.transaction_id) as transaction_count,
SUM(t.amount) as total_amount,
AVG(t.amount) as avg_amount,
MAX(t.created_at) as last_transaction
FROM users u
LEFT JOIN transactions t ON u.user_id = t.user_id
WHERE t.created_at >= $1
GROUP BY u.user_id, u.username
ORDER BY total_amount DESC
LIMIT $2
`
// After: Optimized query with proper indexing
const optimizedUserAnalyticsQuery = `
WITH recent_transactions AS (
SELECT
user_id,
COUNT(*) as transaction_count,
SUM(amount) as total_amount,
AVG(amount) as avg_amount,
MAX(created_at) as last_transaction
FROM transactions
WHERE created_at >= $1
AND deleted_at IS NULL
AND amount > 0 -- Filter out test transactions
GROUP BY user_id
),
ranked_users AS (
SELECT
rt.*,
ROW_NUMBER() OVER (ORDER BY rt.total_amount DESC) as rank
FROM recent_transactions rt
WHERE rt.total_amount > 0
)
SELECT
u.user_id,
u.username,
ru.transaction_count,
ru.total_amount,
ru.avg_amount,
ru.last_transaction
FROM ranked_users ru
JOIN users u ON u.user_id = ru.user_id
WHERE ru.rank <= $2
AND u.status = 'active'
AND u.deleted_at IS NULL
ORDER BY ru.total_amount DESC
`
func (qo *QueryOptimizer) ExecuteUserAnalytics(ctx context.Context, since time.Time, limit int) ([]*UserAnalytics, error) {
// Generate cache key
cacheKey := fmt.Sprintf("user_analytics:%s:%d", since.Format("2006-01-02"), limit)
// Check cache first
if cached := qo.cache.Get(cacheKey); cached != nil {
qo.metrics.RecordCacheHit("user_analytics")
return cached.([]*UserAnalytics), nil
}
qo.metrics.RecordCacheMiss("user_analytics")
start := time.Now()
// Execute optimized query
rows, err := qo.db.QueryContext(ctx, optimizedUserAnalyticsQuery, since, limit)
if err != nil {
qo.metrics.RecordQueryError("user_analytics", time.Since(start))
return nil, err
}
defer rows.Close()
var results []*UserAnalytics
for rows.Next() {
var ua UserAnalytics
err := rows.Scan(
&ua.UserID, &ua.Username, &ua.TransactionCount,
&ua.TotalAmount, &ua.AvgAmount, &ua.LastTransaction,
)
if err != nil {
return nil, err
}
results = append(results, &ua)
}
duration := time.Since(start)
qo.metrics.RecordQuerySuccess("user_analytics", duration)
// Cache results for 5 minutes
qo.cache.Set(cacheKey, results, 5*time.Minute)
return results, nil
}
// Query performance improvement: 34.2s → 180ms (190x faster)
// Index usage: 100% queries now use indexes
// Temporary table usage: Eliminated (was 2.1GB)
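ExecuteUserAnalytics above scans into a UserAnalytics value that the snippet does not define. A minimal sketch of that struct, with field types inferred from the Scan call and the underlying columns (assumptions, not the production model):
// UserAnalytics mirrors the columns returned by optimizedUserAnalyticsQuery.
// Field types are assumptions based on the Scan call above.
type UserAnalytics struct {
    UserID           string    // users.user_id (UUID)
    Username         string    // users.username
    TransactionCount int64     // COUNT(*) over recent transactions
    TotalAmount      float64   // SUM(amount)
    AvgAmount        float64   // AVG(amount)
    LastTransaction  time.Time // MAX(created_at)
}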
3. Intelligent Caching Strategy
Problem: Repeated execution of expensive analytical queries.
Solution: Multi-layer caching with intelligent invalidation:
// Advanced caching architecture
type CacheManager struct {
db *sql.DB // direct handle used by RefreshMaterializedViews
l1Cache *MemoryCache // Application memory cache
l2Cache *RedisCache // Distributed Redis cache
l3Cache *MaterializedViews // Database materialized views
metrics *CacheMetrics
invalidator *CacheInvalidator
}
type CacheConfig struct {
L1Size int `yaml:"l1_size"` // 10000
L1TTL time.Duration `yaml:"l1_ttl"` // 5m
L2TTL time.Duration `yaml:"l2_ttl"` // 30m
L3RefreshInterval time.Duration `yaml:"l3_refresh"` // 1h
CompressionEnabled bool `yaml:"compression"` // true
}
func NewCacheManager(db *sql.DB, config CacheConfig) *CacheManager {
cm := &CacheManager{
db: db,
l1Cache: NewMemoryCache(config.L1Size, config.L1TTL),
l2Cache: NewRedisCache(config.L2TTL),
l3Cache: NewMaterializedViews(),
metrics: NewCacheMetrics(),
}
// Set up cache invalidation
cm.invalidator = NewCacheInvalidator(cm)
return cm
}
func (cm *CacheManager) Get(ctx context.Context, key string) (interface{}, bool) {
start := time.Now()
// L1 Cache (Memory)
if value, found := cm.l1Cache.Get(key); found {
cm.metrics.RecordHit("l1", time.Since(start))
return value, true
}
// L2 Cache (Redis)
if value, found := cm.l2Cache.Get(ctx, key); found {
// Populate L1 cache
cm.l1Cache.Set(key, value)
cm.metrics.RecordHit("l2", time.Since(start))
return value, true
}
// L3 Cache (Materialized Views)
if value, found := cm.l3Cache.Get(ctx, key); found {
// Populate L2 and L1 caches
cm.l2Cache.Set(ctx, key, value)
cm.l1Cache.Set(key, value)
cm.metrics.RecordHit("l3", time.Since(start))
return value, true
}
cm.metrics.RecordMiss(time.Since(start))
return nil, false
}
func (cm *CacheManager) Set(ctx context.Context, key string, value interface{}, ttl time.Duration) {
// Set in all cache layers
cm.l1Cache.Set(key, value)
cm.l2Cache.Set(ctx, key, value)
// Update materialized views if applicable
if cm.isMaterializable(key) {
cm.l3Cache.Update(ctx, key, value)
}
}
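Application code would typically use the cache manager as a read-through wrapper: check the layers first, run the query only on a miss, then populate every layer via Set. A sketch of that pattern (the helper name and loader signature are illustrative):
// CachedQuery is an illustrative read-through helper around CacheManager.
// On a miss it invokes load, stores the result in all layers, and returns it.
func CachedQuery(ctx context.Context, cm *CacheManager, key string, ttl time.Duration,
    load func(context.Context) (interface{}, error)) (interface{}, error) {

    if value, found := cm.Get(ctx, key); found {
        return value, nil
    }

    value, err := load(ctx)
    if err != nil {
        return nil, err
    }
    cm.Set(ctx, key, value, ttl)
    return value, nil
}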
// Materialized views for expensive aggregations
const createMaterializedViews = `
-- User analytics materialized view
CREATE MATERIALIZED VIEW mv_user_analytics AS
SELECT
u.user_id,
u.username,
u.email,
COUNT(t.transaction_id) as transaction_count,
COALESCE(SUM(t.amount), 0) as total_amount,
COALESCE(AVG(t.amount), 0) as avg_amount,
MAX(t.created_at) as last_transaction_date,
date_trunc('day', NOW()) as computed_date
FROM users u
LEFT JOIN transactions t ON u.user_id = t.user_id
AND t.created_at >= NOW() - INTERVAL '30 days'
AND t.deleted_at IS NULL
WHERE u.status = 'active'
AND u.deleted_at IS NULL
GROUP BY u.user_id, u.username, u.email;
-- Create unique index for fast lookups
CREATE UNIQUE INDEX ON mv_user_analytics(user_id);
CREATE INDEX ON mv_user_analytics(total_amount DESC);
-- Revenue analytics materialized view
CREATE MATERIALIZED VIEW mv_revenue_analytics AS
SELECT
date_trunc('day', created_at) as date,
COUNT(*) as transaction_count,
SUM(amount) as total_revenue,
AVG(amount) as avg_transaction_size,
COUNT(DISTINCT user_id) as unique_users
FROM transactions
WHERE created_at >= NOW() - INTERVAL '90 days'
AND deleted_at IS NULL
AND amount > 0
GROUP BY date_trunc('day', created_at)
ORDER BY date DESC;
CREATE UNIQUE INDEX ON mv_revenue_analytics(date);
`
// Automated materialized view refresh
func (cm *CacheManager) RefreshMaterializedViews() {
views := []string{
"mv_user_analytics",
"mv_revenue_analytics",
"mv_fraud_detection",
"mv_user_segmentation",
}
for _, view := range views {
start := time.Now()
_, err := cm.db.Exec(fmt.Sprintf("REFRESH MATERIALIZED VIEW CONCURRENTLY %s", view))
if err != nil {
log.Printf("Failed to refresh materialized view %s: %v", view, err)
cm.metrics.RecordRefreshError(view)
continue
}
duration := time.Since(start)
cm.metrics.RecordRefreshSuccess(view, duration)
log.Printf("Refreshed materialized view %s in %v", view, duration)
}
}
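The refresh job is presumably driven by the L3RefreshInterval setting in CacheConfig. A sketch of the scheduling loop (the production system might equally run this from cron or a job runner):
// StartRefreshLoop refreshes the materialized views on a fixed interval until
// the context is cancelled. Sketch only; pass config.L3RefreshInterval.
func (cm *CacheManager) StartRefreshLoop(ctx context.Context, interval time.Duration) {
    ticker := time.NewTicker(interval)
    defer ticker.Stop()
    for {
        select {
        case <-ctx.Done():
            return
        case <-ticker.C:
            cm.RefreshMaterializedViews()
        }
    }
}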
// Cache invalidation strategy
type CacheInvalidator struct {
cm *CacheManager
subscribers map[string][]chan InvalidationEvent
mu sync.RWMutex
}
type InvalidationEvent struct {
Table string
Action string // INSERT, UPDATE, DELETE
Key string
UserID string
}
func (ci *CacheInvalidator) InvalidateUserData(userID string) {
patterns := []string{
fmt.Sprintf("user_analytics:%s:*", userID),
fmt.Sprintf("user_transactions:%s:*", userID),
fmt.Sprintf("user_profile:%s", userID),
}
for _, pattern := range patterns {
ci.cm.l1Cache.DeletePattern(pattern)
ci.cm.l2Cache.DeletePattern(pattern)
}
}
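Invalidation events would typically come from the write path (or from database triggers publishing change notifications) and be routed to the patterns above. A minimal sketch of the dispatch side; the routing rules here are assumptions:
// HandleEvent routes a data-change event to the affected cache entries.
// Illustrative sketch of how InvalidationEvent might be consumed.
func (ci *CacheInvalidator) HandleEvent(ev InvalidationEvent) {
    switch ev.Table {
    case "transactions", "users":
        if ev.UserID != "" {
            ci.InvalidateUserData(ev.UserID)
        }
    default:
        // Fall back to dropping the specific key when one is supplied.
        if ev.Key != "" {
            ci.cm.l1Cache.DeletePattern(ev.Key)
            ci.cm.l2Cache.DeletePattern(ev.Key)
        }
    }
}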
// Cache hit rates achieved:
// L1 Cache: 78% hit rate (avg 0.1ms response)
// L2 Cache: 92% hit rate (avg 2.3ms response)
// L3 Cache: 97% hit rate (avg 15ms response)
// Overall cache hit rate: 89% (vs 0% before)
4. Database Schema Optimization
Problem: Inefficient schema design causing unnecessary joins and scans.
Solution: Strategic denormalization and partitioning:
-- Table partitioning for large transaction table
CREATE TABLE transactions (
transaction_id UUID NOT NULL DEFAULT gen_random_uuid(),
user_id UUID NOT NULL,
amount DECIMAL(15,2) NOT NULL,
currency CHAR(3) NOT NULL DEFAULT 'USD',
status VARCHAR(20) NOT NULL,
transaction_type VARCHAR(50) NOT NULL,
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
deleted_at TIMESTAMP WITH TIME ZONE NULL,
-- Denormalized user data for faster queries
user_email VARCHAR(255) NOT NULL,
user_username VARCHAR(100) NOT NULL,
user_tier VARCHAR(20) NOT NULL DEFAULT 'standard',
-- Pre-computed aggregation fields
daily_transaction_count INTEGER DEFAULT 1,
monthly_transaction_sum DECIMAL(15,2) DEFAULT 0,
CONSTRAINT valid_amount CHECK (amount >= 0),
CONSTRAINT valid_status CHECK (status IN ('pending', 'completed', 'failed', 'cancelled')),
-- On a partitioned table the primary key must include the partition key
PRIMARY KEY (transaction_id, created_at)
) PARTITION BY RANGE (created_at);
-- Monthly partitions for better query performance
CREATE TABLE transactions_2024_01 PARTITION OF transactions
FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');
CREATE TABLE transactions_2024_02 PARTITION OF transactions
FOR VALUES FROM ('2024-02-01') TO ('2024-03-01');
-- Current month partition (most queried)
CREATE TABLE transactions_current PARTITION OF transactions
FOR VALUES FROM ('2024-03-01') TO ('2024-04-01');
-- Optimized indexes on each partition
CREATE INDEX ON transactions_2024_01(user_id, created_at DESC) WHERE deleted_at IS NULL;
CREATE INDEX ON transactions_2024_02(user_id, created_at DESC) WHERE deleted_at IS NULL;
CREATE INDEX ON transactions_current(user_id, created_at DESC) WHERE deleted_at IS NULL;
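New monthly partitions have to exist before the month starts, so partition creation is usually automated rather than hand-written like the examples above. A sketch of such a maintenance step in Go (the helper and index naming are assumptions, not shown in the case study):
// ensureMonthlyPartition creates the partition (and its index) for the month
// containing t if it does not exist yet. Run from a scheduled maintenance job.
func ensureMonthlyPartition(ctx context.Context, db *sql.DB, t time.Time) error {
    start := time.Date(t.Year(), t.Month(), 1, 0, 0, 0, 0, time.UTC)
    end := start.AddDate(0, 1, 0)
    name := "transactions_" + start.Format("2006_01")

    ddl := fmt.Sprintf(
        `CREATE TABLE IF NOT EXISTS %s PARTITION OF transactions
         FOR VALUES FROM ('%s') TO ('%s')`,
        name, start.Format("2006-01-02"), end.Format("2006-01-02"))
    if _, err := db.ExecContext(ctx, ddl); err != nil {
        return err
    }

    idx := fmt.Sprintf(
        `CREATE INDEX IF NOT EXISTS %s_user_created_idx
         ON %s (user_id, created_at DESC) WHERE deleted_at IS NULL`,
        name, name)
    _, err := db.ExecContext(ctx, idx)
    return err
}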
-- Summary table for fast aggregations
CREATE TABLE user_transaction_summary (
user_id UUID PRIMARY KEY,
total_transactions INTEGER NOT NULL DEFAULT 0,
total_amount DECIMAL(15,2) NOT NULL DEFAULT 0,
avg_amount DECIMAL(15,2) NOT NULL DEFAULT 0,
first_transaction_date TIMESTAMP WITH TIME ZONE,
last_transaction_date TIMESTAMP WITH TIME ZONE,
last_updated TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
FOREIGN KEY (user_id) REFERENCES users(user_id)
);
-- Trigger to maintain summary table
CREATE OR REPLACE FUNCTION update_user_transaction_summary()
RETURNS TRIGGER AS $$
BEGIN
IF TG_OP = 'INSERT' THEN
INSERT INTO user_transaction_summary (
user_id, total_transactions, total_amount,
avg_amount, first_transaction_date, last_transaction_date
)
VALUES (
NEW.user_id, 1, NEW.amount, NEW.amount,
NEW.created_at, NEW.created_at
)
ON CONFLICT (user_id) DO UPDATE SET
total_transactions = user_transaction_summary.total_transactions + 1,
total_amount = user_transaction_summary.total_amount + NEW.amount,
avg_amount = (user_transaction_summary.total_amount + NEW.amount) /
(user_transaction_summary.total_transactions + 1),
last_transaction_date = NEW.created_at,
last_updated = NOW();
ELSIF TG_OP = 'UPDATE' AND OLD.amount != NEW.amount THEN
UPDATE user_transaction_summary SET
total_amount = total_amount - OLD.amount + NEW.amount,
avg_amount = (total_amount - OLD.amount + NEW.amount) / total_transactions,
last_updated = NOW()
WHERE user_id = NEW.user_id;
ELSIF TG_OP = 'DELETE' THEN
UPDATE user_transaction_summary SET
total_transactions = total_transactions - 1,
total_amount = total_amount - OLD.amount,
avg_amount = CASE
WHEN total_transactions = 1 THEN 0
ELSE (total_amount - OLD.amount) / (total_transactions - 1)
END,
last_updated = NOW()
WHERE user_id = OLD.user_id;
END IF;
RETURN COALESCE(NEW, OLD);
END;
$$ LANGUAGE plpgsql;
CREATE TRIGGER trigger_update_user_transaction_summary
AFTER INSERT OR UPDATE OR DELETE ON transactions
FOR EACH ROW EXECUTE FUNCTION update_user_transaction_summary();
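With the trigger keeping user_transaction_summary up to date, per-user lifetime aggregates become a primary-key lookup instead of a scan across the transaction partitions. A sketch of that read path (struct and helper names are illustrative):
// UserSummary holds the pre-aggregated totals maintained by the trigger above.
// Field types are assumptions matching the summary table definition.
type UserSummary struct {
    UserID            string
    TotalTransactions int64
    TotalAmount       float64
    AvgAmount         float64
    LastTransaction   sql.NullTime
}

// GetUserSummary reads the summary row for one user: a single index lookup.
func GetUserSummary(ctx context.Context, db *sql.DB, userID string) (*UserSummary, error) {
    const q = `
        SELECT user_id, total_transactions, total_amount, avg_amount, last_transaction_date
        FROM user_transaction_summary
        WHERE user_id = $1`
    var s UserSummary
    err := db.QueryRowContext(ctx, q, userID).Scan(
        &s.UserID, &s.TotalTransactions, &s.TotalAmount, &s.AvgAmount, &s.LastTransaction)
    if err != nil {
        return nil, err
    }
    return &s, nil
}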
Results and Performance Impact
Query Performance Improvements
Before vs After Comparison:
| Query Type | Before (avg) | After (avg) | Improvement |
|---|---|---|---|
| User Analytics | 34.2s | 180ms | 190x faster |
| Revenue Reports | 28.9s | 145ms | 199x faster |
| Transaction History | 22.1s | 95ms | 233x faster |
| User Segmentation | 18.7s | 125ms | 150x faster |
| Fraud Detection | 15.3s | 89ms | 172x faster |
| Dashboard Aggregates | 12.8s | 67ms | 191x faster |
System Performance Metrics:
| Metric | Before | After | Improvement |
|---|---|---|---|
| Average Query Time | 12.3s | 260ms | 47x faster |
| 95th Percentile | 45.7s | 890ms | 51x faster |
| Database CPU Usage | 94% | 18% | 81% reduction |
| Connection Utilization | 97% | 23% | 76% reduction |
| Query Success Rate | 78% | 99.97% | 28% improvement |
| Failed Queries/Day | 15,840 | 45 | 99.7% reduction |
Infrastructure Impact
Cost Reduction:
# Infrastructure costs (monthly)
Before optimization:
- Database instances: 8x db.r5.4xlarge @ $2,190/month = $17,520
- Storage (SSD): 4TB @ $0.23/GB = $920
- Backup storage: 12TB @ $0.095/GB = $1,140
- Data transfer: 500GB @ $0.09/GB = $45
Total: $19,625/month
After optimization:
- Database instances: 3x db.r5.2xlarge @ $1,095/month = $3,285
- Storage (SSD): 2TB @ $0.23/GB = $460
- Backup storage: 6TB @ $0.095/GB = $570
- Data transfer: 200GB @ $0.09/GB = $18
Total: $4,333/month
Monthly savings: $15,292 (78% reduction)
Annual savings: $183,504
Application Performance
Dashboard Response Times:
# User dashboard loading times
Before: 45-60 seconds (often timeout)
After: 2.3 seconds average
# Real-time analytics refresh
Before: 180 seconds (3 minutes)
After: 4.5 seconds
# Report generation
Before: 5-8 minutes (often failed)
After: 15-25 seconds
# Data export (100K records)
Before: 12+ minutes (frequently failed)
After: 90 seconds
Monitoring and Alerting
Performance Monitoring Dashboard:
// Real-time database performance monitoring
type DatabaseMonitor struct {
metrics *DatabaseMetrics
alerter *AlertManager
thresholds *PerformanceThresholds
}
type DatabaseMetrics struct {
ActiveConnections int64 `json:"active_connections"`
QueryLatencyP50 time.Duration `json:"query_latency_p50"`
QueryLatencyP95 time.Duration `json:"query_latency_p95"`
QueryLatencyP99 time.Duration `json:"query_latency_p99"`
CacheHitRate float64 `json:"cache_hit_rate"`
SlowQueryCount int64 `json:"slow_query_count"`
ErrorRate float64 `json:"error_rate"`
IndexEfficiency float64 `json:"index_efficiency"`
LockWaitTime time.Duration `json:"lock_wait_time"`
BufferCacheHitRate float64 `json:"buffer_cache_hit_rate"`
}
type PerformanceThresholds struct {
MaxQueryLatency time.Duration `yaml:"max_query_latency"` // 1s
MaxErrorRate float64 `yaml:"max_error_rate"` // 1%
MinCacheHitRate float64 `yaml:"min_cache_hit_rate"` // 85%
MaxSlowQueries int64 `yaml:"max_slow_queries"` // 10/min
MaxConnections int64 `yaml:"max_connections"` // 200
}
func (dm *DatabaseMonitor) CheckPerformance() {
metrics := dm.collectMetrics()
// Check query latency
if metrics.QueryLatencyP95 > dm.thresholds.MaxQueryLatency {
dm.alerter.SendAlert(Alert{
Level: WARNING,
Message: fmt.Sprintf("P95 query latency %v exceeds threshold %v",
metrics.QueryLatencyP95, dm.thresholds.MaxQueryLatency),
Metrics: metrics,
})
}
// Check cache hit rate
if metrics.CacheHitRate < dm.thresholds.MinCacheHitRate {
dm.alerter.SendAlert(Alert{
Level: WARNING,
Message: fmt.Sprintf("Cache hit rate %.2f%% below threshold %.2f%%",
metrics.CacheHitRate*100, dm.thresholds.MinCacheHitRate*100),
Metrics: metrics,
})
}
// Check error rate
if metrics.ErrorRate > dm.thresholds.MaxErrorRate {
dm.alerter.SendAlert(Alert{
Level: CRITICAL,
Message: fmt.Sprintf("Query error rate %.2f%% exceeds threshold %.2f%%",
metrics.ErrorRate*100, dm.thresholds.MaxErrorRate*100),
Metrics: metrics,
})
}
}
// Automated performance regression detection
func (dm *DatabaseMonitor) detectPerformanceRegression() {
current := dm.getLastHourMetrics()
baseline := dm.getBaselineMetrics()
// Detect significant performance degradation
if current.QueryLatencyP95 > baseline.QueryLatencyP95*1.5 {
dm.alerter.SendAlert(Alert{
Level: CRITICAL,
Message: "Performance regression detected: 50% increase in P95 latency",
Details: map[string]interface{}{
"current_p95": current.QueryLatencyP95,
"baseline_p95": baseline.QueryLatencyP95,
"degradation": float64(current.QueryLatencyP95) / float64(baseline.QueryLatencyP95),
},
})
}
}
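collectMetrics, getLastHourMetrics, and getBaselineMetrics are not shown. The database-side numbers can be pulled from PostgreSQL's standard statistics views; a minimal sketch of that collection (the mapping into DatabaseMetrics is an assumption, and latency percentiles would come from the application metrics pipeline rather than from the database):
// collectDatabaseMetrics pulls a subset of the DatabaseMetrics fields straight
// from PostgreSQL's statistics views. Sketch only.
func collectDatabaseMetrics(ctx context.Context, db *sql.DB) (*DatabaseMetrics, error) {
    m := &DatabaseMetrics{}

    // Active connections.
    if err := db.QueryRowContext(ctx,
        `SELECT COUNT(*) FROM pg_stat_activity WHERE state = 'active'`,
    ).Scan(&m.ActiveConnections); err != nil {
        return nil, err
    }

    // Buffer cache hit rate across all databases.
    if err := db.QueryRowContext(ctx,
        `SELECT COALESCE(SUM(blks_hit)::float / NULLIF(SUM(blks_hit) + SUM(blks_read), 0), 0)
         FROM pg_stat_database`,
    ).Scan(&m.BufferCacheHitRate); err != nil {
        return nil, err
    }

    // Statements averaging over one second (requires pg_stat_statements).
    if err := db.QueryRowContext(ctx,
        `SELECT COUNT(*) FROM pg_stat_statements WHERE mean_time > 1000`,
    ).Scan(&m.SlowQueryCount); err != nil {
        return nil, err
    }

    return m, nil
}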
This case study shows how systematic performance engineering, combining connection pooling, targeted indexing, layered caching, and schema redesign, can deliver order-of-magnitude improvements in query latency, reliability, and infrastructure cost, and how continuous monitoring and alerting keep those gains from regressing.