Production Performance
Master production-ready performance engineering, monitoring, testing, and scalability patterns for Go applications in real-world deployments.
Production Performance Overview
Production performance engineering involves:
- Continuous profiling - Real-time performance monitoring
- Performance testing - Systematic validation of performance characteristics
- Scalability planning - Designing for growth and load variations
- Incident response - Rapid diagnosis and resolution of performance issues
Performance in Production Context
Production vs Development Performance
// Production configuration example
// ProductionConfig carries the runtime, application, and monitoring knobs
// loaded from environment variables (see the `env` struct tags; the
// `default` tag supplies the fallback value).
type ProductionConfig struct {
	// Runtime configuration
	GOMAXPROCS int    `env:"GOMAXPROCS" default:"0"` // 0 = keep runtime default (all CPUs)
	GOMEMLIMIT string `env:"GOMEMLIMIT" default:""`  // soft memory limit; empty = no limit
	GOGC       int    `env:"GOGC" default:"100"`     // GC target percentage
	// Application configuration
	PoolSize       int           `env:"POOL_SIZE" default:"100"`    // connection pool size
	CacheSize      int           `env:"CACHE_SIZE" default:"10000"` // maximum cache entries
	RequestTimeout time.Duration `env:"REQUEST_TIMEOUT" default:"30s"`
	// Monitoring configuration
	ProfilingEnabled bool   `env:"PROFILING_ENABLED" default:"true"`
	MetricsAddr      string `env:"METRICS_ADDR" default:":8080"`
	LogLevel         string `env:"LOG_LEVEL" default:"info"`
}
// NewProductionConfig loads configuration from the environment and applies
// the Go runtime tuning knobs (GOMAXPROCS, GOMEMLIMIT, GOGC).
//
// Fix over the original: an unparseable GOMEMLIMIT is now reported as an
// error instead of being silently ignored, so a typo in the deployment
// manifest cannot quietly run the process without a memory limit.
func NewProductionConfig() (*ProductionConfig, error) {
	config := &ProductionConfig{}
	// Load values from environment variables via the struct tags.
	if err := env.Parse(config); err != nil {
		return nil, fmt.Errorf("failed to parse config: %w", err)
	}
	// GOMAXPROCS <= 0 means "leave the runtime default" (all CPUs).
	if config.GOMAXPROCS > 0 {
		runtime.GOMAXPROCS(config.GOMAXPROCS)
	}
	if config.GOMEMLIMIT != "" {
		limit, err := parseMemoryLimit(config.GOMEMLIMIT)
		if err != nil {
			return nil, fmt.Errorf("invalid GOMEMLIMIT %q: %w", config.GOMEMLIMIT, err)
		}
		debug.SetMemoryLimit(limit)
	}
	debug.SetGCPercent(config.GOGC)
	return config, nil
}
Production Deployment Patterns
// Graceful shutdown pattern
// Server bundles the HTTP listener with shutdown hooks and a logger.
type Server struct {
	httpServer *http.Server
	cleanup    []func() error // run in order during Shutdown; errors are logged
	logger     *log.Logger
}
// Start launches the HTTP server in the background, blocks until SIGINT or
// SIGTERM is received, and then performs a graceful shutdown.
func (s *Server) Start() error {
	// Serve in a background goroutine; ErrServerClosed is the normal
	// outcome of a graceful Shutdown and is not reported as a failure.
	go func() {
		err := s.httpServer.ListenAndServe()
		if err != nil && err != http.ErrServerClosed {
			s.logger.Printf("HTTP server error: %v", err)
		}
	}()

	// Block until the process receives a termination signal.
	quit := make(chan os.Signal, 1)
	signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
	<-quit

	s.logger.Println("Shutting down server...")
	return s.Shutdown()
}
// Shutdown gracefully stops the HTTP server, then runs every registered
// cleanup function in order. All errors are logged; the first error
// encountered is also returned so callers can detect a failed shutdown
// (the original always returned nil, hiding failures).
func (s *Server) Shutdown() error {
	// Bound the graceful shutdown so a stuck connection cannot hang the
	// process forever.
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	var firstErr error
	if err := s.httpServer.Shutdown(ctx); err != nil {
		s.logger.Printf("HTTP server shutdown error: %v", err)
		firstErr = err
	}
	// Run every cleanup even if one fails; remember the first failure.
	for _, cleanup := range s.cleanup {
		if err := cleanup(); err != nil {
			s.logger.Printf("Cleanup error: %v", err)
			if firstErr == nil {
				firstErr = err
			}
		}
	}
	return firstErr
}
// Health check endpoints
// setupHealthChecks registers the liveness, readiness, and metrics endpoints.
// NOTE(review): these go on http.DefaultServeMux; if this program also blank-
// imports net/http/pprof, any server using the default mux will expose
// /debug/pprof/ too — verify that is intended.
func (s *Server) setupHealthChecks() {
	http.HandleFunc("/health", s.healthCheck)
	http.HandleFunc("/ready", s.readinessCheck)
	http.HandleFunc("/metrics", s.metricsHandler)
}
// healthCheck is the liveness probe: it answers 200 OK whenever the process
// can serve requests at all, without checking any dependencies.
func (s *Server) healthCheck(w http.ResponseWriter, r *http.Request) {
	// Basic liveness check
	w.WriteHeader(http.StatusOK)
	w.Write([]byte("OK"))
}
// readinessCheck is the readiness probe: it reports 503 until every
// dependency (database, cache) is reachable, and 200 Ready afterwards.
func (s *Server) readinessCheck(w http.ResponseWriter, r *http.Request) {
	ready := s.isDatabaseReady() && s.isCacheReady()
	if !ready {
		w.WriteHeader(http.StatusServiceUnavailable)
		w.Write([]byte("Service Unavailable"))
		return
	}
	w.WriteHeader(http.StatusOK)
	w.Write([]byte("Ready"))
}
Performance Monitoring in Production
Built-in Profiling Integration
import (
_ "net/http/pprof" // Enable pprof endpoints
)
// setupProfiling starts a side-channel HTTP server on :6060 that exposes a
// lightweight runtime-metrics endpoint and, only when ENABLE_PPROF=true, the
// net/http/pprof handlers registered on http.DefaultServeMux by the blank
// import.
//
// Fix over the original: a failure of the metrics listener is logged instead
// of calling log.Fatal from inside the goroutine, which would have torn down
// the entire process just because the observability port was unavailable.
// The JSON encode error is also no longer silently discarded.
func setupProfiling() {
	mux := http.NewServeMux()

	// pprof handlers are gated behind an env flag so they are not exposed
	// by default in production.
	if os.Getenv("ENABLE_PPROF") == "true" {
		mux.Handle("/debug/pprof/", http.DefaultServeMux)
	}

	// Minimal JSON snapshot of runtime health.
	mux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
		var m runtime.MemStats
		runtime.ReadMemStats(&m)
		metrics := map[string]interface{}{
			"goroutines":   runtime.NumGoroutine(),
			"memory_alloc": m.Alloc,
			"memory_sys":   m.Sys,
			"gc_runs":      m.NumGC,
			"heap_objects": m.HeapObjects,
		}
		w.Header().Set("Content-Type", "application/json")
		if err := json.NewEncoder(w).Encode(metrics); err != nil {
			log.Printf("metrics encode error: %v", err)
		}
	})

	// Serve metrics on a separate port so the main listener's middleware
	// and auth stack do not apply to it.
	go func() {
		log.Printf("Metrics server starting on :6060")
		if err := http.ListenAndServe(":6060", mux); err != nil {
			// Never log.Fatal here: losing the metrics port must not
			// kill the application.
			log.Printf("metrics server error: %v", err)
		}
	}()
}
Custom Metrics Collection
// Production metrics system
// MetricsCollector bundles the Prometheus instruments for HTTP traffic and
// Go runtime health.
type MetricsCollector struct {
	requestDuration *prometheus.HistogramVec // latency by method/endpoint/status
	requestCount    *prometheus.CounterVec   // request totals by method/endpoint/status
	activeRequests  prometheus.Gauge         // in-flight request count
	errorCount      *prometheus.CounterVec   // 4xx/5xx totals by method/endpoint/error_type
	goroutineCount  prometheus.Gauge         // runtime.NumGoroutine snapshot
	memoryUsage     prometheus.Gauge         // MemStats.Alloc snapshot
}
// NewMetricsCollector creates the HTTP and runtime instruments, registers
// them with the default Prometheus registry, and starts a background
// goroutine sampling runtime stats every 10 seconds.
//
// NOTE(review): MustRegister panics on duplicate registration, so this
// constructor must be called at most once per process. The sampling
// goroutine started here has no stop mechanism and runs for the life of the
// process — confirm that is acceptable.
func NewMetricsCollector() *MetricsCollector {
	mc := &MetricsCollector{
		requestDuration: prometheus.NewHistogramVec(
			prometheus.HistogramOpts{
				Name:    "http_request_duration_seconds",
				Help:    "HTTP request duration in seconds",
				Buckets: prometheus.DefBuckets,
			},
			[]string{"method", "endpoint", "status"},
		),
		requestCount: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Name: "http_requests_total",
				Help: "Total number of HTTP requests",
			},
			[]string{"method", "endpoint", "status"},
		),
		activeRequests: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Name: "http_active_requests",
				Help: "Number of active HTTP requests",
			},
		),
		errorCount: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Name: "http_errors_total",
				Help: "Total number of HTTP errors",
			},
			[]string{"method", "endpoint", "error_type"},
		),
		goroutineCount: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Name: "go_goroutines",
				Help: "Number of goroutines",
			},
		),
		memoryUsage: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Name: "go_memory_usage_bytes",
				Help: "Memory usage in bytes",
			},
		),
	}
	// Register with the default registry; panics if any name is already taken.
	prometheus.MustRegister(
		mc.requestDuration,
		mc.requestCount,
		mc.activeRequests,
		mc.errorCount,
		mc.goroutineCount,
		mc.memoryUsage,
	)
	// Start background metrics collection (runs until process exit).
	go mc.collectRuntimeMetrics()
	return mc
}
// collectRuntimeMetrics samples the goroutine count and allocated heap bytes
// every 10 seconds, forever. The ticker is stopped via defer, but the loop
// itself has no cancellation path, so this goroutine lives until the process
// exits.
func (mc *MetricsCollector) collectRuntimeMetrics() {
	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()
	for range ticker.C {
		var m runtime.MemStats
		// ReadMemStats briefly stops the world; at a 10s period the cost
		// is negligible.
		runtime.ReadMemStats(&m)
		mc.goroutineCount.Set(float64(runtime.NumGoroutine()))
		mc.memoryUsage.Set(float64(m.Alloc))
	}
}
// Middleware for request metrics
// Middleware instruments a handler with duration, count, in-flight, and
// error metrics.
//
// NOTE(review): r.URL.Path is used as a label value, so every distinct URL
// (including embedded IDs) creates a new label combination — on public
// traffic this produces unbounded metric cardinality. Prefer the matched
// route pattern if the router exposes one.
func (mc *MetricsCollector) Middleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		start := time.Now()
		mc.activeRequests.Inc()
		defer mc.activeRequests.Dec()
		// Wrap response writer to capture the status code; defaults to 200
		// when the handler never calls WriteHeader.
		// NOTE(review): the wrapper hides optional interfaces such as
		// http.Flusher/http.Hijacker from downstream handlers.
		ww := &responseWriter{ResponseWriter: w, statusCode: 200}
		next.ServeHTTP(ww, r)
		duration := time.Since(start).Seconds()
		status := strconv.Itoa(ww.statusCode)
		mc.requestDuration.WithLabelValues(r.Method, r.URL.Path, status).Observe(duration)
		mc.requestCount.WithLabelValues(r.Method, r.URL.Path, status).Inc()
		// Classify 4xx as client errors and 5xx as server errors.
		if ww.statusCode >= 400 {
			errorType := "client_error"
			if ww.statusCode >= 500 {
				errorType = "server_error"
			}
			mc.errorCount.WithLabelValues(r.Method, r.URL.Path, errorType).Inc()
		}
	})
}
type responseWriter struct {
http.ResponseWriter
statusCode int
}
func (rw *responseWriter) WriteHeader(code int) {
rw.statusCode = code
rw.ResponseWriter.WriteHeader(code)
}
Performance Testing in Production
Load Testing Framework
// Production load testing
// LoadTester drives concurrent HTTP GET load against a single target URL.
type LoadTester struct {
	target   string          // URL to request
	clients  int             // number of concurrent client goroutines
	duration time.Duration   // total test length
	rampUp   time.Duration   // window over which clients are started
	results  chan TestResult // per-request results, drained by collectResults
	metrics  *MetricsCollector
}
// TestResult records one request's outcome.
type TestResult struct {
	Timestamp    time.Time     // when the request started
	Duration     time.Duration // total request latency
	StatusCode   int           // zero when the request failed before a response
	Error        error         // transport-level error, nil on success
	RequestSize  int64
	ResponseSize int64
}
// NewLoadTester configures a load test with a ramp-up window of 10% of the
// total duration and a result channel buffered at 100 entries per client.
//
// NOTE(review): NewMetricsCollector registers collectors with the global
// Prometheus registry and panics on duplicate registration, so constructing
// two LoadTesters in one process will panic — verify this is intended.
func NewLoadTester(target string, clients int, duration time.Duration) *LoadTester {
	return &LoadTester{
		target:   target,
		clients:  clients,
		duration: duration,
		rampUp:   duration / 10, // 10% ramp-up time
		results:  make(chan TestResult, clients*100),
		metrics:  NewMetricsCollector(),
	}
}
// Run executes the load test: clients are started gradually across the
// ramp-up window, each loops requests until the overall deadline, and
// results are aggregated by collectResults.
//
// Fixes over the original: a non-positive client count is rejected instead
// of causing a division-by-zero panic when computing the stagger interval,
// and Run now waits for the result collector to finish so the summary is
// guaranteed to be printed before Run returns.
func (lt *LoadTester) Run() error {
	if lt.clients <= 0 {
		return fmt.Errorf("load tester requires at least one client, got %d", lt.clients)
	}
	ctx, cancel := context.WithTimeout(context.Background(), lt.duration)
	defer cancel()

	// The collector drains lt.results until the channel is closed.
	collectorDone := make(chan struct{})
	go func() {
		defer close(collectorDone)
		lt.collectResults()
	}()

	// Stagger client start times across the ramp-up window.
	clientInterval := lt.rampUp / time.Duration(lt.clients)
	var wg sync.WaitGroup
	for i := 0; i < lt.clients; i++ {
		wg.Add(1)
		go func(clientID int) {
			defer wg.Done()
			lt.runClient(ctx, clientID)
		}(i)
		time.Sleep(clientInterval)
	}
	wg.Wait()
	close(lt.results)

	// Wait until the collector has consumed every result and printed the
	// summary (the original returned immediately, racing with the report).
	<-collectorDone
	return nil
}
// runClient drives request traffic for a single simulated client until the
// test context is cancelled. The result send also respects cancellation so
// the goroutine can never block forever on a full channel.
func (lt *LoadTester) runClient(ctx context.Context, clientID int) {
	httpClient := &http.Client{Timeout: 30 * time.Second}
	for ctx.Err() == nil {
		res := lt.makeRequest(httpClient)
		select {
		case lt.results <- res:
		case <-ctx.Done():
			return
		}
	}
}
// makeRequest issues a single GET against the target and records timing,
// status, and size information.
//
// Fixes over the original: ResponseSize is now the number of body bytes
// actually read rather than resp.ContentLength (which is -1 for chunked
// responses), the drain error is surfaced instead of discarded, and the
// body close is deferred so it also runs on a drain failure. Fully reading
// the body keeps the underlying connection reusable.
func (lt *LoadTester) makeRequest(client *http.Client) TestResult {
	start := time.Now()
	resp, err := client.Get(lt.target)
	duration := time.Since(start)

	result := TestResult{
		Timestamp: start,
		Duration:  duration,
		Error:     err,
	}
	if err != nil {
		return result
	}
	defer resp.Body.Close()

	result.StatusCode = resp.StatusCode
	// Drain the body to simulate real clients and allow keep-alive reuse.
	n, copyErr := io.Copy(io.Discard, resp.Body)
	result.ResponseSize = n
	if copyErr != nil {
		result.Error = copyErr
	}
	return result
}
// collectResults aggregates every TestResult from lt.results until the
// channel is closed, then prints a summary with success rate and latency
// percentiles.
//
// Fixes over the original: an empty result stream no longer panics with an
// index-out-of-range / divide-by-zero, and percentile indexing is clamped
// so P99 cannot read past the end of the slice for small sample counts.
func (lt *LoadTester) collectResults() {
	var (
		totalRequests   int64
		successRequests int64
		totalDuration   time.Duration
		minDuration     time.Duration = time.Hour
		maxDuration     time.Duration
		durations       []time.Duration
	)
	for result := range lt.results {
		totalRequests++
		totalDuration += result.Duration
		// Transport errors and HTTP 4xx/5xx both count as failures.
		if result.Error == nil && result.StatusCode < 400 {
			successRequests++
		}
		if result.Duration < minDuration {
			minDuration = result.Duration
		}
		if result.Duration > maxDuration {
			maxDuration = result.Duration
		}
		durations = append(durations, result.Duration)
	}
	if totalRequests == 0 {
		fmt.Println("Load Test Results: no requests recorded")
		return
	}
	// Sort once, then read percentiles by index (clamped to the last entry).
	sort.Slice(durations, func(i, j int) bool {
		return durations[i] < durations[j]
	})
	percentile := func(p int) time.Duration {
		idx := len(durations) * p / 100
		if idx >= len(durations) {
			idx = len(durations) - 1
		}
		return durations[idx]
	}
	fmt.Printf("Load Test Results:\n")
	fmt.Printf("Total Requests: %d\n", totalRequests)
	fmt.Printf("Success Rate: %.2f%%\n", float64(successRequests)/float64(totalRequests)*100)
	fmt.Printf("Average Duration: %v\n", totalDuration/time.Duration(totalRequests))
	fmt.Printf("Min Duration: %v\n", minDuration)
	fmt.Printf("Max Duration: %v\n", maxDuration)
	fmt.Printf("P50 Duration: %v\n", percentile(50))
	fmt.Printf("P95 Duration: %v\n", percentile(95))
	fmt.Printf("P99 Duration: %v\n", percentile(99))
}
Chaos Engineering
// Chaos testing for production resilience
// ChaosTest pairs a named fault injection with the per-evaluation
// probability of applying it.
type ChaosTest struct {
	name        string
	probability float64     // chance per evaluation, expected in [0, 1]
	impact      ChaosImpact // the fault to inject and undo
}
// ChaosImpact is a reversible fault injection: Apply introduces the fault,
// Restore removes it.
type ChaosImpact interface {
	Apply() error
	Restore() error
}
// Network latency injection
// NetworkLatency injects artificial network delay for chaos experiments.
type NetworkLatency struct {
	delay time.Duration // extra latency to add
}

// Apply injects the configured delay via injectNetworkDelay, whose
// implementation depends on the infrastructure (e.g. traffic shaping).
func (nl *NetworkLatency) Apply() error {
	// Inject network latency (implementation depends on infrastructure)
	return injectNetworkDelay(nl.delay)
}

// Restore removes the injected delay.
func (nl *NetworkLatency) Restore() error {
	return removeNetworkDelay()
}
// Memory pressure simulation
// MemoryPressure simulates memory pressure by pinning a large allocation
// for the duration of the chaos experiment.
type MemoryPressure struct {
	size int64  // number of bytes to allocate
	data []byte // live allocation held between Apply and Restore
}

// Apply allocates size bytes and fills them with random data so the
// allocation cannot be optimized away.
func (mp *MemoryPressure) Apply() error {
	buf := make([]byte, mp.size)
	rand.Read(buf)
	mp.data = buf
	return nil
}

// Restore drops the allocation and forces a collection so the memory is
// returned promptly.
func (mp *MemoryPressure) Restore() error {
	mp.data = nil
	runtime.GC()
	return nil
}
// CPU stress simulation
// CPUStress burns CPU on a fixed number of goroutines to simulate compute
// contention during chaos experiments.
type CPUStress struct {
	workers int       // number of busy-loop goroutines to start
	done    chan bool // closed by Restore to stop the workers
}

// Apply starts cs.workers goroutines that spin until Restore is called.
func (cs *CPUStress) Apply() error {
	cs.done = make(chan bool)
	for i := 0; i < cs.workers; i++ {
		go func() {
			for {
				select {
				case <-cs.done:
					return
				default:
					// Busy work between cancellation checks.
					for j := 0; j < 1000000; j++ {
						_ = j * j
					}
				}
			}
		}()
	}
	return nil
}

// Restore stops the busy-loop goroutines.
//
// Fix over the original: Restore is now safe to call before Apply and
// idempotent on repeated calls — closing a nil or already-closed channel
// panics, so the channel is checked first and cleared after the close.
func (cs *CPUStress) Restore() error {
	if cs.done == nil {
		return nil
	}
	close(cs.done)
	cs.done = nil
	return nil
}
// Chaos test runner
// runChaosTests evaluates each chaos test once per minute for the given
// overall duration; a test fires when rand.Float64() falls below its
// probability, stays applied for 30 seconds, and is then restored.
//
// NOTE(review): the 30s sleep runs inline in the loop, so ticks are not
// processed while a fault is active, and multiple eligible tests in one
// tick are applied one after another — confirm that serialization is
// intended.
func runChaosTests(tests []ChaosTest, duration time.Duration) {
	ctx, cancel := context.WithTimeout(context.Background(), duration)
	defer cancel()
	ticker := time.NewTicker(time.Minute)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			for _, test := range tests {
				if rand.Float64() < test.probability {
					log.Printf("Applying chaos test: %s", test.name)
					if err := test.impact.Apply(); err != nil {
						log.Printf("Failed to apply chaos test %s: %v", test.name, err)
						continue
					}
					// Let chaos run for a short time
					time.Sleep(30 * time.Second)
					if err := test.impact.Restore(); err != nil {
						log.Printf("Failed to restore from chaos test %s: %v", test.name, err)
					}
					log.Printf("Restored from chaos test: %s", test.name)
				}
			}
		}
	}
}
Production Optimization Strategies
Resource Management
// Production resource management
type ResourceManager struct {
pools map[string]*sync.Pool
mu sync.RWMutex
}
func NewResourceManager() *ResourceManager {
rm := &ResourceManager{
pools: make(map[string]*sync.Pool),
}
// Pre-configure common pools
rm.RegisterPool("buffer", func() interface{} {
return make([]byte, 0, 4096)
})
rm.RegisterPool("strings", func() interface{} {
return &strings.Builder{}
})
rm.RegisterPool("json_encoder", func() interface{} {
var buf bytes.Buffer
return json.NewEncoder(&buf)
})
return rm
}
func (rm *ResourceManager) RegisterPool(name string, factory func() interface{}) {
rm.mu.Lock()
defer rm.mu.Unlock()
rm.pools[name] = &sync.Pool{New: factory}
}
func (rm *ResourceManager) Get(name string) interface{} {
rm.mu.RLock()
pool, exists := rm.pools[name]
rm.mu.RUnlock()
if !exists {
return nil
}
return pool.Get()
}
func (rm *ResourceManager) Put(name string, obj interface{}) {
rm.mu.RLock()
pool, exists := rm.pools[name]
rm.mu.RUnlock()
if exists {
pool.Put(obj)
}
}
Circuit Breaker Pattern
// Circuit breaker for production resilience
type CircuitBreaker struct {
name string
maxFailures int
resetTimeout time.Duration
state CircuitState
failures int
lastFailTime time.Time
mu sync.RWMutex
}
type CircuitState int
const (
Closed CircuitState = iota
Open
HalfOpen
)
func NewCircuitBreaker(name string, maxFailures int, resetTimeout time.Duration) *CircuitBreaker {
return &CircuitBreaker{
name: name,
maxFailures: maxFailures,
resetTimeout: resetTimeout,
state: Closed,
}
}
func (cb *CircuitBreaker) Execute(fn func() error) error {
cb.mu.Lock()
defer cb.mu.Unlock()
switch cb.state {
case Open:
if time.Since(cb.lastFailTime) > cb.resetTimeout {
cb.state = HalfOpen
cb.failures = 0
} else {
return fmt.Errorf("circuit breaker %s is open", cb.name)
}
case HalfOpen:
// Allow one request to test if service is recovered
case Closed:
// Normal operation
}
err := fn()
if err != nil {
cb.failures++
cb.lastFailTime = time.Now()
if cb.failures >= cb.maxFailures {
cb.state = Open
}
return err
}
// Success - reset circuit breaker
cb.failures = 0
cb.state = Closed
return nil
}
func (cb *CircuitBreaker) State() CircuitState {
cb.mu.RLock()
defer cb.mu.RUnlock()
return cb.state
}
Observability and Debugging
Distributed Tracing
// Distributed tracing for production debugging
// TraceSpan is one timed operation in a distributed trace.
type TraceSpan struct {
	ID       string            // unique span identifier
	ParentID string            // empty for root spans
	Name     string            // operation name
	Start    time.Time
	End      time.Time         // zero until FinishSpan is called
	Tags     map[string]string // key/value annotations
	Logs     []TraceLog        // timestamped events within the span
}
// TraceLog is a timestamped event attached to a span.
type TraceLog struct {
	Timestamp time.Time
	Message   string
	Level     string // severity label
}
// Tracer is a minimal in-process span store; finished spans are exported to
// the tracing backend.
type Tracer struct {
	spans map[string]*TraceSpan // active (unfinished) spans by ID
	mu    sync.RWMutex
}

// NewTracer returns an empty tracer.
func NewTracer() *Tracer {
	return &Tracer{
		spans: make(map[string]*TraceSpan),
	}
}

// StartSpan creates and registers a new span with the given name and parent.
func (t *Tracer) StartSpan(name string, parentID string) *TraceSpan {
	span := &TraceSpan{
		ID:       generateSpanID(),
		ParentID: parentID,
		Name:     name,
		Start:    time.Now(),
		Tags:     make(map[string]string),
	}
	t.mu.Lock()
	t.spans[span.ID] = span
	t.mu.Unlock()
	return span
}

// FinishSpan stamps the span's end time, exports it, and removes it from
// the active set.
//
// Fix over the original: the span is deleted from the map after export.
// The original kept every finished span forever, an unbounded memory leak
// in any long-running process.
func (t *Tracer) FinishSpan(spanID string) {
	t.mu.Lock()
	defer t.mu.Unlock()
	if span, exists := t.spans[spanID]; exists {
		span.End = time.Now()
		// Hand off to the tracing backend, then drop our reference.
		t.exportSpan(span)
		delete(t.spans, spanID)
	}
}

// exportSpan sends a finished span to the tracing backend (Jaeger, Zipkin,
// etc.); the implementation depends on the chosen backend.
func (t *Tracer) exportSpan(span *TraceSpan) {
}
// Context propagation
// contextKey is a private type for context values, preventing collisions
// with keys defined in other packages.
type contextKey string

const traceContextKey contextKey = "trace"

// WithTrace returns a child context carrying the given span.
func WithTrace(ctx context.Context, span *TraceSpan) context.Context {
	return context.WithValue(ctx, traceContextKey, span)
}

// SpanFromContext extracts the current span from ctx; ok is false when no
// span was attached.
func SpanFromContext(ctx context.Context) (*TraceSpan, bool) {
	span, ok := ctx.Value(traceContextKey).(*TraceSpan)
	return span, ok
}
Best Practices for Production Performance
1. Monitoring and Alerting
- Implement comprehensive metrics collection
- Set up alerting for performance degradation
- Use distributed tracing for debugging
- Monitor business metrics alongside technical metrics
2. Graceful Degradation
- Implement circuit breakers for external dependencies
- Use timeouts and retries with backoff
- Provide fallback mechanisms
- Design for partial failures
3. Resource Management
- Use connection pooling for databases and external services
- Implement proper resource cleanup
- Monitor resource usage continuously
- Set appropriate limits and quotas
4. Testing and Validation
- Conduct regular load testing
- Implement chaos engineering practices
- Validate performance in staging environments
- Use canary deployments for performance validation
Production performance engineering requires a holistic approach combining monitoring, testing, optimization, and operational excellence to ensure applications perform reliably under real-world conditions.