Skip to main content

OpenTelemetry Tracing

Panduan lengkap implementasi OpenTelemetry (OTEL) untuk observability di MStore Backend.

🎯 Overview

MStore Backend menggunakan OpenTelemetry untuk:
  • Distributed Tracing - Track request flow across services
  • Structured Logging - Contextual logs dengan correlation ID
  • Metrics Collection - Performance metrics & monitoring
Package: pkg/utils/logger/otel.go

📊 Architecture

OTEL Stack


🚀 Quick Start

1. Initialize OTEL

package main

import (
    "context"
    utils_logger "gitlab.com/mushola-store/mstore_backend/pkg/utils/logger"
)

func main() {
    // OTEL settings passed to InitOpenTelemetry, in argument order.
    const (
        serviceName  = "pos-api"        // service name
        version      = "1.0.0"          // version
        environment  = "production"     // environment
        otlpEndpoint = "localhost:4317" // OTLP endpoint
    )

    rootCtx := context.Background()

    // Initialize OpenTelemetry; the example aborts start-up if this fails.
    shutdown, err := utils_logger.InitOpenTelemetry(rootCtx, serviceName, version, environment, otlpEndpoint)
    if err != nil {
        panic(err)
    }
    // Run the shutdown function returned by InitOpenTelemetry when main returns.
    defer shutdown(rootCtx)

    // Your application code
}

🎨 Trace Layers

Layer Types

MStore menggunakan 4 layer standar untuk tracing:
// The four standard trace layers used to label spans; pass one of these as
// the layer argument when starting a layer span.
const (
    LayerHandler    TraceLayer = "handler"     // HTTP/gRPC handlers
    LayerService    TraceLayer = "service"     // Business logic
    LayerRepository TraceLayer = "repository"  // Database operations
    LayerExternal   TraceLayer = "external"    // External API calls
)

📝 Usage Examples

Handler Layer

package transaction_handler

import (
    "errors"

    "github.com/gofiber/fiber/v2"
    utils_logger "gitlab.com/mushola-store/mstore_backend/pkg/utils/logger"
)

// CreateTransaction parses and validates the request body, delegates to the
// transaction service, and writes the result to the response. Each step is
// recorded on a handler-layer span, and every error path records the error
// as a step before ending the span with it.
func (h *TransactionHandler) CreateTransaction(ctx *fiber.Ctx) error {
    // Start handler span
    fctx, span := utils_logger.StartLayerSpanFromFiber(
        ctx,
        utils_logger.LayerHandler,
        "CreateTransaction",
    )
    // Deferred so the span is always ended; error paths additionally call
    // EndLayerSpan with the error before returning.
    defer utils_logger.EndLayerSpan(span, nil)

    // Step 1: Parse body
    utils_logger.TraceStep(fctx, "parse_request_body")
    var payload PayloadCreateTransaction
    if err := ctx.BodyParser(&payload); err != nil {
        utils_logger.TraceStepErr(fctx, "parse_body_failed", err)
        return utils_logger.EndLayerSpan(span, err)
    }

    // Step 2: Validate
    utils_logger.TraceStep(fctx, "validate_payload")
    if payload.BranchCode == "" {
        err := errors.New("branch_code required")
        utils_logger.TraceStepErr(fctx, "validation_failed", err)
        return utils_logger.EndLayerSpan(span, err)
    }

    // Step 3: Call service (fctx carries the handler span to the service layer)
    utils_logger.TraceStep(fctx, "call_service")
    result, err := h.transactionService.CreateTransaction(fctx, payload)
    if err != nil {
        // Record the failure on the handler span as well, matching the other
        // error paths and the "Always Log Errors" rule.
        utils_logger.TraceStepErr(fctx, "call_service_failed", err)
        return utils_logger.EndLayerSpan(span, err)
    }

    // Success
    utils_logger.AddStepEvent(fctx, "transaction_created", map[string]interface{}{
        "transaction_id": result.ID,
        "grand_total": result.GrandTotal,
    })

    return utils_response.Result(ctx, result, nil)
}

Service Layer

package transaction_service

import (
    "context"
    utils_logger "gitlab.com/mushola-store/mstore_backend/pkg/utils/logger"
)

// CreateTransaction looks up the branch by code, computes the totals, and
// persists a new transaction. Each step is recorded on a service-layer span,
// and every error path records the error as a step before ending the span
// with it.
func (s *TransactionService) CreateTransaction(
    ctx context.Context,
    payload PayloadCreateTransaction,
) (*Transaction, error) {
    // Start service span
    ctx, span := utils_logger.StartLayerSpan(
        ctx,
        utils_logger.LayerService,
        "CreateTransaction",
    )
    // Deferred so the span is always ended; error paths additionally call
    // EndLayerSpan with the error before returning.
    defer utils_logger.EndLayerSpan(span, nil)

    // Step 1: Get branch
    utils_logger.TraceStep(ctx, "get_branch")
    branch, err := s.branchRepo.GetByCode(ctx, payload.BranchCode)
    if err != nil {
        utils_logger.TraceStepErr(ctx, "branch_not_found", err)
        return nil, utils_logger.EndLayerSpan(span, err)
    }

    // Step 2: Calculate totals
    utils_logger.TraceStep(ctx, "calculate_totals")
    subtotal := calculateSubtotal(payload.Items)
    grandTotal := subtotal - payload.DiscountTotal + payload.TaxTotal

    utils_logger.AddStepEvent(ctx, "totals_calculated", map[string]interface{}{
        "subtotal": subtotal,
        "grand_total": grandTotal,
    })

    // Step 3: Save transaction
    utils_logger.TraceStep(ctx, "save_transaction")
    transaction := &Transaction{
        BranchID: branch.ID,
        Subtotal: subtotal,
        GrandTotal: grandTotal,
    }

    if err := s.transactionRepo.Create(ctx, transaction); err != nil {
        // Record the failure on the service span as well, matching the
        // branch-lookup error path and the "Always Log Errors" rule.
        utils_logger.TraceStepErr(ctx, "save_transaction_failed", err)
        return nil, utils_logger.EndLayerSpan(span, err)
    }

    return transaction, nil
}

Repository Layer

package transaction_repository

import (
    "context"
    "gorm.io/gorm"
    utils_logger "gitlab.com/mushola-store/mstore_backend/pkg/utils/logger"
)

// Create inserts the transaction through GORM inside a repository-layer
// span, emitting an event before the insert and another after it succeeds.
func (r *TransactionRepository) Create(ctx context.Context, transaction *Transaction) error {
    // Open the repository span; the deferred call ends it on every path.
    ctx, span := utils_logger.StartLayerSpan(ctx, utils_logger.LayerRepository, "CreateTransaction")
    defer utils_logger.EndLayerSpan(span, nil)

    // Describe the database operation about to run.
    insertAttrs := map[string]interface{}{
        "table": "transactions",
        "operation": "INSERT",
    }
    utils_logger.AddStepEvent(ctx, "db_insert", insertAttrs)

    // Run the insert with the span-carrying context.
    insertErr := r.db.WithContext(ctx).Create(transaction).Error
    if insertErr != nil {
        utils_logger.TraceStepErr(ctx, "db_insert_failed", insertErr)
        return utils_logger.EndLayerSpan(span, insertErr)
    }

    // Report the ID of the stored row.
    successAttrs := map[string]interface{}{
        "transaction_id": transaction.ID,
    }
    utils_logger.AddStepEvent(ctx, "db_insert_success", successAttrs)

    return nil
}

External API Layer

package xendit_service

import (
    "context"
    "net/http"
    utils_logger "gitlab.com/mushola-store/mstore_backend/pkg/utils/logger"
)

// CreateQRIS sends a POST to the Xendit /qr_codes endpoint for the given
// amount inside an external-layer span and returns the client's response.
// The call is recorded with an event before the request and another after
// it succeeds; a failure is recorded as an error step.
func (s *XenditService) CreateQRIS(
    ctx context.Context,
    amount float64,
) (*QRISResponse, error) {
    // Start external span
    ctx, span := utils_logger.StartLayerSpan(
        ctx,
        utils_logger.LayerExternal,
        "Xendit.CreateQRIS",
    )
    defer utils_logger.EndLayerSpan(span, nil)

    // Add external service attributes
    utils_logger.AddStepEvent(ctx, "external_api_call", map[string]interface{}{
        "service": "xendit",
        "endpoint": "/qr_codes",
        "method": "POST",
    })

    // Build the request body from the requested amount.
    // NOTE(review): only the amount is available in this example; add the
    // remaining fields required by the Xendit QR code API and confirm the
    // payload type expected by s.httpClient.Post.
    payload := map[string]interface{}{
        "amount": amount,
    }

    // Make HTTP request
    resp, err := s.httpClient.Post(ctx, "/qr_codes", payload)
    if err != nil {
        utils_logger.TraceStepErr(ctx, "api_call_failed", err)
        return nil, utils_logger.EndLayerSpan(span, err)
    }

    utils_logger.AddStepEvent(ctx, "api_call_success", map[string]interface{}{
        "status_code": resp.StatusCode,
        "qr_id": resp.ID,
    })

    return resp, nil
}

🔧 Helper Functions

StartLayerSpan

Memulai span baru dengan layer context.
// StartLayerSpan starts a new span named operationName for the given trace
// layer and returns a context together with the span.
func StartLayerSpan(
    ctx context.Context,
    layer TraceLayer,
    operationName string,
) (context.Context, oteltrace.Span)
Example:
ctx, span := utils_logger.StartLayerSpan(
    ctx,
    utils_logger.LayerService,
    "ProcessPayment",
)
defer utils_logger.EndLayerSpan(span, nil)

StartLayerSpanFromFiber

Memulai span dari Fiber context (untuk handlers).
// StartLayerSpanFromFiber starts a span from a Fiber request context (for
// use in handlers) and returns a context.Context together with the span.
func StartLayerSpanFromFiber(
    fctx *fiber.Ctx,
    layer TraceLayer,
    operationName string,
) (context.Context, oteltrace.Span)
Example:
ctx, span := utils_logger.StartLayerSpanFromFiber(
    fiberCtx,
    utils_logger.LayerHandler,
    "GetUserProfile",
)
defer utils_logger.EndLayerSpan(span, nil)

TraceStep

Mencatat step dalam span (tanpa error).
// TraceStep records a step named stepName in the span (no error attached).
func TraceStep(ctx context.Context, stepName string)
Example:
utils_logger.TraceStep(ctx, "validate_input")
utils_logger.TraceStep(ctx, "fetch_user_data")
utils_logger.TraceStep(ctx, "calculate_total")

TraceStepErr

Mencatat step dengan error.
// TraceStepErr records a step named stepName together with the error err.
func TraceStepErr(ctx context.Context, stepName string, err error)
Example:
if err := validatePayload(payload); err != nil {
    utils_logger.TraceStepErr(ctx, "validation_failed", err)
    return err
}

AddStepEvent

Menambahkan event dengan attributes ke span.
// AddStepEvent adds an event named eventName, carrying the given attributes,
// to the span.
func AddStepEvent(
    ctx context.Context,
    eventName string,
    attrs map[string]interface{},
)
Example:
utils_logger.AddStepEvent(ctx, "payment_processed", map[string]interface{}{
    "payment_id": payment.ID,
    "amount": payment.Amount,
    "method": payment.Method,
    "status": "success",
})

EndLayerSpan

Mengakhiri span dengan error handling.
// EndLayerSpan ends the span, handling err; pass nil when there is no error.
// NOTE(review): the examples on this page return its result directly from
// error paths, so it presumably returns the err it was given — confirm.
func EndLayerSpan(span oteltrace.Span, err error) error
Example:
defer utils_logger.EndLayerSpan(span, nil)

// Or with error
if err != nil {
    return utils_logger.EndLayerSpan(span, err)
}

🎯 Best Practices

1. Always Defer EndLayerSpan

// ✅ GOOD
ctx, span := utils_logger.StartLayerSpan(ctx, layer, "Operation")
defer utils_logger.EndLayerSpan(span, nil)

// ❌ BAD - Span tidak akan ditutup jika panic
ctx, span := utils_logger.StartLayerSpan(ctx, layer, "Operation")
// ... code ...
utils_logger.EndLayerSpan(span, nil)

2. Use Descriptive Operation Names

// ✅ GOOD
StartLayerSpan(ctx, LayerService, "CreateTransactionWithPayment")
StartLayerSpan(ctx, LayerRepository, "GetUserByEmail")

// ❌ BAD
StartLayerSpan(ctx, LayerService, "Process")
StartLayerSpan(ctx, LayerRepository, "Get")

3. Add Meaningful Attributes

// ✅ GOOD
utils_logger.AddStepEvent(ctx, "order_created", map[string]interface{}{
    "order_id": order.ID,
    "customer_id": order.CustomerID,
    "total_amount": order.TotalAmount,
    "item_count": len(order.Items),
})

// ❌ BAD
utils_logger.AddStepEvent(ctx, "done", map[string]interface{}{
    "status": "ok",
})

4. Trace Important Steps

// ✅ GOOD - Trace key steps
utils_logger.TraceStep(ctx, "validate_input")
utils_logger.TraceStep(ctx, "check_inventory")
utils_logger.TraceStep(ctx, "reserve_stock")
utils_logger.TraceStep(ctx, "create_transaction")
utils_logger.TraceStep(ctx, "process_payment")

// ❌ BAD - Too granular
utils_logger.TraceStep(ctx, "line_1")
utils_logger.TraceStep(ctx, "line_2")
utils_logger.TraceStep(ctx, "line_3")

5. Always Log Errors

// ✅ GOOD
if err := service.Process(ctx, data); err != nil {
    utils_logger.TraceStepErr(ctx, "process_failed", err)
    return utils_logger.EndLayerSpan(span, err)
}

// ❌ BAD - Error tidak tercatat
if err := service.Process(ctx, data); err != nil {
    return err
}

📊 Viewing Traces

Grafana Tempo

Access: http://localhost:3000/explore

Query Examples:
# Find traces by service (service.name is a resource attribute)
{ resource.service.name = "pos-api" }

# Find traces by operation (the span name is the "name" intrinsic)
{ name = "CreateTransaction" }

# Find traces with errors (status is an intrinsic; its values are unquoted)
{ status = error }

# Find traces by duration
{ duration > 1s }

# Find traces by attribute (custom attributes need a scope such as span.)
{ span.transaction.id = "TRX-123" }

Trace Structure

Span: CreateTransaction (handler)
├── Step: parse_request_body
├── Step: validate_payload
├── Span: CreateTransaction (service)
│   ├── Step: get_branch
│   ├── Step: calculate_totals
│   ├── Span: CreateTransaction (repository)
│   │   ├── Event: db_insert
│   │   └── Event: db_insert_success
│   └── Span: Xendit.CreateQRIS (external)
│       ├── Event: external_api_call
│       └── Event: api_call_success
└── Event: transaction_created

🔍 Troubleshooting

Traces Not Appearing

Check:
  1. OTLP endpoint accessible
  2. Service name configured
  3. Spans properly closed
  4. No network issues
# Test OTLP endpoint
curl -v http://localhost:4317

# Check logs
tail -f logs/app.log | grep -i otel

High Cardinality Attributes

Problem: Too many unique attribute values.
Solution: Use bounded values.
// ✅ GOOD
attrs["status"] = "success" // bounded: success|failed|pending

// ❌ BAD
attrs["user_id"] = userID // unbounded: millions of values

Missing Context

Problem: Context not propagated.
Solution: Always pass context.
// ✅ GOOD
// ProcessOrder opens a service-layer span and hands the resulting ctx to the
// repository call, so the next layer receives the same context.
func ProcessOrder(ctx context.Context, order Order) error {
    // The layer constant is package-qualified, like StartLayerSpan itself.
    ctx, span := utils_logger.StartLayerSpan(ctx, utils_logger.LayerService, "ProcessOrder")
    defer utils_logger.EndLayerSpan(span, nil)

    // Pass context to next layer
    return repo.SaveOrder(ctx, order)
}

// ❌ BAD
func ProcessOrder(order Order) error {
    // No context - traces disconnected
    return repo.SaveOrder(order)
}

🎨 Configuration

Environment Variables

# OTLP Endpoint
OTEL_EXPORTER_OTLP_ENDPOINT=localhost:4317

# Service Info
OTEL_SERVICE_NAME=pos-api
OTEL_SERVICE_VERSION=1.0.0
OTEL_DEPLOYMENT_ENVIRONMENT=production

# Sampling
OTEL_TRACES_SAMPLER=parentbased_traceidratio
OTEL_TRACES_SAMPLER_ARG=0.1  # 10% sampling

# Enable/Disable
OTEL_SDK_DISABLED=false

Sampling Strategy

Development: 100% sampling
sampler := sdktrace.AlwaysSample()
Production: Ratio-based sampling (10% of new traces; child spans follow their parent's sampling decision)
sampler := sdktrace.ParentBased(
    sdktrace.TraceIDRatioBased(0.1), // 10%
)

System Design

Overall system architecture

Observability (LGTM)

LGTM stack overview

Grafana Tempo

Tempo traces documentation

Error Handling

Error handling best practices

OpenTelemetry: Tracing diaktifkan secara default di development & production. Gunakan EnableTraceSteps = false untuk disable step events.