1. 可观测性概述
1.1 从监控到可观测性
监控(Monitoring) 与 可观测性(Observability) 有着本质区别:
维度 |
传统监控 |
可观测性 |
关注点 |
已知问题、预设指标 |
未知问题、探索性分析 |
数据维度 |
指标为主 |
指标+日志+追踪三位一体 |
方法论 |
reactive(被动响应) |
proactive(主动探索) |
复杂度 |
相对简单 |
处理复杂系统不确定性 |
1.2 可观测性的三大支柱
2. 指标(Metrics)体系构建
2.1 Micrometer 指标收集
环境配置
<dependencies> <dependency> <groupId>io.micrometer</groupId> <artifactId>micrometer-core</artifactId> <version>1.10.0</version> </dependency> <dependency> <groupId>io.micrometer</groupId> <artifactId>micrometer-registry-prometheus</artifactId> <version>1.10.0</version> </dependency> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-actuator</artifactId> </dependency> </dependencies>
应用配置
management: endpoints: web: exposure: include: health,info,metrics,prometheus endpoint: health: show-details: always metrics: enabled: true prometheus: enabled: true metrics: export: prometheus: enabled: true distribution: percentiles-histogram: http.server.requests: true percentiles: http.server.requests: 0.5, 0.95, 0.99 tags: application: ${spring.application.name} environment: ${spring.profiles.active:default}
2.2 自定义业务指标
@Component public class BusinessMetrics { private final MeterRegistry meterRegistry; // 计数器 - 业务操作计数 private final Counter orderCreatedCounter; private final Counter paymentSuccessCounter; private final Counter paymentFailedCounter; // 计时器 - 业务操作耗时 private final Timer orderProcessingTimer; private final Timer paymentProcessingTimer; // 计量器 - 业务数值度量 private final DistributionSummary orderAmountSummary; // 仪表 - 当前值 private final Gauge activeUsersGauge; private final AtomicInteger activeUsersCount = new AtomicInteger(0); public BusinessMetrics(MeterRegistry meterRegistry) { this.meterRegistry = meterRegistry; // 初始化计数器 this.orderCreatedCounter = Counter.builder("business.order.created") .description("订单创建总数") .tag("application", "order-service") .register(meterRegistry); this.paymentSuccessCounter = Counter.builder("business.payment.result") .description("支付结果统计") .tag("result", "success") .register(meterRegistry); this.paymentFailedCounter = Counter.builder("business.payment.result") .description("支付结果统计") .tag("result", "failed") .register(meterRegistry); // 初始化计时器 this.orderProcessingTimer = Timer.builder("business.order.processing.time") .description("订单处理耗时") .publishPercentiles(0.5, 0.95, 0.99) .register(meterRegistry); this.paymentProcessingTimer = Timer.builder("business.payment.processing.time") .description("支付处理耗时") .publishPercentiles(0.5, 0.95, 0.99) .register(meterRegistry); // 初始化计量器 this.orderAmountSummary = DistributionSummary.builder("business.order.amount") .description("订单金额分布") .baseUnit("CNY") .register(meterRegistry); // 初始化仪表 this.activeUsersGauge = Gauge.builder("business.users.active") .description("活跃用户数") .register(meterRegistry, activeUsersCount); } /** * 记录订单创建指标 */ public void recordOrderCreated(Order order) { orderCreatedCounter.increment(); orderAmountSummary.record(order.getAmount().doubleValue()); // 记录业务标签 Counter.builder("business.order.created.by.type") .tag("orderType", order.getType().name()) .tag("channel", order.getChannel()) .register(meterRegistry) .increment(); } /** * 记录订单处理耗时 */ public Timer.Sample startOrderProcessingTimer() { return Timer.start(meterRegistry); } public void stopOrderProcessingTimer(Timer.Sample sample, String status) { sample.stop(orderProcessingTimer.tag("status", status)); } /** * 记录支付结果 */ public void recordPaymentResult(PaymentResult result) { if (result.isSuccess()) { paymentSuccessCounter.increment(); } else { paymentFailedCounter.increment(); // 记录失败原因 Counter.builder("business.payment.failed.reason") .tag("reason", result.getErrorCode()) .register(meterRegistry) .increment(); } } /** * 更新活跃用户数 */ public void updateActiveUsers(int count) { activeUsersCount.set(count); } /** * 记录自定义业务事件 */ public void recordBusinessEvent(String eventType, Map<String, String> tags) { Counter counter = Counter.builder("business.event") .tags(tags) .tag("eventType", eventType) .register(meterRegistry); counter.increment(); } }
2.3 指标切面监控
@Aspect @Component @Slf4j public class MetricsAspect { @Autowired private BusinessMetrics businessMetrics; private final MeterRegistry meterRegistry; public MetricsAspect(MeterRegistry meterRegistry) { this.meterRegistry = meterRegistry; } /** * 监控Service层方法执行 */ @Around("execution(* com.example.service.*.*(..))") public Object monitorServiceMethod(ProceedingJoinPoint joinPoint) throws Throwable { String className = joinPoint.getTarget().getClass().getSimpleName(); String methodName = joinPoint.getSignature().getName(); String metricName = "service.method.execution"; Timer.Sample sample = Timer.start(meterRegistry); String status = "success"; try { Object result = joinPoint.proceed(); return result; } catch (Exception e) { status = "error"; throw e; } finally { sample.stop(Timer.builder(metricName) .tag("class", className) .tag("method", methodName) .tag("status", status) .register(meterRegistry)); } } /** * 监控Controller层HTTP请求 */ @Around("@annotation(org.springframework.web.bind.annotation.GetMapping) || " + "@annotation(org.springframework.web.bind.annotation.PostMapping) || " + "@annotation(org.springframework.web.bind.annotation.PutMapping) || " + "@annotation(org.springframework.web.bind.annotation.DeleteMapping)") public Object monitorHttpRequest(ProceedingJoinPoint joinPoint) throws Throwable { ServletRequestAttributes attributes = (ServletRequestAttributes) RequestContextHolder.getRequestAttributes(); HttpServletRequest request = attributes.getRequest(); String path = request.getRequestURI(); String method = request.getMethod(); String metricName = "http.request.details"; Timer.Sample sample = Timer.start(meterRegistry); String status = "200"; try { Object result = joinPoint.proceed(); return result; } catch (Exception e) { status = getHttpStatusFromException(e); throw e; } finally { sample.stop(Timer.builder(metricName) .tag("path", path) .tag("method", method) .tag("status", status) .register(meterRegistry)); } } private String getHttpStatusFromException(Exception e) { if (e instanceof NotFoundException) { return "404"; } else if (e instanceof BadRequestException) { return "400"; } else if (e instanceof UnauthorizedException) { return "401"; } else { return "500"; } } }
3. 日志(Logs)体系优化
3.1 结构化日志配置
Logback 配置
<?xml version="1.0" encoding="UTF-8"?> <configuration> <include resource="org/springframework/boot/logging/logback/defaults.xml"/> <!-- JSON 布局 --> <appender name="JSON" class="ch.qos.logback.core.ConsoleAppender"> <encoder class="net.logstash.logback.encoder.LoggingEventCompositeJsonEncoder"> <providers> <timestamp> <timeZone>UTC</timeZone> </timestamp> <logLevel/> <loggerName/> <message/> <mdc/> <stackTrace> <throwableConverter class="net.logstash.logback.stacktrace.ShortenedThrowableConverter"> <maxDepthPerThrowable>30</maxDepthPerThrowable> <maxLength>2048</maxLength> <shortenedClassNameLength>20</shortenedClassNameLength> <rootCauseFirst>true</rootCauseFirst> </throwableConverter> </stackTrace> <pattern> <pattern> { "service": "order-service", "version": "1.0.0", "env": "${SPRING_PROFILES_ACTIVE:-default}" } </pattern> </pattern> </providers> </encoder> </appender> <!-- 异步文件输出 --> <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender"> <file>logs/order-service.log</file> <encoder class="net.logstash.logback.encoder.LoggingEventCompositeJsonEncoder"> <providers> <timestamp/> <logLevel/> <loggerName/> <message/> <mdc/> <stackTrace/> </providers> </encoder> <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy"> <fileNamePattern>logs/order-service.%d{yyyy-MM-dd}.%i.log</fileNamePattern> <maxFileSize>100MB</maxFileSize> <maxHistory>30</maxHistory> <totalSizeCap>3GB</totalSizeCap> </rollingPolicy> </appender> <!-- 异步Appender --> <appender name="ASYNC" class="ch.qos.logback.classic.AsyncAppender"> <queueSize>10000</queueSize> <discardingThreshold>0</discardingThreshold> <includeCallerData>true</includeCallerData> <appender-ref ref="JSON"/> </appender> <root level="INFO"> <appender-ref ref="ASYNC"/> <appender-ref ref="FILE"/> </root> <!-- 业务日志单独配置 --> <logger name="com.example.service" level="DEBUG" additivity="false"> <appender-ref ref="ASYNC"/> </logger> <!-- 第三方库日志级别控制 --> <logger name="org.apache.kafka" level="WARN"/> <logger name="org.springframework" level="INFO"/> <logger name="com.zaxxer.hikari" level="INFO"/> </configuration>
3.2 结构化日志实践
@Slf4j @Component public class StructuredLogger { private static final String TRACE_ID = "traceId"; private static final String SPAN_ID = "spanId"; private static final String USER_ID = "userId"; private static final String ORDER_ID = "orderId"; /** * 记录业务操作日志 */ public void logBusinessOperation(String operation, String orderId, String userId, Map<String, Object> additionalFields) { MDC.put(TRACE_ID, getCurrentTraceId()); MDC.put(SPAN_ID, getCurrentSpanId()); MDC.put(ORDER_ID, orderId); MDC.put(USER_ID, userId); try { Map<String, Object> logData = new HashMap<>(); logData.put("operation", operation); logData.put("timestamp", Instant.now().toString()); logData.put("orderId", orderId); logData.put("userId", userId); logData.put("additionalInfo", additionalFields); log.info(JSON.toJSONString(logData)); } finally { MDC.clear(); } } /** * 记录性能日志 */ public void logPerformance(String operation, long duration, String status, Map<String, String> tags) { MDC.put(TRACE_ID, getCurrentTraceId()); try { Map<String, Object> perfData = new HashMap<>(); perfData.put("type", "performance"); perfData.put("operation", operation); perfData.put("durationMs", duration); perfData.put("status", status); perfData.put("tags", tags); perfData.put("timestamp", Instant.now().toString()); if (duration > 1000) { // 超过1秒记录为警告 log.warn(JSON.toJSONString(perfData)); } else { log.info(JSON.toJSONString(perfData)); } } finally { MDC.clear(); } } /** * 记录错误日志 */ public void logError(String operation, Exception error, Map<String, Object> context) { MDC.put(TRACE_ID, getCurrentTraceId()); MDC.put(ORDER_ID, context.get("orderId").toString()); try { Map<String, Object> errorData = new HashMap<>(); errorData.put("type", "error"); errorData.put("operation", operation); errorData.put("errorType", error.getClass().getSimpleName()); errorData.put("errorMessage", error.getMessage()); errorData.put("stackTrace", getStackTrace(error)); errorData.put("context", context); errorData.put("timestamp", Instant.now().toString()); log.error(JSON.toJSONString(errorData)); } finally { MDC.clear(); } } private String getCurrentTraceId() { // 从当前线程上下文获取Trace ID return Optional.ofNullable(MDC.get(TRACE_ID)) .orElse(UUID.randomUUID().toString().substring(0, 8)); } private String getCurrentSpanId() { return Optional.ofNullable(MDC.get(SPAN_ID)) .orElse(UUID.randomUUID().toString().substring(0, 8)); } private String getStackTrace(Exception e) { StringWriter sw = new StringWriter(); e.printStackTrace(new PrintWriter(sw)); return sw.toString(); } }
3.3 日志增强工具】
@Component public class LogEnhancer { /** * 为方法调用添加日志增强 */ @Around("execution(* com.example.service..*(..))") public Object enhanceWithLogging(ProceedingJoinPoint joinPoint) throws Throwable { String className = joinPoint.getTarget().getClass().getSimpleName(); String methodName = joinPoint.getSignature().getName(); Object[] args = joinPoint.getArgs(); // 记录方法开始 log.info("Method started: {}.{} with args: {}", className, methodName, Arrays.toString(args)); long startTime = System.currentTimeMillis(); String status = "success"; try { Object result = joinPoint.proceed(); // 记录方法成功结束 long duration = System.currentTimeMillis() - startTime; log.info("Method completed: {}.{} took {}ms with result: {}", className, methodName, duration, truncateResult(result)); return result; } catch (Exception e) { status = "error"; long duration = System.currentTimeMillis() - startTime; // 记录方法异常 log.error("Method failed: {}.{} took {}ms with error: {}", className, methodName, duration, e.getMessage(), e); throw e; } finally { // 记录指标 recordMethodMetrics(className, methodName, status, System.currentTimeMillis() - startTime); } } private String truncateResult(Object result) { if (result == null) return "null"; String resultStr = result.toString(); return resultStr.length() > 200 ? resultStr.substring(0, 200) + "..." : resultStr; } private void recordMethodMetrics(String className, String methodName, String status, long duration) { // 记录到指标系统 Metrics.counter("method_execution_total", "class", className, "method", methodName, "status", status) .increment(); Metrics.timer("method_execution_duration", "class", className, "method", methodName) .record(Duration.ofMillis(duration)); } }
4. 分布式追踪(Tracing)
4.1 OpenTelemetry 配置
依赖配置
<dependencies> <dependency> <groupId>io.opentelemetry</groupId> <artifactId>opentelemetry-api</artifactId> <version>1.25.0</version> </dependency> <dependency> <groupId>io.opentelemetry</groupId> <artifactId>opentelemetry-sdk</artifactId> <version>1.25.0</version> </dependency> <dependency> <groupId>io.opentelemetry</groupId> <artifactId>opentelemetry-exporter-jaeger</artifactId> <version>1.25.0</version> </dependency> <dependency> <groupId>io.opentelemetry</groupId> <artifactId>opentelemetry-semconv</artifactId> <version>1.25.0</version> </dependency> </dependencies>
追踪配置
@Configuration public class TracingConfig { @Bean public OpenTelemetry openTelemetry() { String applicationName = "order-service"; // 设置资源属性 Resource resource = Resource.getDefault() .merge(Resource.builder() .put(SERVICE_NAME, applicationName) .put(SERVICE_VERSION, "1.0.0") .put(DEPLOYMENT_ENVIRONMENT, System.getenv().getOrDefault("ENVIRONMENT", "dev")) .build()); // 配置Span处理器 SpanProcessor spanProcessor = BatchSpanProcessor.builder( JaegerGrpcSpanExporter.builder() .setEndpoint("http://jaeger:14250") .build()) .build(); // 配置采样率 Sampler sampler = Sampler.traceIdRatioBased(0.1); // 10%采样率 // 构建SdkTracerProvider SdkTracerProvider tracerProvider = SdkTracerProvider.builder() .setResource(resource) .addSpanProcessor(spanProcessor) .setSampler(sampler) .build(); // 构建OpenTelemetry实例 OpenTelemetrySdk openTelemetry = OpenTelemetrySdk.builder() .setTracerProvider(tracerProvider) .setPropagators(ContextPropagators.create(W3CTraceContextPropagator.getInstance())) .build(); return openTelemetry; } @Bean public Tracer tracer(OpenTelemetry openTelemetry) { return openTelemetry.getTracer("order-service"); } }
4.2 分布式追踪实践
@Service @Slf4j public class OrderServiceWithTracing { @Autowired private Tracer tracer; @Autowired private PaymentServiceClient paymentService; @Autowired private InventoryServiceClient inventoryService; /** * 创建订单 - 带分布式追踪 */ public Order createOrderWithTracing(CreateOrderRequest request) { // 创建Span Span span = tracer.spanBuilder("OrderService.createOrder") .setAttribute("order.id", request.getOrderId()) .setAttribute("user.id", request.getUserId()) .setAttribute("order.amount", request.getAmount().doubleValue()) .startSpan(); try (Scope scope = span.makeCurrent()) { log.info("开始创建订单: {}", request.getOrderId()); // 1. 检查库存 Span inventorySpan = tracer.spanBuilder("InventoryService.checkInventory") .setParent(Context.current().with(span)) .startSpan(); try (Scope inventoryScope = inventorySpan.makeCurrent()) { InventoryCheckResult result = inventoryService.checkInventory( request.getProductId(), request.getQuantity()); inventorySpan.setAttribute("inventory.available", result.isAvailable()); } catch (Exception e) { inventorySpan.recordException(e); inventorySpan.setStatus(StatusCode.ERROR); throw e; } finally { inventorySpan.end(); } // 2. 处理支付 Span paymentSpan = tracer.spanBuilder("PaymentService.processPayment") .setParent(Context.current().with(span)) .startSpan(); try (Scope paymentScope = paymentSpan.makeCurrent()) { PaymentResult paymentResult = paymentService.processPayment( buildPaymentRequest(request)); paymentSpan.setAttribute("payment.status", paymentResult.getStatus()); paymentSpan.setAttribute("payment.amount", paymentResult.getAmount().doubleValue()); } catch (Exception e) { paymentSpan.recordException(e); paymentSpan.setStatus(StatusCode.ERROR); throw e; } finally { paymentSpan.end(); } // 3. 创建订单记录 Order order = saveOrder(request); span.setAttribute("order.status", order.getStatus().name()); log.info("订单创建成功: {}", order.getOrderId()); return order; } catch (Exception e) { span.recordException(e); span.setStatus(StatusCode.ERROR, e.getMessage()); throw e; } finally { span.end(); } } /** * 异步操作的追踪 */ @Async public CompletableFuture<Void> processOrderAsync(String orderId) { Span span = tracer.spanBuilder("OrderService.processOrderAsync") .setAttribute("order.id", orderId) .startSpan(); // 将Span上下文传递到异步线程 Context context = Context.current().with(span); return CompletableFuture.supplyAsync(() -> { try (Scope scope = context.makeCurrent()) { // 异步处理逻辑 processOrderBackground(orderId); return null; } catch (Exception e) { span.recordException(e); span.setStatus(StatusCode.ERROR); throw e; } finally { span.end(); } }); } /** * 数据库操作的追踪 */ public Order findOrderWithTracing(String orderId) { Span span = tracer.spanBuilder("Database.findOrder") .setAttribute("db.operation", "SELECT") .setAttribute("db.table", "orders") .setAttribute("db.order.id", orderId) .startSpan(); try (Scope scope = span.makeCurrent()) { // 执行数据库查询 Order order = orderRepository.findById(orderId) .orElseThrow(() -> new OrderNotFoundException("订单不存在")); span.setAttribute("db.result.found", true); return order; } catch (Exception e) { span.recordException(e); span.setStatus(StatusCode.ERROR); throw e; } finally { span.end(); } } }
4.3 HTTP请求追踪
@Component public class TracingFilter implements Filter { private final Tracer tracer; private final TextMapPropagator propagator; public TracingFilter(Tracer tracer) { this.tracer = tracer; this.propagator = W3CTraceContextPropagator.getInstance(); } @Override public void doFilter(ServletRequest request, ServletResponse response, FilterChain chain) throws IOException, ServletException { HttpServletRequest httpRequest = (HttpServletRequest) request; HttpServletResponse httpResponse = (HttpServletResponse) response; // 从HTTP头中提取追踪上下文 Context extractedContext = propagator.extract( Context.current(), httpRequest, this::getHeaders); // 创建Span Span span = tracer.spanBuilder(httpRequest.getMethod() + " " + httpRequest.getRequestURI()) .setParent(extractedContext) .setSpanKind(SpanKind.SERVER) .startSpan(); try (Scope scope = span.makeCurrent()) { // 设置Span属性 span.setAttribute("http.method", httpRequest.getMethod()); span.setAttribute("http.route", httpRequest.getRequestURI()); span.setAttribute("http.url", httpRequest.getRequestURL().toString()); span.setAttribute("http.client_ip", getClientIp(httpRequest)); span.setAttribute("http.user_agent", httpRequest.getHeader("User-Agent")); // 继续处理请求 chain.doFilter(request, response); // 记录响应状态 span.setAttribute("http.status_code", httpResponse.getStatus()); span.setStatus(getSpanStatus(httpResponse.getStatus())); } catch (Exception e) { span.recordException(e); span.setStatus(StatusCode.ERROR, e.getMessage()); throw e; } finally { span.end(); } } private TextMapGetter<HttpServletRequest> getHeaders() { return new TextMapGetter<HttpServletRequest>() { @Override public Iterable<String> keys(HttpServletRequest carrier) { List<String> keys = new ArrayList<>(); Enumeration<String> headerNames = carrier.getHeaderNames(); while (headerNames.hasMoreElements()) { keys.add(headerNames.nextElement()); } return keys; } @Override public String get(HttpServletRequest carrier, String key) { return carrier.getHeader(key); } }; } private String getClientIp(HttpServletRequest request) { String xForwardedFor = request.getHeader("X-Forwarded-For"); if (xForwardedFor != null && !xForwardedFor.isEmpty()) { return xForwardedFor.split(",")[0]; } return request.getRemoteAddr(); } private StatusCode getSpanStatus(int statusCode) { if (statusCode >= 200 && statusCode < 400) { return StatusCode.OK; } else { return StatusCode.ERROR; } } }
5. 可观测性数据关联
5.1 三大支柱关联实践
@Component @Slf4j public class ObservabilityCorrelation { @Autowired private Tracer tracer; @Autowired private BusinessMetrics businessMetrics; private static final String TRACE_ID = "traceId"; private static final String SPAN_ID = "spanId"; /** * 执行可观测的操作 */ public <T> T executeWithObservability(String operationName, Map<String, Object> attributes, Supplier<T> operation) { // 1. 创建追踪Span Span.Builder spanBuilder = tracer.spanBuilder(operationName); attributes.forEach((key, value) -> spanBuilder.setAttribute(key, value.toString())); Span span = spanBuilder.startSpan(); String traceId = span.getSpanContext().getTraceId(); String spanId = span.getSpanContext().getSpanId(); // 2. 设置日志上下文 MDC.put(TRACE_ID, traceId); MDC.put(SPAN_ID, spanId); // 3. 开始指标计时 Timer.Sample metricSample = Timer.start(); try (Scope scope = span.makeCurrent()) { log.info("开始执行操作: {}, traceId: {}", operationName, traceId); // 执行实际操作 T result = operation.get(); // 记录成功指标 businessMetrics.recordBusinessEvent(operationName + ".success", Map.of("traceId", traceId)); log.info("操作执行成功: {}, traceId: {}", operationName, traceId); return result; } catch (Exception e) { // 记录错误 span.recordException(e); span.setStatus(StatusCode.ERROR); // 记录错误指标 businessMetrics.recordBusinessEvent(operationName + ".error", Map.of("traceId", traceId, "error", e.getClass().getSimpleName())); // 记录错误日志 log.error("操作执行失败: {}, traceId: {}, error: {}", operationName, traceId, e.getMessage(), e); throw e; } finally { // 结束指标计时 metricSample.stop(Timer.builder("operation.duration") .tag("operation", operationName) .tag("traceId", traceId) .register(Metrics.globalRegistry)); // 结束Span span.end(); // 清理MDC MDC.clear(); } } /** * 关联分析数据生成 */ public void generateCorrelationData(String businessKey, String operation) { Span currentSpan = Span.current(); String traceId = currentSpan.getSpanContext().getTraceId(); // 生成关联数据 Map<String, Object> correlationData = new HashMap<>(); correlationData.put("timestamp", Instant.now().toString()); correlationData.put("businessKey", businessKey); correlationData.put("operation", operation); correlationData.put("traceId", traceId); correlationData.put("spanId", currentSpan.getSpanContext().getSpanId()); correlationData.put("service", "order-service"); // 记录到日志 log.info(JSON.toJSONString(correlationData)); // 记录指标 Metrics.counter("business.correlation", "operation", operation, "businessKey", businessKey, "traceId", traceId) .increment(); } }
5.2 可观测性数据查询
@RestController @RequestMapping("/observability") @Slf4j public class ObservabilityQueryController { @Autowired private MeterRegistry meterRegistry; @Autowired private TracingQueryService tracingQueryService; @Autowired private LogQueryService logQueryService; /** * 获取服务健康状态 */ @GetMapping("/health") public ServiceHealth getServiceHealth() { return ServiceHealth.builder() .status(HealthStatus.HEALTHY) .timestamp(Instant.now()) .metrics(getCurrentMetrics()) .build(); } /** * 查询追踪数据 */ @GetMapping("/traces") public List<TraceSummary> queryTraces( @RequestParam(required = false) String serviceName, @RequestParam(required = false) String operation, @RequestParam(required = false) Long startTime, @RequestParam(required = false) Long endTime, @RequestParam(defaultValue = "50") int limit) { return tracingQueryService.queryTraces( serviceName, operation, startTime, endTime, limit); } /** * 查询指标数据 */ @GetMapping("/metrics/{metricName}") public MetricData getMetricData( @PathVariable String metricName, @RequestParam(required = false) Map<String, String> tags, @RequestParam(defaultValue = "1h") String duration) { return meterRegistry.find(metricName) .tags(tags) .meter() .map(meter -> buildMetricData(meter, duration)) .orElseThrow(() -> new MetricNotFoundException("指标不存在: " + metricName)); } /** * 关联查询 - 根据Trace ID查询相关数据 */ @GetMapping("/correlation/{traceId}") public CorrelationData getCorrelationData(@PathVariable String traceId) { CorrelationData data = new CorrelationData(); // 查询追踪数据 data.setTrace(tracingQueryService.getTraceDetail(traceId)); // 查询相关日志 data.setLogs(logQueryService.queryLogsByTraceId(traceId)); // 查询相关指标 data.setMetrics(getMetricsByTraceId(traceId)); return data; } /** * 性能分析 */ @GetMapping("/performance/analysis") public PerformanceAnalysis analyzePerformance( @RequestParam String operation, @RequestParam Long startTime, @RequestParam Long endTime) { PerformanceAnalysis analysis = new PerformanceAnalysis(); // 分析指标趋势 analysis.setMetricTrends(analyzeMetricTrends(operation, startTime, endTime)); // 分析慢追踪 analysis.setSlowTraces(analyzeSlowTraces(operation, startTime, endTime)); // 分析错误模式 analysis.setErrorPatterns(analyzeErrorPatterns(operation, startTime, endTime)); return analysis; } private Map<String, Object> getCurrentMetrics() { Map<String, Object> metrics = new HashMap<>(); // 系统指标 metrics.put("system.cpu.usage", getCpuUsage()); metrics.put("system.memory.usage", getMemoryUsage()); metrics.put("system.disk.usage", getDiskUsage()); // 应用指标 metrics.put("app.requests.active", getActiveRequests()); metrics.put("app.threads.active", getActiveThreads()); metrics.put("app.db.connections.active", getActiveDbConnections()); return metrics; } }
6. 监控告警与可视化
6.1 告警规则配置
# alert-rules.yml groups: - name: order-service rules: # 错误率告警 - alert: HighErrorRate expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05 for: 2m labels: severity: warning service: order-service annotations: summary: "高错误率告警" description: "订单服务错误率超过5%,当前值: {{ $value }}" # 响应时间告警 - alert: HighResponseTime expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2 for: 3m labels: severity: warning service: order-service annotations: summary: "高响应时间告警" description: "订单服务95分位响应时间超过2秒,当前值: {{ $value }}s" # 业务指标告警 - alert: PaymentFailureRateHigh expr: rate(business_payment_result{result="failed"}[10m]) / rate(business_payment_result[10m]) > 0.1 for: 5m labels: severity: critical service: order-service annotations: summary: "支付失败率过高" description: "支付失败率超过10%,当前值: {{ $value }}" # 系统资源告警 - alert: HighMemoryUsage expr: system_memory_usage > 0.8 for: 2m labels: severity: warning service: order-service annotations: summary: "内存使用率过高" description: "内存使用率超过80%,当前值: {{ $value }}"
6.2 告警管理器
@Component @Slf4j public class AlertManager { @Autowired private NotificationService notificationService; @Autowired private MetricsService metricsService; /** * 处理监控告警 */ @EventListener public void handleAlert(AlertEvent event) { log.warn("接收到告警事件: {}", event); // 记录告警指标 metricsService.recordAlertEvent(event); // 根据严重程度发送通知 switch (event.getSeverity()) { case CRITICAL: handleCriticalAlert(event); break; case WARNING: handleWarningAlert(event); break; case INFO: handleInfoAlert(event); break; } } /** * 处理严重告警 */ private void handleCriticalAlert(AlertEvent event) { // 发送即时通知 notificationService.sendCriticalAlert( buildAlertMessage(event), getOnCallEngineers() ); // 创建应急工单 incidentService.createIncident(event); // 触发自动恢复流程 if (event.canAutoRecover()) { autoRecoveryService.attemptRecovery(event); } } /** * 处理警告告警 */ private void handleWarningAlert(AlertEvent event) { // 发送常规通知 notificationService.sendWarningAlert(buildAlertMessage(event)); // 记录到告警面板 alertDashboard.recordAlert(event); } /** * 构建告警消息 */ private AlertMessage buildAlertMessage(AlertEvent event) { return AlertMessage.builder() .title(event.getSummary()) .description(event.getDescription()) .severity(event.getSeverity()) .timestamp(event.getTimestamp()) .service(event.getService()) .relatedMetrics(getRelatedMetrics(event)) .suggestedActions(getSuggestedActions(event)) .build(); } /** * 智能告警去重 */ public boolean shouldTriggerAlert(AlertEvent event) { // 检查是否在静默期 if (isInSilencePeriod(event)) { log.info("告警处于静默期,不触发: {}", event.getAlertName()); return false; } // 检查是否已经存在相同告警 if (isDuplicateAlert(event)) { log.info("重复告警,不触发: {}", event.getAlertName()); return false; } // 检查告警频率 if (isAlertFlapping(event)) { log.warn("告警抖动,标记为不稳定: {}", event.getAlertName()); return false; } return true; } }
7. 生产环境最佳实践
7.1 可观测性检查清单
@Component @Slf4j public class ObservabilityChecklist { /** * 应用启动时可观测性检查 */ @EventListener(ApplicationReadyEvent.class) public void performObservabilityCheck() { log.info("开始可观测性检查..."); List<CheckItem> checks = Arrays.asList( checkMetricsEndpoint(), checkTracingConfiguration(), checkLoggingConfiguration(), checkAlertRules(), checkDashboardConfiguration() ); boolean allPassed = checks.stream().allMatch(CheckItem::isPassed); if (allPassed) { log.info("可观测性检查通过"); } else { log.error("可观测性检查发现问题:"); checks.stream() .filter(item -> !item.isPassed()) .forEach(item -> log.error(" - {}: {}", item.getName(), item.getMessage())); } } /** * 定期健康检查 */ @Scheduled(fixedRate = 300000) // 5分钟 public void performHealthCheck() { ObservabilityHealth health = checkObservabilityHealth(); if (health.getStatus() == HealthStatus.DEGRADED) { log.warn("可观测性系统健康度下降: {}", health.getMessage()); alertService.sendObservabilityDegradedAlert(health); } } private CheckItem checkMetricsEndpoint() { try { RestTemplate restTemplate = new RestTemplate(); ResponseEntity<String> response = restTemplate.getForEntity( "http://localhost:8080/actuator/prometheus", String.class); if (response.getStatusCode().is2xxSuccessful()) { return CheckItem.passed("Metrics端点检查"); } else { return CheckItem.failed("Metrics端点检查", "端点返回状态: " + response.getStatusCode()); } } catch (Exception e) { return CheckItem.failed("Metrics端点检查", e.getMessage()); } } private CheckItem checkTracingConfiguration() { try { // 检查追踪配置 Tracer tracer = applicationContext.getBean(Tracer.class); if (tracer != null) { return CheckItem.passed("追踪配置检查"); } else { return CheckItem.failed("追踪配置检查", "Tracer bean未找到"); } } catch (Exception e) { return CheckItem.failed("追踪配置检查", e.getMessage()); } } }
7.2 性能优化建议
# 可观测性性能配置 observability: metrics: # 指标采集间隔 collection-interval: 30s # 指标保留时间 retention: 30d # 采样率 sampling-rate: 1.0 tracing: # 追踪采样率 sampling-rate: 0.1 # 最大属性数量 max-attributes: 128 # 最大事件数量 max-events: 128 logging: # 异步日志队列大小 async-queue-size: 10000 # 日志文件大小限制 max-file-size: 100MB # 日志保留天数 retention-days: 30 alerts: # 告警分组间隔 group-interval: 2m # 告警重复间隔 repeat-interval: 1h # 告警超时时间 timeout: 1m
8. 总结
8.1 可观测性价值体现
通过本文的深入实践,你应该理解可观测性在现代分布式系统中的核心价值:
- 快速问题定位:通过关联分析快速定位问题根因
- 性能优化依据:基于数据驱动的性能优化决策
- 容量规划支撑:为系统扩容和资源分配提供数据支持
- 用户体验保障:实时监控业务指标,保障用户体验
8.2 关键成功要素
✅ 三位一体:指标、日志、追踪的完整覆盖 ✅ 关联分析:跨数据源的关联查询和分析 ✅ 智能告警:基于机器学习的智能告警和预测 ✅ 自动化运维:自动化的监控配置和故障恢复 ✅ 持续优化:基于反馈的持续监控体系优化
8.3 演进路线图
初级阶段:
- 基础指标收集和告警
- 结构化日志输出
- 简单的仪表板
中级阶段:
- 分布式追踪集成
- 智能告警和预测
- 自动化故障检测
高级阶段:
- AIOps智能运维
- 全链路可观测性
- 自动化根因分析
可观测性不是一次性项目,而是需要持续投入和改进的工程实践。通过建立完善的可观测性体系,能够显著提升系统的可靠性和运维效率。