diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 7a5964ba35..34bc6c7d0f 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -11,6 +11,7 @@ on: paths-ignore: - 'docs/**' - 'adr/**' + - 'observability/**' workflow_dispatch: jobs: check_format_and_unit_tests: diff --git a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java index 7beabb7a6e..26971e7fa9 100644 --- a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java +++ b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java @@ -39,6 +39,7 @@ import static io.javaoperatorsdk.operator.api.reconciler.Constants.CONTROLLER_NAME; +@Deprecated(forRemoval = true) public class MicrometerMetrics implements Metrics { private static final String PREFIX = "operator.sdk."; @@ -228,7 +229,8 @@ public void reconcileCustomResource( } @Override - public void finishedReconciliation(HasMetadata resource, Map metadata) { + public void successfullyFinishedReconciliation( + HasMetadata resource, Map metadata) { incrementCounter(ResourceID.fromResource(resource), RECONCILIATIONS_SUCCESS, metadata); } diff --git a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java new file mode 100644 index 0000000000..9b75845776 --- /dev/null +++ b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java @@ -0,0 +1,297 @@ +/* + * Copyright Java Operator SDK Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.javaoperatorsdk.operator.monitoring.micrometer; + +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Consumer; + +import org.jspecify.annotations.NonNull; + +import io.fabric8.kubernetes.api.model.HasMetadata; +import io.javaoperatorsdk.operator.OperatorException; +import io.javaoperatorsdk.operator.api.monitoring.Metrics; +import io.javaoperatorsdk.operator.api.reconciler.Constants; +import io.javaoperatorsdk.operator.api.reconciler.RetryInfo; +import io.javaoperatorsdk.operator.processing.Controller; +import io.javaoperatorsdk.operator.processing.event.Event; +import io.javaoperatorsdk.operator.processing.event.ResourceID; +import io.javaoperatorsdk.operator.processing.event.source.controller.ResourceEvent; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.Tag; +import io.micrometer.core.instrument.Timer; + +public class MicrometerMetricsV2 implements Metrics { + + private static final String PREFIX = "operator.sdk."; + private static final String CONTROLLER_NAME = "controller.name"; + private static final String EVENT = "event"; + private static final String ACTION = "action"; + private static final String EVENTS_RECEIVED = PREFIX + "events.received"; + private static final String EVENTS_DELETE = PREFIX + "events.delete"; + private static final String UNKNOWN_ACTION = "UNKNOWN"; + public static final String TOTAL_SUFFIX = ".total"; + private static final String SUCCESS_SUFFIX = "success"; + 
private static final String FAILURE_SUFFIX = "failure"; + + private static final String RECONCILIATIONS = "reconciliations."; + + private static final String RECONCILIATIONS_FAILED = + PREFIX + RECONCILIATIONS + FAILURE_SUFFIX + TOTAL_SUFFIX; + private static final String RECONCILIATIONS_SUCCESS = + PREFIX + RECONCILIATIONS + SUCCESS_SUFFIX + TOTAL_SUFFIX; + private static final String RECONCILIATIONS_RETRIES_NUMBER = + PREFIX + RECONCILIATIONS + "retries" + TOTAL_SUFFIX; + private static final String RECONCILIATIONS_RETRIES_LAST_ATTEMPT = + PREFIX + RECONCILIATIONS + "retries.lastattempt" + TOTAL_SUFFIX; + private static final String RECONCILIATIONS_STARTED = + PREFIX + RECONCILIATIONS + "started" + TOTAL_SUFFIX; + + private static final String CONTROLLERS = "controllers."; + + private static final String CONTROLLERS_SUCCESSFUL_EXECUTION = + PREFIX + CONTROLLERS + SUCCESS_SUFFIX + TOTAL_SUFFIX; + private static final String CONTROLLERS_FAILED_EXECUTION = + PREFIX + CONTROLLERS + FAILURE_SUFFIX + TOTAL_SUFFIX; + + private static final String RECONCILIATIONS_EXECUTIONS_GAUGE = + PREFIX + RECONCILIATIONS + "executions"; + private static final String RECONCILIATIONS_QUEUE_SIZE_GAUGE = + PREFIX + RECONCILIATIONS + "active"; + + private static final String RECONCILIATION_EXECUTION_DURATION = + PREFIX + RECONCILIATIONS + "execution.seconds"; + + private final MeterRegistry registry; + private final Map gauges = new ConcurrentHashMap<>(); + private final Map executionTimers = new ConcurrentHashMap<>(); + private final Consumer timerConfig; + + /** + * Creates a new builder to configure how the eventual MicrometerMetricsV2 instance will behave, + * pre-configuring it to collect metrics per resource. 
+ * + * @param registry the {@link MeterRegistry} instance to use for metrics recording + * @return a MicrometerMetricsV2 instance configured to not collect per-resource metrics + * @see MicrometerMetricsV2Builder + */ + public static MicrometerMetricsV2Builder newPerResourceCollectingMicrometerMetricsBuilder( + MeterRegistry registry) { + return new MicrometerMetricsV2Builder(registry); + } + + /** + * Creates a micrometer-based Metrics implementation. + * + * @param registry the {@link MeterRegistry} instance to use for metrics recording + * @param timerConfig optional configuration for timers, defaults to publishing percentiles 0.5, + * 0.95, 0.99 and histogram + */ + private MicrometerMetricsV2(MeterRegistry registry, Consumer timerConfig) { + this.registry = registry; + this.timerConfig = + timerConfig != null + ? timerConfig + : builder -> builder.publishPercentiles(0.5, 0.95, 0.99).publishPercentileHistogram(); + } + + @Override + public void controllerRegistered(Controller controller) { + final var configuration = controller.getConfiguration(); + final var name = configuration.getName(); + final var executingThreadsRefName = reconciliationExecutionGaugeRefKey(name); + final var tags = new ArrayList(); + addControllerName(name, tags); + AtomicInteger executingThreads = + registry.gauge(RECONCILIATIONS_EXECUTIONS_GAUGE, tags, new AtomicInteger(0)); + gauges.put(executingThreadsRefName, executingThreads); + + final var controllerQueueRefName = controllerQueueSizeGaugeRefKey(name); + AtomicInteger controllerQueueSize = + registry.gauge(RECONCILIATIONS_QUEUE_SIZE_GAUGE, tags, new AtomicInteger(0)); + gauges.put(controllerQueueRefName, controllerQueueSize); + + final var timerBuilder = Timer.builder(RECONCILIATION_EXECUTION_DURATION).tags(tags); + timerConfig.accept(timerBuilder); + var timer = timerBuilder.register(registry); + executionTimers.put(name, timer); + } + + // todo does it make sense to have both controller and reconciler execution counters? 
+ @Override + public T timeControllerExecution(ControllerExecution execution) { + final var name = execution.controllerName(); + final var tags = new ArrayList(1); + addControllerName(name, tags); + + final var timer = executionTimers.get(name); + try { + final var result = + timer.record( + () -> { + try { + return execution.execute(); + } catch (Exception e) { + throw new OperatorException(e); + } + }); + registry.counter(CONTROLLERS_SUCCESSFUL_EXECUTION, CONTROLLER_NAME, name).increment(); + return result; + } catch (Exception e) { + registry.counter(CONTROLLERS_FAILED_EXECUTION, CONTROLLER_NAME, name).increment(); + throw e; + } + } + + @Override + public void receivedEvent(Event event, Map metadata) { + if (event instanceof ResourceEvent resourceEvent) { + incrementCounter( + EVENTS_RECEIVED, + metadata, + Tag.of(EVENT, event.getClass().getSimpleName()), + Tag.of(ACTION, resourceEvent.getAction().toString())); + } else { + incrementCounter( + EVENTS_RECEIVED, + metadata, + Tag.of(EVENT, event.getClass().getSimpleName()), + Tag.of(ACTION, UNKNOWN_ACTION)); + } + } + + @Override + public void cleanupDoneFor(ResourceID resourceID, Map metadata) { + incrementCounter(EVENTS_DELETE, metadata); + } + + @Override + public void submittedForReconciliation( + HasMetadata resource, RetryInfo retryInfoNullable, Map metadata) { + Optional retryInfo = Optional.ofNullable(retryInfoNullable); + + // Record the counter without retry tags + incrementCounter(RECONCILIATIONS_STARTED, metadata); + + int retryNumber = retryInfo.map(RetryInfo::getAttemptCount).orElse(0); + if (retryNumber > 0) { + incrementCounter(RECONCILIATIONS_RETRIES_NUMBER, metadata); + } + retryInfo.ifPresent( + i -> { + if (retryInfoNullable.isLastAttempt()) { + incrementCounter(RECONCILIATIONS_RETRIES_LAST_ATTEMPT, metadata); + } + }); + + var controllerQueueSize = + gauges.get(controllerQueueSizeGaugeRefKey(getControllerName(metadata))); + controllerQueueSize.incrementAndGet(); + } + + @Override + public 
void successfullyFinishedReconciliation( + HasMetadata resource, Map metadata) { + incrementCounter(RECONCILIATIONS_SUCCESS, metadata); + } + + @Override + public void reconciliationExecutionStarted(HasMetadata resource, Map metadata) { + var reconcilerExecutions = + gauges.get(reconciliationExecutionGaugeRefKey(getControllerName(metadata))); + reconcilerExecutions.incrementAndGet(); + } + + @Override + public void reconciliationExecutionFinished(HasMetadata resource, Map metadata) { + var reconcilerExecutions = + gauges.get(reconciliationExecutionGaugeRefKey(metadata.get(CONTROLLER_NAME).toString())); + reconcilerExecutions.decrementAndGet(); + + var controllerQueueSize = + gauges.get(controllerQueueSizeGaugeRefKey(metadata.get(CONTROLLER_NAME).toString())); + controllerQueueSize.decrementAndGet(); + } + + @Override + public void failedReconciliation( + HasMetadata resource, Exception exception, Map metadata) { + incrementCounter(RECONCILIATIONS_FAILED, metadata); + } + + private static void addTag(String name, String value, List tags) { + tags.add(Tag.of(name, value)); + } + + private static void addControllerName(Map metadata, List tags) { + addTag(CONTROLLER_NAME, getControllerName(metadata), tags); + } + + private static void addControllerName(String name, List tags) { + addTag(CONTROLLER_NAME, name, tags); + } + + private void incrementCounter( + String counterName, Map metadata, Tag... additionalTags) { + + final var tags = new ArrayList(1 + additionalTags.length); + addControllerName(metadata, tags); + if (additionalTags.length > 0) { + tags.addAll(List.of(additionalTags)); + } + registry.counter(counterName, tags).increment(); + } + + private static @NonNull String reconciliationExecutionGaugeRefKey(String controllerName) { + return RECONCILIATIONS_EXECUTIONS_GAUGE + "." + controllerName; + } + + private static @NonNull String controllerQueueSizeGaugeRefKey(String controllerName) { + return RECONCILIATIONS_QUEUE_SIZE_GAUGE + "." 
+ controllerName; + } + + public static String getControllerName(Map metadata) { + return (String) metadata.get(Constants.CONTROLLER_NAME); + } + + public static class MicrometerMetricsV2Builder { + protected final MeterRegistry registry; + protected Consumer executionTimerConfig = null; + + public MicrometerMetricsV2Builder(MeterRegistry registry) { + this.registry = registry; + } + + /** + * Configures the Timer used for timing controller executions. By default, timers are configured + * to publish percentiles 0.5, 0.95, 0.99 and a percentile histogram. + * + * @param executionTimerConfig a consumer that will configure the Timer.Builder. The builder + * will already have the metric name and tags set. + * @return this builder for method chaining + */ + public MicrometerMetricsV2Builder withExecutionTimerConfig( + Consumer executionTimerConfig) { + this.executionTimerConfig = executionTimerConfig; + return this; + } + + public MicrometerMetricsV2 build() { + return new MicrometerMetricsV2(registry, executionTimerConfig); + } + } +} diff --git a/observability/README.md b/observability/README.md new file mode 100644 index 0000000000..58caae27d0 --- /dev/null +++ b/observability/README.md @@ -0,0 +1,252 @@ +# Observability Stack for Java Operator SDK + +This directory contains the setup scripts and Grafana dashboards for monitoring Java Operator SDK applications. 
+ +## Installation + +Run the installation script to deploy the full observability stack (OpenTelemetry Collector, Prometheus, and Grafana): + +```bash +./install-observability.sh +``` + +This will install: +- **cert-manager** - Required for OpenTelemetry Operator +- **OpenTelemetry Operator** - Manages OpenTelemetry Collector instances +- **OpenTelemetry Collector** - Receives OTLP metrics and exports to Prometheus +- **Prometheus** - Metrics storage and querying +- **Grafana** - Metrics visualization + +## Accessing Services + +### Grafana +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-grafana 3000:80 +``` +Then open http://localhost:3000 +- Username: `admin` +- Password: `admin` + +### Prometheus +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 +``` +Then open http://localhost:9090 + +## Grafana Dashboards + +Two pre-configured dashboards are **automatically imported** during installation: + +### 1. JVM Metrics Dashboard (`jvm-metrics-dashboard.json`) + +Monitors Java Virtual Machine health and performance: + +**Panels:** +- **JVM Memory Used** - Heap and non-heap memory consumption by memory pool +- **JVM Threads** - Live, daemon, and peak thread counts +- **GC Pause Time Rate** - Garbage collection pause duration +- **GC Pause Count Rate** - Frequency of garbage collection events +- **CPU Usage** - System CPU utilization percentage +- **Classes Loaded** - Number of classes currently loaded +- **Process Uptime** - Application uptime in seconds +- **CPU Count** - Available processor cores +- **GC Memory Allocation Rate** - Memory allocation and promotion rates +- **Heap Memory Max vs Committed** - Heap memory limits and commitments + +**Key Metrics:** +- `jvm.memory.used`, `jvm.memory.max`, `jvm.memory.committed` +- `jvm.gc.pause`, `jvm.gc.memory.allocated`, `jvm.gc.memory.promoted` +- `jvm.threads.live`, `jvm.threads.daemon`, `jvm.threads.peak` +- `jvm.classes.loaded`, 
`jvm.classes.unloaded` +- `system.cpu.usage`, `system.cpu.count` +- `process.uptime` + +**Filtering:** +All panels filter by `service_name="josdk"` to show metrics only from your operator. + +### 2. Java Operator SDK Metrics Dashboard (`josdk-operator-metrics-dashboard.json`) + +Monitors Kubernetes operator performance and health: + +**Panels:** +- **Reconciliation Rate (Started)** - Rate of reconciliation loops triggered +- **Reconciliation Success vs Failure Rate** - Success/failure ratio over time +- **Currently Executing Reconciliations** - Active reconciliation threads +- **Reconciliation Queue Size** - Pending reconciliation work +- **Total Reconciliations** - Cumulative count of reconciliations +- **Error Rate** - Overall error rate across all reconciliations +- **Reconciliation Execution Time** - P50, P95, P99 latency percentiles +- **Event Reception Rate** - Kubernetes event processing rate +- **Failures by Exception Type** - Breakdown of errors by exception class +- **Controller Execution Success vs Failure** - Controller-level success metrics +- **Delete Event Rate** - Resource deletion event frequency +- **Reconciliation Retry Rate** - Retry attempts and patterns + +**Key Metrics:** +- `operator.sdk.reconciliations.started.total`, `.success.total`, `.failure.total` - Reconciliation counters +- `operator.sdk.reconciliations.executions` - Current execution count +- `operator.sdk.reconciliations.active` - Queue depth (reconciliations submitted but not yet finished) +- `operator.sdk.reconciliations.execution.seconds` - Execution timing histograms +- `operator.sdk.events.received`, `operator.sdk.events.delete` - Event reception +- Retry metrics and failure breakdowns + +**Filtering:** +All panels filter by `service_name="josdk"` to show metrics only from your operator. + +## Importing Dashboards into Grafana + +### Automatic Import (Default) + +The dashboards are **automatically imported** when you run `./install-observability.sh`. They will appear in Grafana within 30-60 seconds after installation. No manual steps required! 
+ +To verify the dashboards were imported: +1. Access Grafana at http://localhost:3000 +2. Navigate to **Dashboards** → **Browse** +3. Look for "JOSDK - JVM Metrics" and "JOSDK - Operator Metrics" + +### Manual Import Methods + +If you need to re-import or update the dashboards manually: + +#### Method 1: Via Grafana UI + +1. Access Grafana at http://localhost:3000 +2. Log in with admin/admin +3. Navigate to **Dashboards** → **Import** +4. Click **Upload JSON file** +5. Select `jvm-metrics-dashboard.json` or `josdk-operator-metrics-dashboard.json` +6. Select **Prometheus** as the data source +7. Click **Import** + +#### Method 2: Via kubectl ConfigMap + +```bash +# Re-import JVM dashboard +kubectl create configmap jvm-metrics-dashboard \ + --from-file=jvm-metrics-dashboard.json \ + -n observability \ + -o yaml --dry-run=client | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - + +# Re-import Operator dashboard +kubectl create configmap josdk-operator-metrics-dashboard \ + --from-file=josdk-operator-metrics-dashboard.json \ + -n observability \ + -o yaml --dry-run=client | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - +``` + +The dashboards will be automatically discovered and loaded by Grafana within 30-60 seconds. + +## Configuring Your Operator + +To enable metrics export from your JOSDK operator, ensure your application: + +1. **Has the required dependency** (already included in the webpage sample): + ```xml + <dependency> + <groupId>io.micrometer</groupId> + <artifactId>micrometer-registry-otlp</artifactId> + </dependency> + ``` + +2. **Configures OTLP export** via `otlp-config.yaml`: + ```yaml + otlp: + url: "http://otel-collector-collector.observability.svc.cluster.local:4318/v1/metrics" + step: 15s + batchSize: 15000 + aggregationTemporality: "cumulative" + ``` + +3. 
**Registers JVM and JOSDK metrics** (see `WebPageOperator.java` for reference implementation) + +## OTLP Endpoints + +The OpenTelemetry Collector provides the following endpoints: + +- **OTLP gRPC**: `otel-collector-collector.observability.svc.cluster.local:4317` +- **OTLP HTTP**: `otel-collector-collector.observability.svc.cluster.local:4318` +- **Prometheus Scrape**: `http://otel-collector-prometheus.observability.svc.cluster.local:8889/metrics` + +## Troubleshooting + +### Check OpenTelemetry Collector Logs +```bash +kubectl logs -n observability -l app.kubernetes.io/name=otel-collector -f +``` + +### Check Prometheus Targets +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 +``` +Open http://localhost:9090/targets and verify the OTLP collector target is UP. + +### Verify Metrics in Prometheus +Open Prometheus UI and search for metrics: +- JVM metrics: `jvm_*` +- Operator metrics: `operator_sdk_*` + +### Check Grafana Data Source +1. Navigate to **Configuration** → **Data Sources** +2. Verify Prometheus data source is configured and working +3. Click **Test** to verify connectivity + +## Uninstalling + +To remove the observability stack: + +```bash +kubectl delete configmap -n observability jvm-metrics-dashboard josdk-operator-metrics-dashboard +kubectl delete -n observability OpenTelemetryCollector otel-collector +helm uninstall -n observability kube-prometheus-stack +helm uninstall -n observability opentelemetry-operator +helm uninstall -n cert-manager cert-manager +kubectl delete namespace observability cert-manager +``` + +## Customizing Dashboards + +The dashboard JSON files can be modified to: +- Add new panels for custom metrics +- Adjust time ranges and refresh intervals +- Change visualization types +- Add templating variables for filtering +- Modify alert thresholds + +After making changes, re-import the dashboard using one of the methods above. 
+ +## Example Queries + +### JVM Metrics +```promql +# Heap memory usage percentage +(jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"}) * 100 + +# GC throughput (percentage of time NOT in GC) +100 - (rate(jvm_gc_pause_seconds_sum[5m]) * 100) + +# Thread count trend +jvm_threads_live_threads +``` + +### Operator Metrics +```promql +# Reconciliation success rate +rate(operator_sdk_reconciliations_success_total[5m]) / rate(operator_sdk_reconciliations_started_total[5m]) + +# Average reconciliation time +rate(operator_sdk_controllers_execution_reconcile_seconds_sum[5m]) / rate(operator_sdk_controllers_execution_reconcile_seconds_count[5m]) + +# Queue saturation +operator_sdk_reconciliations_queue_size / on() group_left() max(operator_sdk_reconciliations_queue_size) +``` + +## References + +- [Java Operator SDK Documentation](https://javaoperatorsdk.io) +- [Micrometer OTLP Documentation](https://micrometer.io/docs/registry/otlp) +- [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) +- [Grafana Dashboards](https://grafana.com/docs/grafana/latest/dashboards/) diff --git a/observability/install-observability.sh b/observability/install-observability.sh new file mode 100755 index 0000000000..dc7430520b --- /dev/null +++ b/observability/install-observability.sh @@ -0,0 +1,308 @@ +#!/bin/bash +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${GREEN}========================================${NC}" +echo -e "${GREEN}Installing Observability Stack${NC}" +echo -e "${GREEN}OpenTelemetry + Prometheus + Grafana${NC}" +echo -e "${GREEN}========================================${NC}" + +# Check if helm is installed +echo -e "\n${YELLOW}Checking helm installation...${NC}" +if ! command -v helm &> /dev/null; then + echo -e "${RED}Error: helm is not installed${NC}" + echo "Please install helm: https://helm.sh/docs/intro/install/" + exit 1 +fi +echo -e "${GREEN}✓ helm is installed${NC}" + +# Add Helm repositories +echo -e "\n${YELLOW}Adding Helm repositories...${NC}" +helm repo add jetstack https://charts.jetstack.io +helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +echo -e "${GREEN}✓ Helm repositories added${NC}" + +echo -e "\n${GREEN}========================================${NC}" +echo -e "${GREEN}Installing Components (Parallel)${NC}" +echo -e "${GREEN}========================================${NC}" +echo -e "The following will be installed:" +echo -e " • cert-manager" +echo -e " • OpenTelemetry Operator" +echo -e " • Prometheus & Grafana" +echo -e " • OpenTelemetry Collector" +echo -e " • Service Monitors" +echo -e "\n${YELLOW}All resources will be applied first, then we'll wait for them to become ready.${NC}\n" + +# Install cert-manager (required for OpenTelemetry Operator) +echo -e "\n${YELLOW}Installing cert-manager...${NC}" +if kubectl get namespace cert-manager > /dev/null 2>&1; then + echo -e "${YELLOW}cert-manager namespace already exists, skipping...${NC}" +else + kubectl create namespace cert-manager + helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --set crds.enabled=true + echo -e "${GREEN}✓ 
cert-manager installation started${NC}" +fi + +# Create observability namespace +echo -e "\n${YELLOW}Creating observability namespace...${NC}" +kubectl create namespace observability --dry-run=client -o yaml | kubectl apply -f - +echo -e "${GREEN}✓ observability namespace ready${NC}" + +# Install OpenTelemetry Operator +echo -e "\n${YELLOW}Installing OpenTelemetry Operator...${NC}" + +if helm list -n observability | grep -q opentelemetry-operator; then + echo -e "${YELLOW}OpenTelemetry Operator already installed, upgrading...${NC}" + helm upgrade opentelemetry-operator open-telemetry/opentelemetry-operator \ + --namespace observability \ + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" +else + helm install opentelemetry-operator open-telemetry/opentelemetry-operator \ + --namespace observability \ + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" +fi +echo -e "${GREEN}✓ OpenTelemetry Operator installation started${NC}" + +# Install kube-prometheus-stack (includes Prometheus + Grafana) +echo -e "\n${YELLOW}Installing Prometheus and Grafana stack...${NC}" +if helm list -n observability | grep -q kube-prometheus-stack; then + echo -e "${YELLOW}kube-prometheus-stack already installed, upgrading...${NC}" + helm upgrade kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --namespace observability \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ + --set grafana.adminPassword=admin +else + helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --namespace observability \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ + --set grafana.adminPassword=admin +fi +echo -e "${GREEN}✓ Prometheus and Grafana installation started${NC}" + +# Create 
OpenTelemetry Collector instance +echo -e "\n${YELLOW}Creating OpenTelemetry Collector...${NC}" +cat </dev/null || echo -e "${YELLOW}cert-manager already running or skipped${NC}" + +# Wait for observability pods +echo -e "${YELLOW}Checking observability pods...${NC}" +kubectl wait --for=condition=ready pod --all -n observability --timeout=300s + +echo -e "${GREEN}✓ All pods are ready${NC}" + +# Import Grafana dashboards +echo -e "\n${YELLOW}Importing Grafana dashboards...${NC}" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +if [ -f "$SCRIPT_DIR/jvm-metrics-dashboard.json" ]; then + kubectl create configmap jvm-metrics-dashboard \ + --from-file="$SCRIPT_DIR/jvm-metrics-dashboard.json" \ + -n observability \ + --dry-run=client -o yaml | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - + echo -e "${GREEN}✓ JVM Metrics dashboard imported${NC}" +else + echo -e "${YELLOW}⚠ JVM Metrics dashboard not found at $SCRIPT_DIR/jvm-metrics-dashboard.json${NC}" +fi + +if [ -f "$SCRIPT_DIR/josdk-operator-metrics-dashboard.json" ]; then + kubectl create configmap josdk-operator-metrics-dashboard \ + --from-file="$SCRIPT_DIR/josdk-operator-metrics-dashboard.json" \ + -n observability \ + --dry-run=client -o yaml | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - + echo -e "${GREEN}✓ JOSDK Operator Metrics dashboard imported${NC}" +else + echo -e "${YELLOW}⚠ JOSDK Operator Metrics dashboard not found at $SCRIPT_DIR/josdk-operator-metrics-dashboard.json${NC}" +fi + +echo -e "${GREEN}✓ Dashboards will be available in Grafana shortly${NC}" + +# Get pod statuses +echo -e "\n${GREEN}========================================${NC}" +echo -e "${GREEN}Installation Complete!${NC}" +echo -e "${GREEN}========================================${NC}" + +echo -e "\n${YELLOW}Pod Status:${NC}" +kubectl get pods -n observability + +echo -e 
"\n${GREEN}========================================${NC}" +echo -e "${GREEN}Access Information${NC}" +echo -e "${GREEN}========================================${NC}" + +echo -e "\n${YELLOW}Grafana:${NC}" +echo -e " Username: ${GREEN}admin${NC}" +echo -e " Password: ${GREEN}admin${NC}" +echo -e " Access with: ${GREEN}kubectl port-forward -n observability svc/kube-prometheus-stack-grafana 3000:80${NC}" +echo -e " Then open: ${GREEN}http://localhost:3000${NC}" + +echo -e "\n${YELLOW}Prometheus:${NC}" +echo -e " Access with: ${GREEN}kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090${NC}" +echo -e " Then open: ${GREEN}http://localhost:9090${NC}" + +echo -e "\n${YELLOW}OpenTelemetry Collector:${NC}" +echo -e " OTLP gRPC endpoint: ${GREEN}otel-collector-collector.observability.svc.cluster.local:4317${NC}" +echo -e " OTLP HTTP endpoint: ${GREEN}otel-collector-collector.observability.svc.cluster.local:4318${NC}" +echo -e " Prometheus metrics: ${GREEN}http://otel-collector-prometheus.observability.svc.cluster.local:8889/metrics${NC}" + +echo -e "\n${YELLOW}Configure your Java Operator to use OpenTelemetry:${NC}" +echo -e " Add dependency: ${GREEN}io.javaoperatorsdk:operator-framework-opentelemetry-support${NC}" +echo -e " Set environment variables:" +echo -e " ${GREEN}OTEL_SERVICE_NAME=your-operator-name${NC}" +echo -e " ${GREEN}OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector-collector.observability.svc.cluster.local:4318${NC}" +echo -e " ${GREEN}OTEL_METRICS_EXPORTER=otlp${NC}" +echo -e " ${GREEN}OTEL_TRACES_EXPORTER=otlp${NC}" + +echo -e "\n${GREEN}========================================${NC}" +echo -e "${GREEN}Grafana Dashboards${NC}" +echo -e "${GREEN}========================================${NC}" +echo -e "\nAutomatically imported dashboards:" +echo -e " - ${GREEN}JOSDK - JVM Metrics${NC} - Java Virtual Machine health and performance" +echo -e " - ${GREEN}JOSDK - Operator Metrics${NC} - Kubernetes operator performance and 
reconciliation" +echo -e "\nPre-installed Kubernetes dashboards:" +echo -e " - Kubernetes / Compute Resources / Cluster" +echo -e " - Kubernetes / Compute Resources / Namespace (Pods)" +echo -e " - Node Exporter / Nodes" +echo -e "\n${YELLOW}Note:${NC} Dashboards may take 30-60 seconds to appear in Grafana after installation." + +echo -e "\n${YELLOW}To uninstall:${NC}" +echo -e " kubectl delete configmap -n observability jvm-metrics-dashboard josdk-operator-metrics-dashboard" +echo -e " kubectl delete -n observability OpenTelemetryCollector otel-collector" +echo -e " helm uninstall -n observability kube-prometheus-stack" +echo -e " helm uninstall -n observability opentelemetry-operator" +echo -e " helm uninstall -n cert-manager cert-manager" +echo -e " kubectl delete namespace observability cert-manager" + +echo -e "\n${GREEN}Done!${NC}" diff --git a/observability/josdk-operator-metrics-dashboard.json b/observability/josdk-operator-metrics-dashboard.json new file mode 100644 index 0000000000..41916bbb97 --- /dev/null +++ b/observability/josdk-operator-metrics-dashboard.json @@ -0,0 +1,1106 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Rate of reconciliations started per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": 
false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_reconciliations_started_total{service_name=\"josdk\"}[5m])) by (controller_name)", + "legendFormat": "{{controller_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Reconciliation Rate (Started)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Success vs Failure rate of reconciliations", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": 
"ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failure" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_reconciliations_success_total{service_name=\"josdk\"}[5m])) by (controller_name)", + "legendFormat": "Success - {{controller_name}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_reconciliations_failure_total{service_name=\"josdk\"}[5m])) by (controller_name)", + "legendFormat": "Failure - {{controller_name}}", + "range": true, + "refId": "B" + } + ], + "title": "Reconciliation Success vs Failure Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Current number of reconciliations being executed", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": 
["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(operator_sdk_reconciliations_executions{service_name=\"josdk\"})", + "legendFormat": "Executing", + "range": true, + "refId": "A" + } + ], + "title": "Currently Executing Reconciliations", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Current reconciliation queue size", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 8 + }, + "id": 4, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(operator_sdk_reconciliations_active{service_name=\"josdk\"})", + "legendFormat": "Active", + "range": true, + "refId": "A" + } + ], + "title": "Active Reconciliations", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Total reconciliations started", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 8 + }, + "id": 5, + "options": { 
+ "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(operator_sdk_reconciliations_started_total{service_name=\"josdk\"})", + "legendFormat": "Total", + "range": true, + "refId": "A" + } + ], + "title": "Total Reconciliations", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Overall rate of failed reconciliations per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 8 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_reconciliations_failure_total{service_name=\"josdk\"}[5m]))", + "legendFormat": "Error Rate", + "range": true, + "refId": "A" + } + ], + "title": "Error Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Controller execution time percentiles", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", +
"fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(operator_sdk_reconciliations_execution_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller_name))", + "legendFormat": "p50 - {{controller_name}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(operator_sdk_reconciliations_execution_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller_name))", + "legendFormat": "p95 - {{controller_name}}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum(rate(operator_sdk_reconciliations_execution_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller_name))", + "legendFormat": "p99 - {{controller_name}}", + "range": true, + "refId": "C" + } + ], + "title": "Reconciliation Execution Time (Percentiles)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "description": "Rate of events received by the operator", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 8, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_events_received_total{service_name=\"josdk\"}[5m])) by (event, action)", + "legendFormat": "{{event}} - {{action}}", + "range": true, + "refId": "A" + } + ], + "title": "Event Reception Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Failures by controller", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + 
"lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 9, + "options": { + "legend": { + "calcs": ["last", "sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_reconciliations_failure_total{service_name=\"josdk\"}[5m])) by (controller_name)", + "legendFormat": "{{controller_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Failures by Controller", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Controller execution success vs failure", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": 
[] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 10, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_controllers_success_total{service_name=\"josdk\"}[5m])) by (controller_name)", + "legendFormat": "Success - {{controller_name}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_controllers_failure_total{service_name=\"josdk\"}[5m])) by (controller_name)", + "legendFormat": "Failure - {{controller_name}}", + "range": true, + "refId": "B" + } + ], + "title": "Controller Execution Success vs Failure", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Rate of delete events received", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 11, + "options": { + "legend": { + 
"calcs": ["last", "sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_events_delete_total{service_name=\"josdk\"}[5m])) by (controller_name)", + "legendFormat": "{{controller_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Delete Event Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Rate of retry attempts", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 12, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_reconciliations_retries_total{service_name=\"josdk\"}[5m])) by 
(controller_name)", + "legendFormat": "Retries - {{controller_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Reconciliation Retry Rate", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 38, + "style": "dark", + "tags": ["operator", "kubernetes", "josdk"], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "JOSDK - Operator Metrics", + "uid": "josdk-operator-metrics", + "version": 0, + "weekStart": "" +} diff --git a/observability/jvm-metrics-dashboard.json b/observability/jvm-metrics-dashboard.json new file mode 100644 index 0000000000..528f29674e --- /dev/null +++ b/observability/jvm-metrics-dashboard.json @@ -0,0 +1,857 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + 
"value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "jvm_memory_used_bytes{service_name=\"josdk\"}", + "legendFormat": "{{area}} - {{id}}", + "range": true, + "refId": "A" + } + ], + "title": "JVM Memory Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "jvm_threads_live{service_name=\"josdk\"}", + "legendFormat": "Live Threads", + "range": 
true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "jvm_threads_daemon_threads{service_name=\"josdk\"}", + "legendFormat": "Daemon Threads", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "jvm_threads_peak_threads{service_name=\"josdk\"}", + "legendFormat": "Peak Threads", + "range": true, + "refId": "C" + } + ], + "title": "JVM Threads", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(jvm_gc_pause_milliseconds_sum{service_name=\"josdk\"}[5m])", + "legendFormat": "{{action}} - {{cause}}", + "range": true, + "refId": "A" + } + ], + "title": "GC Pause Time Rate", + "type": 
"timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": ["last"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(jvm_gc_pause_milliseconds_count{service_name=\"josdk\"}[5m])", + "legendFormat": "{{action}} - {{cause}}", + "range": true, + "refId": "A" + } + ], + "title": "GC Pause Count Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + 
"fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "system_cpu_usage{service_name=\"josdk\"}", + "legendFormat": "CPU Usage", + "range": true, + "refId": "A" + } + ], + "title": "CPU Usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 16 + }, + "id": 6, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "jvm_classes_loaded{service_name=\"josdk\"}", + "legendFormat": "Classes Loaded", + "range": true, + "refId": "A" + } + ], + "title": "Classes Loaded", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 16 + }, + "id": 7, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "editorMode": "code", + "expr": "process_uptime_milliseconds{service_name=\"josdk\"}", + "legendFormat": "Uptime", + "range": true, + "refId": "A" + } + ], + "title": "Process Uptime", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 16 + }, + "id": 8, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "system_cpu_count{service_name=\"josdk\"}", + "legendFormat": "CPU Count", + "range": true, + "refId": "A" + } + ], + "title": "CPU Count", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + 
}, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 9, + "options": { + "legend": { + "calcs": ["last"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(jvm_gc_memory_allocated_bytes_total{service_name=\"josdk\"}[5m])", + "legendFormat": "Allocated", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(jvm_gc_memory_promoted_bytes_total{service_name=\"josdk\"}[5m])", + "legendFormat": "Promoted", + "range": true, + "refId": "B" + } + ], + "title": "GC Memory Allocation Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 10, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": 
[ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "jvm_memory_max_bytes{service_name=\"josdk\", area=\"heap\"}", + "legendFormat": "Max Heap", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "jvm_memory_committed_bytes{service_name=\"josdk\", area=\"heap\"}", + "legendFormat": "Committed Heap", + "range": true, + "refId": "B" + } + ], + "title": "Heap Memory Max vs Committed", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 38, + "style": "dark", + "tags": ["jvm", "java", "josdk"], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "JOSDK - JVM Metrics", + "uid": "josdk-jvm-metrics", + "version": 0, + "weekStart": "" +} diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java index f66bdc47c6..1764390d6f 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java @@ -103,8 +103,9 @@ public void cleanupDoneFor(ResourceID resourceID, Map metadata) } @Override - public void finishedReconciliation(HasMetadata resource, Map metadata) { - metricsList.forEach(metrics -> metrics.finishedReconciliation(resource, metadata)); + public void successfullyFinishedReconciliation( + HasMetadata resource, Map metadata) { + metricsList.forEach(metrics -> metrics.successfullyFinishedReconciliation(resource, metadata)); } @Override @@ -113,6 +114,7 @@ public T timeControllerExecution(ControllerExecution execution) throws Ex } @Override + @Deprecated(forRemoval = true) public > T monitorSizeOf(T map, 
String name) { metricsList.forEach(metrics -> metrics.monitorSizeOf(map, name)); return map; diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java index 10b2db6774..12578ead24 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java @@ -50,15 +50,30 @@ default void controllerRegistered(Controller controller) default void receivedEvent(Event event, Map metadata) {} /** - * Called right before a resource is dispatched to the ExecutorService for reconciliation. - * + * @deprecated use {@link Metrics#submittedForReconciliation(HasMetadata, RetryInfo, Map)} Called + * right before a resource is dispatched to the ExecutorService for reconciliation. * @param resource the associated with the resource * @param retryInfo the current retry state information for the reconciliation request * @param metadata metadata associated with the resource being processed */ + @Deprecated(forRemoval = true) default void reconcileCustomResource( + HasMetadata resource, RetryInfo retryInfo, Map metadata) { + submittedForReconciliation(resource, retryInfo, metadata); + } + + /** + * Called right before a resource is submitted to the ExecutorService for reconciliation. 
+ * + * @param resource the resource being submitted for reconciliation + * @param retryInfo the current retry state information for the reconciliation request + * @param metadata metadata associated with the resource being processed + */ + default void submittedForReconciliation( HasMetadata resource, RetryInfo retryInfo, Map metadata) {} + default void reconciliationExecutionStarted(HasMetadata resource, Map metadata) {} + /** * Called when a precedent reconciliation for the resource associated with the specified {@link * ResourceID} resulted in the provided exception, resulting in a retry of the reconciliation. @@ -70,8 +85,24 @@ default void reconcileCustomResource( default void failedReconciliation( HasMetadata resource, Exception exception, Map metadata) {} - default void reconciliationExecutionStarted(HasMetadata resource, Map metadata) {} + /** + * Called when the {@link + * io.javaoperatorsdk.operator.api.reconciler.Reconciler#reconcile(HasMetadata, Context)} method + * of the Reconciler associated with the resource associated with the specified {@link ResourceID} + * has successfully finished. + * + * @param resource the {@link ResourceID} associated with the resource being processed + * @param metadata metadata associated with the resource being processed + */ + default void successfullyFinishedReconciliation( + HasMetadata resource, Map metadata) {} + /** + * Called whenever a reconciliation execution finishes, not only when it finishes successfully.
+ * + * @param resource the {@link ResourceID} associated with the resource being processed + * @param metadata metadata associated with the resource being processed + */ default void reconciliationExecutionFinished( HasMetadata resource, Map metadata) {} @@ -85,15 +116,14 @@ default void reconciliationExecutionFinished( default void cleanupDoneFor(ResourceID resourceID, Map metadata) {} /** - * Called when the {@link - * io.javaoperatorsdk.operator.api.reconciler.Reconciler#reconcile(HasMetadata, Context)} method - * of the Reconciler associated with the resource associated with the specified {@link ResourceID} - * has sucessfully finished. - * + * @deprecated use {@link Metrics#successfullyFinishedReconciliation(HasMetadata, Map)} * @param resource the {@link ResourceID} associated with the resource being processed * @param metadata metadata associated with the resource being processed */ - default void finishedReconciliation(HasMetadata resource, Map metadata) {} + @Deprecated(forRemoval = true) + default void finishedReconciliation(HasMetadata resource, Map metadata) { + successfullyFinishedReconciliation(resource, metadata); + } /** * Encapsulates the information about a controller execution i.e. 
a call to either {@link @@ -185,6 +215,7 @@ default T timeControllerExecution(ControllerExecution execution) throws E * @param the type of the Map being monitored */ @SuppressWarnings("unused") + @Deprecated(forRemoval = true) default > T monitorSizeOf(T map, String name) { return map; } diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java index b476c39614..e36ea9c600 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java @@ -298,7 +298,6 @@ synchronized void eventProcessingFinished( cleanupForDeletedEvent(executionScope.getResourceID()); } else if (postExecutionControl.isFinalizerRemoved()) { state.markProcessedMarkForDeletion(); - metrics.cleanupDoneFor(resourceID, metricsMetadata); } else { if (state.eventPresent() || isTriggerOnAllEventAndDeleteEventPresent(state)) { log.debug("Submitting for reconciliation."); @@ -372,20 +371,18 @@ private void handleRetryOnException(ExecutionScope

executionScope, Exception state.eventPresent() || (triggerOnAllEvents() && state.isAdditionalEventPresentAfterDeleteEvent()); state.markEventReceived(triggerOnAllEvents()); - retryAwareErrorLogging(state.getRetry(), eventPresent, exception, executionScope); + metrics.failedReconciliation(executionScope.getResource(), exception, metricsMetadata); if (eventPresent) { log.debug("New events exists for for resource id: {}", resourceID); submitReconciliationExecution(state); return; } Optional nextDelay = state.getRetry().nextDelay(); - nextDelay.ifPresentOrElse( delay -> { log.debug( "Scheduling timer event for retry with delay:{} for resource: {}", delay, resourceID); - metrics.failedReconciliation(executionScope.getResource(), exception, metricsMetadata); retryEventSource().scheduleOnce(resourceID, delay); }, () -> { diff --git a/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java b/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java index 68142048b6..36a3ca0877 100644 --- a/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java +++ b/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java @@ -141,13 +141,13 @@ void cleanupDoneFor_shouldDelegateToAllMetricsInOrder() { } @Test - void finishedReconciliation_shouldDelegateToAllMetricsInOrder() { - aggregatedMetrics.finishedReconciliation(resource, metadata); + void successfullyFinishedReconciliation_shouldDelegateToAllMetricsInOrder() { + aggregatedMetrics.successfullyFinishedReconciliation(resource, metadata); final var inOrder = inOrder(metrics1, metrics2, metrics3); - inOrder.verify(metrics1).finishedReconciliation(resource, metadata); - inOrder.verify(metrics2).finishedReconciliation(resource, metadata); - inOrder.verify(metrics3).finishedReconciliation(resource, metadata); + 
inOrder.verify(metrics1).successfullyFinishedReconciliation(resource, metadata); + inOrder.verify(metrics2).successfullyFinishedReconciliation(resource, metadata); + inOrder.verify(metrics3).successfullyFinishedReconciliation(resource, metadata); verifyNoMoreInteractions(metrics1, metrics2, metrics3); } diff --git a/sample-operators/mysql-schema/src/main/java/io/javaoperatorsdk/operator/sample/MySQLSchemaOperator.java b/sample-operators/mysql-schema/src/main/java/io/javaoperatorsdk/operator/sample/MySQLSchemaOperator.java index 20dafac5be..3e8a9df13f 100644 --- a/sample-operators/mysql-schema/src/main/java/io/javaoperatorsdk/operator/sample/MySQLSchemaOperator.java +++ b/sample-operators/mysql-schema/src/main/java/io/javaoperatorsdk/operator/sample/MySQLSchemaOperator.java @@ -26,7 +26,7 @@ import org.takes.http.FtBasic; import io.javaoperatorsdk.operator.Operator; -import io.javaoperatorsdk.operator.monitoring.micrometer.MicrometerMetrics; +import io.javaoperatorsdk.operator.monitoring.micrometer.MicrometerMetricsV2; import io.javaoperatorsdk.operator.sample.dependent.ResourcePollerConfig; import io.javaoperatorsdk.operator.sample.dependent.SchemaDependentResource; import io.micrometer.core.instrument.logging.LoggingMeterRegistry; @@ -42,7 +42,8 @@ public static void main(String[] args) throws IOException { new Operator( overrider -> overrider.withMetrics( - MicrometerMetrics.withoutPerResourceMetrics(new LoggingMeterRegistry()))); + new MicrometerMetricsV2.MicrometerMetricsV2Builder(new LoggingMeterRegistry()) + .build())); MySQLSchemaReconciler schemaReconciler = new MySQLSchemaReconciler(); diff --git a/sample-operators/webpage/README.md b/sample-operators/webpage/README.md index 7718d0f2f3..96329d18a9 100644 --- a/sample-operators/webpage/README.md +++ b/sample-operators/webpage/README.md @@ -76,3 +76,6 @@ of your choice. The JAR file is built using your local Maven and JDK and then co 1. 
Deploy the CRD: `kubectl apply -f target/classes/META-INF/fabric8/webpages.sample.javaoperatorsdk-v1.yml` 2. Deploy the operator: `kubectl apply -f k8s/operator.yaml` + +To install observability components (such as Prometheus, OpenTelemetry, and Grafana), execute: +[install-observability.sh](../../observability/install-observability.sh) diff --git a/sample-operators/webpage/k8s/webpage2.yaml b/sample-operators/webpage/k8s/webpage2.yaml new file mode 100644 index 0000000000..e9ae5ab19e --- /dev/null +++ b/sample-operators/webpage/k8s/webpage2.yaml @@ -0,0 +1,34 @@ +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +apiVersion: "sample.javaoperatorsdk/v1" +kind: WebPage +metadata: +# Use labels to match the resource with different reconciler implementations: +# labels: +# low-level: "true" + name: hellows2 +spec: + exposed: false + html: | + + + Hello Operator World + + + Hello World!
+ + diff --git a/sample-operators/webpage/pom.xml b/sample-operators/webpage/pom.xml index 6ec60340ae..f8c79cf268 100644 --- a/sample-operators/webpage/pom.xml +++ b/sample-operators/webpage/pom.xml @@ -39,6 +39,13 @@ pom import + + io.micrometer + micrometer-bom + ${micrometer-core.version} + pom + import + @@ -47,6 +54,20 @@ io.javaoperatorsdk operator-framework + + io.javaoperatorsdk + micrometer-support + + + io.micrometer + micrometer-registry-otlp + ${micrometer-core.version} + + + org.yaml + snakeyaml + 2.3 + org.apache.logging.log4j log4j-slf4j2-impl diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index 5366dc2e9a..3166f84220 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -16,14 +16,30 @@ package io.javaoperatorsdk.operator.sample; import java.io.IOException; +import java.io.InputStream; import java.net.InetSocketAddress; +import java.time.Duration; +import java.util.HashMap; +import java.util.Map; +import org.jspecify.annotations.NonNull; +import org.jspecify.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.yaml.snakeyaml.Yaml; import io.javaoperatorsdk.operator.Operator; +import io.javaoperatorsdk.operator.api.monitoring.Metrics; +import io.javaoperatorsdk.operator.monitoring.micrometer.MicrometerMetricsV2; import io.javaoperatorsdk.operator.sample.probes.LivenessHandler; import io.javaoperatorsdk.operator.sample.probes.StartupHandler; +import io.micrometer.core.instrument.Clock; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.composite.CompositeMeterRegistry; +import io.micrometer.core.instrument.logging.LoggingMeterRegistry; +import 
io.micrometer.core.instrument.logging.LoggingRegistryConfig; +import io.micrometer.registry.otlp.OtlpConfig; +import io.micrometer.registry.otlp.OtlpMeterRegistry; import com.sun.net.httpserver.HttpServer; @@ -40,7 +56,12 @@ public class WebPageOperator { public static void main(String[] args) throws IOException { log.info("WebServer Operator starting!"); - Operator operator = new Operator(o -> o.withStopOnInformerErrorDuringStartup(false)); + // TODO add test for checking if there are metrics in prometheus + // Load configuration from config.yaml + Metrics metrics = initOTLPMetrics(); + Operator operator = + new Operator(o -> o.withStopOnInformerErrorDuringStartup(false).withMetrics(metrics)); + String reconcilerEnvVar = System.getenv(WEBPAGE_RECONCILER_ENV); if (WEBPAGE_CLASSIC_RECONCILER_ENV_VALUE.equals(reconcilerEnvVar)) { operator.register(new WebPageReconciler()); @@ -58,4 +79,93 @@ public static void main(String[] args) throws IOException { server.setExecutor(null); server.start(); } + + private static @NonNull Metrics initOTLPMetrics() { + CompositeMeterRegistry compositeRegistry = new CompositeMeterRegistry(); + + // Add OTLP registry + Map configProperties = loadConfigFromYaml(); + var otlpConfig = + new OtlpConfig() { + @Override + public String prefix() { + return ""; + } + + @Override + public @Nullable String get(String key) { + return configProperties.get(key); + } + + // these should come from env variables + @Override + public Map resourceAttributes() { + return Map.of("service.name", "josdk", "operator", "webpage"); + } + }; + + MeterRegistry otlpRegistry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); + compositeRegistry.add(otlpRegistry); + + // Add console logging registry if enabled (for development) + // String enableConsoleLogging = System.getenv("METRICS_CONSOLE_LOGGING"); + // todo remove + String enableConsoleLogging = "true"; + if ("true".equalsIgnoreCase(enableConsoleLogging)) { + log.info("Console metrics logging enabled"); + 
LoggingMeterRegistry loggingRegistry = + new LoggingMeterRegistry( + new LoggingRegistryConfig() { + @Override + public String get(String key) { + return null; + } + + @Override + public Duration step() { + return Duration.ofSeconds(10); // Log metrics every 10 seconds + } + }, + Clock.SYSTEM); + compositeRegistry.add(loggingRegistry); + } + + // Register JVM and system metrics + log.info("Registering JVM and system metrics..."); + // todo add back + // new JvmMemoryMetrics().bindTo(compositeRegistry); + // new JvmGcMetrics().bindTo(compositeRegistry); + // new JvmThreadMetrics().bindTo(compositeRegistry); + // new ClassLoaderMetrics().bindTo(compositeRegistry); + // new ProcessorMetrics().bindTo(compositeRegistry); + // new UptimeMetrics().bindTo(compositeRegistry); + + return MicrometerMetricsV2.newPerResourceCollectingMicrometerMetricsBuilder(compositeRegistry) + .build(); + } + + @SuppressWarnings("unchecked") + private static Map loadConfigFromYaml() { + Map configMap = new HashMap<>(); + try (InputStream inputStream = WebPageOperator.class.getResourceAsStream("/otlp-config.yaml")) { + if (inputStream == null) { + log.warn("otlp-config.yaml not found in resources, using default OTLP configuration"); + return configMap; + } + + Yaml yaml = new Yaml(); + Map yamlData = yaml.load(inputStream); + + // Navigate to otlp section and map properties directly + Map otlp = (Map) yamlData.get("otlp"); + if (otlp != null) { + otlp.forEach((key, value) -> configMap.put("otlp."
+ key, value.toString())); + } + + log.info("Loaded OTLP configuration from otlp-config.yaml: {}", configMap); + } catch (IOException e) { + log.error("Error loading otlp-config.yaml", e); + } + return configMap; + } } diff --git a/sample-operators/webpage/src/main/resources/log4j2.xml b/sample-operators/webpage/src/main/resources/log4j2.xml index 0bf270c7e6..7cced1edbd 100644 --- a/sample-operators/webpage/src/main/resources/log4j2.xml +++ b/sample-operators/webpage/src/main/resources/log4j2.xml @@ -19,11 +19,11 @@ - + - + diff --git a/sample-operators/webpage/src/main/resources/otlp-config.yaml b/sample-operators/webpage/src/main/resources/otlp-config.yaml new file mode 100644 index 0000000000..17d773eb70 --- /dev/null +++ b/sample-operators/webpage/src/main/resources/otlp-config.yaml @@ -0,0 +1,23 @@ +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +otlp: + # OTLP Collector endpoint - see observability/install-observability.sh for setup + url: "http://localhost:4318/v1/metrics" +# url: "http://otel-collector-collector.observability.svc.cluster.local:4318/v1/metrics" + step: 15s + batchSize: 15000 + aggregationTemporality: "cumulative"