From 3cccd102ba543e02725d247893729e5c73b38295 Mon Sep 17 00:00:00 2001
From: GitLab Bot <gitlab-bot@gitlab.com>
Date: Wed, 20 Apr 2022 10:00:54 +0000
Subject: Add latest changes from gitlab-org/gitlab@14-10-stable-ee

---
 .../dashboards/error_budget_detail.md              | 127 +++++++++++++
 .../img/error_budget_detail_7d_budget.png          | Bin 0 -> 20753 bytes
 .../dashboards/img/error_budget_detail_sli.png     | Bin 0 -> 50340 bytes
 .../img/error_budget_detail_sli_detail.png         | Bin 0 -> 97895 bytes
 ...error_budget_detail_stage_group_aggregation.png | Bin 0 -> 25253 bytes
 .../img/stage_group_dashboards_28d_budget.png      | Bin 0 -> 16913 bytes
 .../img/stage_group_dashboards_annotation.png      | Bin 0 -> 13544 bytes
 .../img/stage_group_dashboards_debug_1.png         | Bin 0 -> 41296 bytes
 .../img/stage_group_dashboards_debug_2.png         | Bin 0 -> 37361 bytes
 .../img/stage_group_dashboards_debug_3.png         | Bin 0 -> 51282 bytes
 .../img/stage_group_dashboards_filters.png         | Bin 0 -> 10648 bytes
 .../img/stage_group_dashboards_metrics.png         | Bin 0 -> 38776 bytes
 .../stage_group_dashboards_time_customization.png  | Bin 0 -> 20025 bytes
 .../img/stage_group_dashboards_time_filter.png     | Bin 0 -> 22641 bytes
 .../stage_group_observability/dashboards/index.md  |  70 ++++++++
 .../dashboards/stage_group_dashboard.md            | 200 +++++++++++++++++++++
 .../stage_group_dashboards_error_attribution.png   | Bin 0 -> 61561 bytes
 .../stage_group_dashboards_service_sli_detail.png  | Bin 0 -> 41130 bytes
 doc/development/stage_group_observability/index.md | 138 ++++++++++++++
 19 files changed, 535 insertions(+)
 create mode 100644 doc/development/stage_group_observability/dashboards/error_budget_detail.md
 create mode 100644 doc/development/stage_group_observability/dashboards/img/error_budget_detail_7d_budget.png
 create mode 100644 doc/development/stage_group_observability/dashboards/img/error_budget_detail_sli.png
 create mode 100644 doc/development/stage_group_observability/dashboards/img/error_budget_detail_sli_detail.png
 create mode 100644 doc/development/stage_group_observability/dashboards/img/error_budget_detail_stage_group_aggregation.png
 create mode 100644 doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_28d_budget.png
 create mode 100644 doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_annotation.png
 create mode 100644 doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_debug_1.png
 create mode 100644 doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_debug_2.png
 create mode 100644 doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_debug_3.png
 create mode 100644 doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_filters.png
 create mode 100644 doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_metrics.png
 create mode 100644 doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_time_customization.png
 create mode 100644 doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_time_filter.png
 create mode 100644 doc/development/stage_group_observability/dashboards/index.md
 create mode 100644 doc/development/stage_group_observability/dashboards/stage_group_dashboard.md
 create mode 100644 doc/development/stage_group_observability/img/stage_group_dashboards_error_attribution.png
 create mode 100644 doc/development/stage_group_observability/img/stage_group_dashboards_service_sli_detail.png
 create mode 100644 doc/development/stage_group_observability/index.md

(limited to 'doc/development/stage_group_observability')

diff --git a/doc/development/stage_group_observability/dashboards/error_budget_detail.md b/doc/development/stage_group_observability/dashboards/error_budget_detail.md
new file mode 100644
index 00000000000..19f98d404e7
--- /dev/null
+++ b/doc/development/stage_group_observability/dashboards/error_budget_detail.md
@@ -0,0 +1,127 @@
+---
+stage: Platforms
+group: Scalability
+info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments
+---
+
+# Error budget detail dashboard
+
+With error budget detail dashboards, you can explore the error budget
+spent at specific moments in time. By default, the dashboard shows
+the past 28 days. You can adjust it with the [time range controls](index.md#time-range-controls)
+or by selecting a range on one of the graphs.
+
+This dashboard is the same kind of dashboard we use for service level
+monitoring. For example, see the
+[overview dashboard for the web service](https://dashboards.gitlab.net/d/web-main) (GitLab internal).
+
+## Error budget panels
+
+At the top of each dashboard, there's the same panel with the [error budget](../index.md#error-budget).
+Here, the time based targets adjust depending on the range.
+For example, while the budget was 20 minutes per 28 days, it is only 1/4 of that for 7 days:
+
+![5m budget in 7 days](img/error_budget_detail_7d_budget.png)
+
+Also, keep in mind that Grafana rounds the numbers. In this example the
+total time spent is 5 minutes and 24 seconds, so 24 seconds over
+budget.
+
+The attribution panels also show only failures that occurred
+within the selected range.
+
+These two panels represent a view of the "official" error budget: they
+take into account if an SLI was ignored.
+The [attribution panels](../index.md#check-where-budget-is-being-spent) show which components
+contributed the most over the selected period.
+
+The panels below take into account all SLIs that contribute to GitLab.com availability.
+This includes SLIs that are ignored for the official error budget.
+
+## Time series for aggregations
+
+Each time series row for aggregations contains three panels:
+
+- Apdex: the [Apdex score](https://en.wikipedia.org/wiki/Apdex) for one or more SLIs. Higher score is better.
+- Error Ratio: the error ratio for one or more SLIs. Lower is better.
+- Requests Per Second: the number of operations per second. Higher means a bigger impact on the error budget.
+
+The Apdex and error-ratio panels also contain two alerting thresholds:
+
+- The one-hour threshold: the fast burn rate.
+
+  When this line is crossed, we've spent 2% of our monthly budget in the last hour.
+
+- The six-hour threshold: the slow burn rate.
+
+  When this line is crossed, we've spent 5% of our monthly budget in the last six hours.
+
+If there is no error-ratio or Apdex for a certain SLI, the panel is hidden.
+
+Read more about these alerting windows in
+[Google SRE workbook](https://sre.google/workbook/alerting-on-slos/#recommended_time_windows_and_burn_rates_f).
+
+We don't have alerting on these metrics for stage groups.
+This work is being discussed in [epic 615](https://gitlab.com/groups/gitlab-com/gl-infra/-/epics/615).
+If this is something you would like for your group, let us know there.
+
+### Stage group aggregation
+
+![stage group aggregation graphs](img/error_budget_detail_stage_group_aggregation.png)
+
+The stage group aggregation shows graphs with the Apdex and error
+portions of the error budget over time. The lower a dip in the Apdex
+graph or the higher a peak on the error ratio graph, the more budget
+was spent at that moment.
+
+The third graph shows the sum of all the request rates for all
+SLIs. Higher means there was more traffic.
+
+To zoom in on a particular moment where a lot of budget was spent, select the appropriate time in
+the graph.
+
+### Service-level indicators
+
+![Rails requests service level indicator](img/error_budget_detail_sli.png)
+
+This time series shows a breakdown of each SLI that could be contributing to the
+error budget for a stage group. Similar to the stage group
+aggregation, it contains an Apdex score, error ratio, and request
+rate.
+
+Here we also display an explanation panel, describing the SLI and
+linking to other monitoring tools. The links to logs (📖) or
+visualizations (📈) in Kibana are scoped to the feature categories
+for your stage group, and limited to the range selected. Keep in mind
+that we only keep logs in Kibana for seven days.
+
+In the graphs, there is a single line per service. In the previous example image,
+`rails_requests` is an SLI for the `web`, `api` and `git` services.
+
+Sidekiq is not included in this dashboard. We're tracking this in
+[epic 700](https://gitlab.com/groups/gitlab-com/gl-infra/-/epics/700).
+
+### SLI detail
+
+![Rails requests SLI detail](img/error_budget_detail_sli_detail.png)
+
+The SLI details row shows a breakdown of a specific SLI based on the
+labels present on the source metrics.
+
+For example, in the previous image, the `rails_requests` SLI has an `endpoint_id` label.
+We can show how much a certain endpoint was requested (RPS), and how much it contributed to the error
+budget spend.
+
+For Apdex we show the **Apdex Attribution** panel. The more prominent
+color is the one that contributed most to the spend. To see the
+top spending endpoint over the entire range, sort by the average.
+
+For error ratio we show an error rate. To see which label contributed most to the spend, sort by the
+average.
+
+We don't have endpoint information available for Rails errors. This work is being planned in
+[epic 663](https://gitlab.com/groups/gitlab-com/gl-infra/-/epics/663).
+
+The number of series to be loaded in the SLI details graphs is very
+high when compared to the other aggregations. Because of this, it's not possible to
+load more than a few days' worth of data.
diff --git a/doc/development/stage_group_observability/dashboards/img/error_budget_detail_7d_budget.png b/doc/development/stage_group_observability/dashboards/img/error_budget_detail_7d_budget.png
new file mode 100644
index 00000000000..1b2996d7d26
Binary files /dev/null and b/doc/development/stage_group_observability/dashboards/img/error_budget_detail_7d_budget.png differ
diff --git a/doc/development/stage_group_observability/dashboards/img/error_budget_detail_sli.png b/doc/development/stage_group_observability/dashboards/img/error_budget_detail_sli.png
new file mode 100644
index 00000000000..0472e35b0cb
Binary files /dev/null and b/doc/development/stage_group_observability/dashboards/img/error_budget_detail_sli.png differ
diff --git a/doc/development/stage_group_observability/dashboards/img/error_budget_detail_sli_detail.png b/doc/development/stage_group_observability/dashboards/img/error_budget_detail_sli_detail.png
new file mode 100644
index 00000000000..99530886ae9
Binary files /dev/null and b/doc/development/stage_group_observability/dashboards/img/error_budget_detail_sli_detail.png differ
diff --git a/doc/development/stage_group_observability/dashboards/img/error_budget_detail_stage_group_aggregation.png b/doc/development/stage_group_observability/dashboards/img/error_budget_detail_stage_group_aggregation.png
new file mode 100644
index 00000000000..d679637dcc4
Binary files /dev/null and b/doc/development/stage_group_observability/dashboards/img/error_budget_detail_stage_group_aggregation.png differ
diff --git a/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_28d_budget.png b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_28d_budget.png
new file mode 100644
index 00000000000..eb164dd3f68
Binary files /dev/null and b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_28d_budget.png differ
diff --git a/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_annotation.png b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_annotation.png
new file mode 100644
index 00000000000..3776d87e5bb
Binary files /dev/null and b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_annotation.png differ
diff --git a/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_debug_1.png b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_debug_1.png
new file mode 100644
index 00000000000..309fad89120
Binary files /dev/null and b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_debug_1.png differ
diff --git a/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_debug_2.png b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_debug_2.png
new file mode 100644
index 00000000000..2aad9ab5592
Binary files /dev/null and b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_debug_2.png differ
diff --git a/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_debug_3.png b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_debug_3.png
new file mode 100644
index 00000000000..38647410ffd
Binary files /dev/null and b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_debug_3.png differ
diff --git a/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_filters.png b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_filters.png
new file mode 100644
index 00000000000..27a836bc36d
Binary files /dev/null and b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_filters.png differ
diff --git a/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_metrics.png b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_metrics.png
new file mode 100644
index 00000000000..6b6faff6e3b
Binary files /dev/null and b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_metrics.png differ
diff --git a/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_time_customization.png b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_time_customization.png
new file mode 100644
index 00000000000..49e61183b7c
Binary files /dev/null and b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_time_customization.png differ
diff --git a/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_time_filter.png b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_time_filter.png
new file mode 100644
index 00000000000..81a3dc789f1
Binary files /dev/null and b/doc/development/stage_group_observability/dashboards/img/stage_group_dashboards_time_filter.png differ
diff --git a/doc/development/stage_group_observability/dashboards/index.md b/doc/development/stage_group_observability/dashboards/index.md
new file mode 100644
index 00000000000..f4e646c8634
--- /dev/null
+++ b/doc/development/stage_group_observability/dashboards/index.md
@@ -0,0 +1,70 @@
+---
+stage: Platforms
+group: Scalability
+info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments
+---
+
+# Dashboards for stage groups
+
+We generate a lot of dashboards acting as windows to the metrics we
+use to monitor GitLab.com. Most of our dashboards are generated from
+Jsonnet in the
+[runbooks repository](https://gitlab.com/gitlab-com/runbooks/-/tree/master/dashboards#dashboard-source).
+Anyone can contribute to these, adding new dashboards or modifying
+existing ones.
+
+When adding new dashboards for your stage groups, tagging them with
+`stage_group:<group name>` cross-links the dashboard on other
+dashboards with the same tag. You can create dashboards for stage groups
+in the [`dashboards/stage-groups`](https://gitlab.com/gitlab-com/runbooks/-/tree/master/dashboards/stage-groups)
+directory. Directories can't be nested more than one level deep.
+
+To see a list of all the dashboards for your stage group:
+
+1. In Grafana, go to the [Dashboard browser](https://dashboards.gitlab.net/dashboards?tag=stage-groups).
+1. To see all of the dashboards for a specific group, filter for `stage_group:<group name>`.
+
+Some generated dashboards are already available:
+
+1. [Stage group dashboard](stage_group_dashboard.md): a customizable
+   dashboard with tailored metrics per group.
+1. [Error budget detail dashboard](error_budget_detail.md): a
+   dashboard that allows you to explore the error budget spend over time and
+   over multiple SLIs.
+
+## Time range controls
+
+![Default time filter](img/stage_group_dashboards_time_filter.png)
+
+By default, all the times are in the UTC time zone.
+[We use UTC when communicating in Engineering.](https://about.gitlab.com/handbook/communication/#writing-style-guidelines)
+
+All metrics recorded in the GitLab production system have
+[one-year retention](https://gitlab.com/gitlab-cookbooks/gitlab-prometheus/-/blob/31526b03fef823e2f9b3cda7c75dcd28a12418a3/attributes/prometheus.rb#L40).
+
+You can also zoom in and filter the time range directly on a graph. For more information, see the
+[Grafana Time Range Controls](https://grafana.com/docs/grafana/latest/dashboards/time-range-controls/)
+documentation.
+
+## Filters and annotations
+
+On each dashboard, there are three filters and some annotation switches on the top of the page.
+
+Some special events are meaningful to development and operational activities.
+[Grafana annotations](https://grafana.com/docs/grafana/latest/dashboards/annotations/) mark them
+directly on the graphs.
+
+![Filters and annotations](img/stage_group_dashboards_filters.png)
+
+| Name            | Type       | Description |
+| --------------- | ---------- | ----------- |
+| `PROMETHEUS_DS` | filter     | Select which [Prometheus data sources](https://about.gitlab.com/handbook/engineering/monitoring/#prometheus) to query. The default value is `Global`, which aggregates the data from all available data sources. Most of the time, you don't need to care about this filter. |
+| `environment`   | filter     | Filter the environment the metrics are fetched from. The default setting is production (`gprd`). For other options, see [Production Environment mapping](https://about.gitlab.com/handbook/engineering/infrastructure/production/architecture/#environments). |
+| `stage`         | filter     | Filter metrics by stage: `main` or `cny` for canary. The default is `main`. |
+| `deploy`        | annotation | Mark a deployment event on the GitLab.com SaaS platform. |
+| `canary-deploy` | annotation | Mark a [canary deployment](https://about.gitlab.com/handbook/engineering/#canary-testing) event on the GitLab.com SaaS platform. |
+| `feature-flags` | annotation | Mark the time point when a feature flag is updated. |
+
+Example of a feature flag annotation displayed on a dashboard panel:
+
+![Annotations](img/stage_group_dashboards_annotation.png)
diff --git a/doc/development/stage_group_observability/dashboards/stage_group_dashboard.md b/doc/development/stage_group_observability/dashboards/stage_group_dashboard.md
new file mode 100644
index 00000000000..c1831cfce69
--- /dev/null
+++ b/doc/development/stage_group_observability/dashboards/stage_group_dashboard.md
@@ -0,0 +1,200 @@
+---
+stage: Platforms
+group: Scalability
+info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments
+---
+
+# Stage group dashboard
+
+The stage group dashboard is a generated dashboard that contains metrics
+for common components used by most stage groups. The dashboard is
+fully customizable and owned by the stage groups.
+
+This page explains what is on these dashboards, how to use their
+contents, and how they can be customized.
+
+## Dashboard contents
+
+### Error budget panels
+
+![28 day budget](img/stage_group_dashboards_28d_budget.png)
+
+The top panels display the [error budget](../index.md#error-budget).
+These panels always show the 28 days before the end time selected in the
+[time range controls](index.md#time-range-controls). This data doesn't
+follow the selected range. It does respect the filters for environment
+and stage.
+
+### Metrics panels
+
+![Metrics panels](img/stage_group_dashboards_metrics.png)
+
+Although most of the metrics displayed in the panels are self-explanatory in their title and nearby
+description, note the following:
+
+- The events are counted, measured, accumulated, collected, and stored as
+  [time series](https://prometheus.io/docs/concepts/data_model/). The data is calculated using
+  statistical methods to produce metrics. It means that metrics are approximately correct and
+  meaningful over a time period. They help you get an overview of the state of a system over time.
+  They are not meant to give you precise numbers of a discrete event.
+
+  If you need a higher level of accuracy, use another monitoring tool, such as
+  [logs](https://about.gitlab.com/handbook/engineering/monitoring/#logs).
+  Read the following examples for more explanations.
+- All the rate metrics' units are `requests per second`. The default aggregate time frame is 1 minute.
+
+  For example, a panel shows the requests per second number at `2020-12-25 00:42:00` to be `34.13`.
+  This means that during minute 42 (from `2020-12-25 00:42:00` to `2020-12-25 00:42:59`), approximately
+  `34.13 * 60 = ~ 2047` requests were processed by the web servers.
+- You might frequently encounter some gotchas related to decimal fractions and rounding, especially
+  in low-traffic cases. For example, the error rate of `RepositoryUpdateMirrorWorker` at
+  `2020-12-25 02:04:00` is `0.07`, equivalent to `4.2` jobs per minute. The raw result is
+  `0.06666666667`, equivalent to 4 jobs per minute.
+- All the rate metrics are more accurate when the data is big enough. The default floating-point
+  precision is 2. In some extremely low-traffic panels, you can see `0.00`, even though there is still some
+  real traffic.
+
+To inspect the raw data of the panel for further calculation, select **Inspect** from the dropdown
+list of a panel. Queries, raw data, and panel JSON structure are available.
+Read more at [Grafana panel inspection](https://grafana.com/docs/grafana/latest/panels/inspect-panel/).
+
+All the dashboards are powered by [Grafana](https://grafana.com/), a frontend for displaying metrics.
+Grafana consumes the data returned from queries to the backend Prometheus data source, then presents it
+with visualizations. The stage group dashboards are built to serve the most common use cases with a
+limited set of filters and pre-built queries. Grafana provides a way to explore and visualize the
+metrics data with [Grafana Explore](https://grafana.com/docs/grafana/latest/explore/). This requires
+some knowledge of the [Prometheus PromQL query language](https://prometheus.io/docs/prometheus/latest/querying/basics/).
+
+## Example: Debugging with dashboards
+
+Example debugging workflow:
+
+1. A team member in the Code Review group has merged an MR which got deployed to production.
+1. To verify the deployment, you can check the
+   [Code Review group's dashboard](https://dashboards.gitlab.net/d/stage-groups-code_review/stage-groups-group-dashboard-create-code-review?orgId=1).
+1. The Sidekiq Error Rate panel shows an elevated error rate, specifically `UpdateMergeRequestsWorker`.
+
+   ![Debug 1](img/stage_group_dashboards_debug_1.png)
+
+1. If you select **Kibana: Kibana Sidekiq failed request logs** in the **Extra links** section, you can filter for `UpdateMergeRequestsWorker` and read through the logs.
+
+   ![Debug 2](img/stage_group_dashboards_debug_2.png)
+
+1. In [Sentry](https://sentry.gitlab.net/gitlab/gitlabcom/), you can find the exception by
+   filtering by transaction type and the `correlation_id` from Kibana's result item.
+
+   ![Debug 3](img/stage_group_dashboards_debug_3.png)
+
+1. A precise exception, including a stack trace, job arguments, and other information should now appear.
+
+Happy debugging!
+
+## Customizing the dashboard
+
+All Grafana dashboards at GitLab are generated from the [Jsonnet files](https://github.com/grafana/grafonnet-lib)
+stored in [the runbooks project](https://gitlab.com/gitlab-com/runbooks/-/tree/master/dashboards).
+In particular, the stage group dashboard definitions are stored in
+[`/dashboards/stage-groups`](https://gitlab.com/gitlab-com/runbooks/-/tree/master/dashboards/stage-groups).
+
+By convention, each group has a corresponding Jsonnet file. The dashboards are synced with GitLab
+[stage group data](https://gitlab.com/gitlab-com/www-gitlab-com/-/raw/master/data/stages.yml) every
+month.
+
+Extensibility and customization were key principles when we designed this system.
+To customize your group's dashboard, edit the corresponding file and follow the
+[Runbook workflow](https://gitlab.com/gitlab-com/runbooks/-/tree/master/dashboards#dashboard-source).
+The dashboard is updated after the MR is merged.
+
+Looking at an autogenerated file, for example,
+[`product_planning.dashboard.jsonnet`](https://gitlab.com/gitlab-com/runbooks/-/blob/master/dashboards/stage-groups/product_planning.dashboard.jsonnet):
+
+```jsonnet
+// This file is autogenerated using scripts/update_stage_groups_dashboards.rb
+// Please feel free to customize this file.
+local stageGroupDashboards = import './stage-group-dashboards.libsonnet';
+
+stageGroupDashboards.dashboard('product_planning')
+.stageGroupDashboardTrailer()
+```
+
+We provide basic customization to filter out the components essential to your group's activities.
+By default, only the `web`, `api`, and `sidekiq` components are available in the dashboard, while
+`git` is hidden. See [how to enable available components and optional graphs](#optional-graphs).
+
+You can also append further information or custom metrics to a dashboard. The following example
+adds some links and a total request rate to the top of the page:
+
+```jsonnet
+local stageGroupDashboards = import './stage-group-dashboards.libsonnet';
+local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
+local basic = import 'grafana/basic.libsonnet';
+
+stageGroupDashboards.dashboard('source_code')
+.addPanel(
+  grafana.text.new(
+    title='Group information',
+    mode='markdown',
+    content=|||
+      Useful link for the Source Code Management group dashboard:
+      - [Issue list](https://gitlab.com/groups/gitlab-org/-/issues?scope=all&state=opened&label_name%5B%5D=repository)
+      - [Epic list](https://gitlab.com/groups/gitlab-org/-/epics?label_name[]=repository)
+    |||,
+  ),
+  gridPos={ x: 0, y: 0, w: 24, h: 4 }
+)
+.addPanel(
+  basic.timeseries(
+    title='Total Request Rate',
+    yAxisLabel='Requests per Second',
+    decimals=2,
+    query=|||
+      sum (
+        rate(gitlab_transaction_duration_seconds_count{
+          env='$environment',
+          environment='$environment',
+          feature_category=~'source_code_management',
+        }[$__interval])
+      )
+    |||
+  ),
+  gridPos={ x: 0, y: 0, w: 24, h: 7 }
+)
+.stageGroupDashboardTrailer()
+```
+
+![Stage Group Dashboard Customization](img/stage_group_dashboards_time_customization.png)
+
+<i class="fa fa-youtube-play youtube" aria-hidden="true"></i>
+If you want to see the workflow in action, we've recorded a pairing session on customizing a dashboard,
+available on [GitLab Unfiltered](https://youtu.be/shEd_eiUjdI).
+
+For deeper customization and more complicated metrics, visit the
+[Grafonnet lib](https://github.com/grafana/grafonnet-lib) project and the
+[GitLab Prometheus Metrics](../../../administration/monitoring/prometheus/gitlab_metrics.md#gitlab-prometheus-metrics)
+documentation.
+
+### Optional graphs
+
+Some graphs aren't relevant for all groups, so they aren't added to
+the dashboard by default. They can be added by customizing the
+dashboard.
+
+By default, only the `web`, `api`, and `sidekiq` metrics are
+shown. If you wish to see the metrics from the `git` fleet (or any
+other component that might be added in the future), you can configure it as follows:
+
+```jsonnet
+stageGroupDashboards
+.dashboard('source_code', components=stageGroupDashboards.supportedComponents)
+.stageGroupDashboardTrailer()
+```
+
+If your group is interested in Sidekiq job durations and their
+thresholds, you can add these graphs by calling the `.addSidekiqJobDurationByUrgency` function:
+
+```jsonnet
+stageGroupDashboards
+.dashboard('access')
+.addSidekiqJobDurationByUrgency()
+.stageGroupDashboardTrailer()
+```
diff --git a/doc/development/stage_group_observability/img/stage_group_dashboards_error_attribution.png b/doc/development/stage_group_observability/img/stage_group_dashboards_error_attribution.png
new file mode 100644
index 00000000000..f6ea7c004ac
Binary files /dev/null and b/doc/development/stage_group_observability/img/stage_group_dashboards_error_attribution.png differ
diff --git a/doc/development/stage_group_observability/img/stage_group_dashboards_service_sli_detail.png b/doc/development/stage_group_observability/img/stage_group_dashboards_service_sli_detail.png
new file mode 100644
index 00000000000..5dc32063709
Binary files /dev/null and b/doc/development/stage_group_observability/img/stage_group_dashboards_service_sli_detail.png differ
diff --git a/doc/development/stage_group_observability/index.md b/doc/development/stage_group_observability/index.md
new file mode 100644
index 00000000000..868e55735e8
--- /dev/null
+++ b/doc/development/stage_group_observability/index.md
@@ -0,0 +1,138 @@
+---
+stage: Platforms
+group: Scalability
+info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments
+---
+
+# Observability for stage groups
+
+Observability is about bringing visibility into a system to see and
+understand the state of each component, with context, to support
+performance tuning and debugging. To run a SaaS platform at scale, a
+rich and detailed observability platform is needed.
+
+To make information available to [stage groups](https://about.gitlab.com/handbook/product/categories/#hierarchy),
+we aggregate metrics by feature category and then show
+this information on [dashboards](dashboards/index.md) tailored to the groups. Only metrics
+for the features built by the group are visible on their
+dashboards.
+
+With a filtered view, groups can discover bugs and performance regressions that could otherwise
+be missed when viewing aggregated data.
+
+For more specific information on dashboards, see:
+
+- [Dashboards](dashboards/index.md): a general overview of where to find dashboards
+  and how to use them.
+- [Stage group dashboard](dashboards/stage_group_dashboard.md): how to use and customize the stage group dashboard.
+- [Error budget detail](dashboards/error_budget_detail.md): how to explore error budget over time.
+
+## Error budget
+
+The error budget is calculated from the same [Service Level Indicators](https://en.wikipedia.org/wiki/Service_level_indicator) (SLIs)
+that we use to monitor GitLab.com. The 28-day availability number for a
+stage group is comparable to the
+[monthly availability](https://about.gitlab.com/handbook/engineering/infrastructure/performance-indicators/#gitlabcom-availability)
+we calculate for GitLab.com, except it's scoped to the features of a group.
+
+To learn more about how we use error budgets, see the
+[Engineering Error Budgets](https://about.gitlab.com/handbook/engineering/error-budgets/) handbook page.
+
+By default, the first row of panels on both dashboards shows the
+[error budget for the stage group](https://about.gitlab.com/handbook/engineering/error-budgets/#budget-spend-by-stage-group).
+This row shows how features owned by the group contribute to our
+[overall availability](https://about.gitlab.com/handbook/engineering/infrastructure/performance-indicators/#gitlabcom-availability).
+
+The official budget is aggregated over the past 28 days. You can see it on the
+[stage group dashboard](dashboards/stage_group_dashboard.md).
+The [error budget detail dashboard](dashboards/error_budget_detail.md)
+allows customizing the range.
+
+We show the information in two formats:
+
+- Availability: this number can be compared to GitLab.com's overall
+  availability target of 99.95% uptime.
+- Budget Spent: time over the past 28 days that features owned by the group have not been performing
+  adequately.
+
+The budget is calculated based on indicators per component. Each
+component can have two indicators:
+
+- [Apdex](https://en.wikipedia.org/wiki/Apdex): the rate of operations that performed adequately.
+
+  The threshold for "performing adequately" is stored in our
+  [metrics catalog](https://gitlab.com/gitlab-com/runbooks/-/tree/master/metrics-catalog)
+  and depends on the service in question. For the Puma (Rails) component of the
+  [API](https://gitlab.com/gitlab-com/runbooks/-/blob/f22f40b2c2eab37d85e23ccac45e658b2c914445/metrics-catalog/services/api.jsonnet#L127),
+  [Git](https://gitlab.com/gitlab-com/runbooks/-/blob/f22f40b2c2eab37d85e23ccac45e658b2c914445/metrics-catalog/services/git.jsonnet#L216),
+  and
+  [Web](https://gitlab.com/gitlab-com/runbooks/-/blob/f22f40b2c2eab37d85e23ccac45e658b2c914445/metrics-catalog/services/web.jsonnet#L154)
+  services, that threshold is **5 seconds** when not opted in to the
+  [`rails_requests` SLI](../application_slis/rails_request_apdex.md).
+
+  We've made this target configurable in [this project](https://gitlab.com/groups/gitlab-com/gl-infra/-/epics/525).
+  To learn how to customize the request Apdex, see
+  [Rails request Apdex SLI](../application_slis/rails_request_apdex.md).
+  This new Apdex measurement is not part of the error budget until you
+  [opt in](https://gitlab.com/gitlab-com/gl-infra/scalability/-/issues/1451).
+
+  For Sidekiq job execution, the threshold depends on the
+  [job urgency](../sidekiq/worker_attributes.md#job-urgency). It is
+  [currently](https://gitlab.com/gitlab-com/runbooks/-/blob/f22f40b2c2eab37d85e23ccac45e658b2c914445/metrics-catalog/services/lib/sidekiq-helpers.libsonnet#L25-38)
+  **10 seconds** for high-urgency jobs and **5 minutes** for other jobs.
+
+  Some stage groups might have more services. The thresholds for them are also in the metrics catalog.
+
+- Error rate: the rate of operations that had errors.
+
+The calculation of the ratio happens as follows:
+
+```math
+\frac {operations\_meeting\_apdex + (total\_operations - operations\_with\_errors)} {total\_apdex\_measurements + total\_operations}
+```
+
+## Check where budget is being spent
+
+Both the [stage group dashboard](dashboards/stage_group_dashboard.md)
+and the [error budget detail dashboard](dashboards/error_budget_detail.md)
+show panels to see where the error budget was spent. The stage group
+dashboard always shows a fixed 28 days. The error budget detail
+dashboard allows drilling down to the SLIs over time.
+
+The row below the error budget row is collapsed by default. Expanding
+it shows which component and violation type had the most offending
+operations in the past 28 days.
+
+![Error attribution](img/stage_group_dashboards_error_attribution.png)
+
+The first panel on the left shows a table with the number of errors per
+component. Addressing the issues behind the first row in that table has
+the biggest impact on the budget spent.
+
+Commonly, the components that spend most of the budget are Sidekiq or Puma. The panel in
+the center explains what different violation types mean and how to dig
+deeper in the logs.
+
+The panel on the right provides links to Kibana that should reveal
+which endpoints or Sidekiq jobs are causing the errors.
+
+<i class="fa fa-youtube-play youtube" aria-hidden="true"></i>
+To learn how to use these panels and logs for
+determining which Rails endpoints are slow,
+see the [Error Budget Attribution for Purchase group](https://youtu.be/M9u6unON7bU) video.
+
+Other components visible in the table come from
+[service-level indicators](https://sre.google/sre-book/service-level-objectives/) (SLIs) defined
+in the [metrics catalog](https://gitlab.com/gitlab-com/runbooks/-/blob/master/metrics-catalog/README.md).
+
+For those types of failures, you can follow the link to the service
+dashboard linked from the `type` column. The service dashboard
+contains a row specifically for the SLI that is causing the budget
+spend, with links to logs and a description of what the
+component means.
+
+For example, see the `server` component of the `web-pages` service:
+
+![web-pages-server-component SLI](img/stage_group_dashboards_service_sli_detail.png)
+
+To add more SLIs tailored to specific features, you can use an [Application SLI](../application_slis/index.md).
-- 
cgit v1.2.3