#!/usr/bin/env ruby
# frozen_string_literal: true
require 'gitlab'
require 'json'
require 'time'
class Reporter
# Base URL of the GitLab.com REST API (v4); passed as `endpoint` to both API clients.
GITLAB_COM_API_V4_ENDPOINT = "https://gitlab.com/api/v4"
# Project that receives the snippets and report issues created by this script.
QA_EVALUATION_PROJECT_ID = 52020045 # https://gitlab.com/gitlab-org/ai-powered/ai-framework/qa-evaluation
# IID of the issue whose description gets one extra table row appended per report.
AGGREGATED_REPORT_ISSUE_IID = 1 # https://gitlab.com/gitlab-org/ai-powered/ai-framework/qa-evaluation/-/issues/1
# Marker searched for in merge request note bodies to recognise notes posted by this script.
IDENTIFIABLE_NOTE_TAG = 'gitlab-org/ai-powered/ai-framework:duo-chat-qa-evaluation'
# Maps an evaluation grade to the emoji shortcode used for it (same emojis as the report legend).
# NOTE(review): not referenced in the visible part of the file - presumably used by the
# per-evaluation rendering further down; confirm.
GRADE_TO_EMOJI_MAPPING = {
correct: ":white_check_mark:",
incorrect: ":x:",
unexpected: ":warning:"
}.freeze
# Entry point of the reporter.
#
# On a default-branch pipeline the evaluation data is uploaded as a snippet,
# a dedicated report issue is created, and a row linking both is appended to
# the aggregated report issue. On any other pipeline the report is written to
# a file and posted (or refreshed) as a merge request note.
def run
  unless pipeline_running_on_master_branch?
    save_report_as_artifact
    return post_or_update_report_note
  end

  snippet_url = upload_data_as_snippet
  issue_url = create_report_issue
  update_aggregation_issue(issue_url, snippet_url)
end
# Returns the Markdown report: a legend, the summary numbers and the
# per-evaluation content, with every `@username` wrapped in backticks.
#
# The first line is an HTML comment carrying IDENTIFIABLE_NOTE_TAG. It is
# invisible once rendered, but `existing_report_note` recognises earlier
# report notes by searching note bodies for that tag; without it no note
# would ever match and every pipeline would post a new note instead of
# refreshing the previous one.
#
# The unquoted text is memoized in @report; the username quoting is applied
# on every call.
def markdown_report
  @report ||= <<~MARKDOWN
    <!-- #{IDENTIFIABLE_NOTE_TAG} -->
    ## GitLab Duo Chat QA evaluation
    Report generated for "#{ENV['CI_JOB_NAME']}". This report is generated and refreshed automatically. Do not edit.
    LLMs have been asked to evaluate GitLab Duo Chat's answers.
    :white_check_mark: : LLM evaluated the answer as `CORRECT`.
    :x: : LLM evaluated the answer as `INCORRECT`.
    :warning: : LLM did not evaluate correctly or the evaluation request might have failed.
    ### Summary
    - The total number of evaluations: #{summary_numbers[:total]}
    - The number of evaluations in which all LLMs graded `CORRECT`: #{summary_numbers[:correct]} (#{summary_numbers[:correct_ratio]}%)
    - Note: if an evaluation request failed or its response was not parsable, it was ignored. For example, :white_check_mark: :warning: would count as `CORRECT`.
    - The number of evaluations in which all LLMs graded `INCORRECT`: #{summary_numbers[:incorrect]} (#{summary_numbers[:incorrect_ratio]}%)
    - Note: if an evaluation request failed or its response was not parsable, it was ignored. For example, :x: :warning: would count as `INCORRECT`.
    - The number of evaluations in which LLMs disagreed: #{summary_numbers[:disagreed]} (#{summary_numbers[:disagreed_ratio]}%)
    ### Evaluations
    #{eval_content}
  MARKDOWN

  # Do this to avoid pinging users in notes/issues.
  quote_usernames(@report)
end
private
# Wraps every `@word` occurrence in backticks so that GitLab renders it as
# inline code instead of a user mention.
def quote_usernames(text)
  text.gsub(/@\w+/) { |mention| "`#{mention}`" }
end
# True only when the pipeline runs for a branch and that branch is the
# project's default branch.
#
# CI_COMMIT_BRANCH must be present and non-empty: comparing the two variables
# directly would yield `nil == nil` (true) when neither is set, e.g. when the
# script is run outside CI, and would send the run down the default-branch
# path that creates snippets and issues.
def pipeline_running_on_master_branch?
  commit_branch = ENV['CI_COMMIT_BRANCH']
  return false if commit_branch.nil? || commit_branch.empty?

  commit_branch == ENV['CI_DEFAULT_BRANCH']
end
# UTC time captured on first use and reused afterwards, so the snippet, the
# report issue and the aggregated row all carry the same timestamp.
def utc_timestamp
  return @utc_timestamp if @utc_timestamp

  @utc_timestamp = Time.now.utc
end
# Uploads the raw evaluation data, together with the commit SHA and the
# pipeline URL, as a private single-file JSON snippet in the QA evaluation
# project. The file is named after the epoch seconds of `utc_timestamp` and
# the snippet is titled with the same timestamp.
#
# Returns the web URL of the created snippet.
def upload_data_as_snippet
  snippet_file = "#{utc_timestamp.to_i}.json"
  payload = {
    commit: ENV["CI_COMMIT_SHA"],
    pipeline_url: ENV["CI_PIPELINE_URL"],
    data: report_data
  }
  snippet_attributes = {
    title: utc_timestamp.to_s,
    files: [{ file_path: snippet_file, content: ::JSON.pretty_generate(payload) }],
    visibility: 'private'
  }

  puts "Creating a snippet #{snippet_file}."
  qa_evaluation_project_client.create_snippet(QA_EVALUATION_PROJECT_ID, snippet_attributes).web_url
end
# Creates an issue titled "Report <timestamp>" in the QA evaluation project
# with the Markdown report as its description.
#
# Returns the web URL of the new issue.
def create_report_issue
  puts "Creating a report issue."

  title = "Report #{utc_timestamp}"
  qa_evaluation_project_client
    .create_issue(QA_EVALUATION_PROJECT_ID, title, { description: markdown_report })
    .web_url
end
# Appends one Markdown table row to the description of the aggregated report
# issue. The row holds the timestamp, the total number of evaluations, the
# correct / incorrect / disagreed ratios and the links to the report issue
# and the data snippet.
#
# Returns whatever the client's `edit_issue` call returns.
def update_aggregation_issue(report_issue_url, snippet_web_url)
  puts "Updating the aggregated report issue."

  cells = [
    utc_timestamp,
    summary_numbers[:total],
    "#{summary_numbers[:correct_ratio]}%",
    "#{summary_numbers[:incorrect_ratio]}%",
    "#{summary_numbers[:disagreed_ratio]}%",
    report_issue_url,
    snippet_web_url
  ]
  # The row starts on a fresh line; every cell is closed with " |".
  table_row = "\n| #{cells.map { |cell| "#{cell} |" }.join(' ')}"

  current_issue = qa_evaluation_project_client.issue(QA_EVALUATION_PROJECT_ID, AGGREGATED_REPORT_ISSUE_IID)
  extended_description = current_issue.description + table_row

  qa_evaluation_project_client.edit_issue(
    QA_EVALUATION_PROJECT_ID,
    AGGREGATED_REPORT_ISSUE_IID,
    { description: extended_description }
  )
end
# Writes the Markdown report to the file named by QA_EVAL_REPORT_FILENAME
# inside `base_dir`. Returns the number of bytes written.
def save_report_as_artifact
  destination = File.join(base_dir, ENV['QA_EVAL_REPORT_FILENAME'])
  puts "Saving #{destination}"

  File.write(destination, markdown_report)
end
# Publishes the report on the merge request: edits the latest report note in
# place when it exists and is not a discussion note, otherwise posts a new
# note.
def post_or_update_report_note
  latest_note = existing_report_note

  if !latest_note || latest_note.type == 'DiscussionNote'
    # Either there is no report note yet, or the latest one has been
    # discussed on the MR. Leave it alone and post a fresh note.
    gitlab_project_client.create_merge_request_note(ci_project_id, merge_request_iid, markdown_report)
    puts "Posted comment."
  else
    # The latest note has not led to a discussion, so refresh it in place.
    gitlab_project_client.edit_merge_request_note(ci_project_id, merge_request_iid, latest_note.id, markdown_report)
    puts "Updated comment."
  end
end
# Returns the most recently created merge request note whose body contains
# IDENTIFIABLE_NOTE_TAG, or nil when no note matches.
#
# All pages of notes are fetched before filtering. `created_at` is parsed
# with `Time.parse`, which is provided by the `time` standard library (it is
# not part of core `Time`), so the file must `require 'time'`.
def existing_report_note
  all_notes = gitlab_project_client
    .merge_request_notes(ci_project_id, merge_request_iid)
    .auto_paginate

  all_notes
    .select { |note| note.body.include?(IDENTIFIABLE_NOTE_TAG) }
    .max_by { |note| Time.parse(note.created_at) }
end
# Memoized GitLab API client for the project the pipeline runs in,
# authenticated with PROJECT_TOKEN_FOR_CI_SCRIPTS_API_USAGE.
def gitlab_project_client
  return @gitlab_project_client if @gitlab_project_client

  token = ENV['PROJECT_TOKEN_FOR_CI_SCRIPTS_API_USAGE']
  @gitlab_project_client = Gitlab.client(endpoint: GITLAB_COM_API_V4_ENDPOINT, private_token: token)
end
# Memoized GitLab API client for the QA evaluation project, authenticated
# with CHAT_QA_EVALUATION_PROJECT_TOKEN_FOR_CI_SCRIPTS_API_USAGE.
def qa_evaluation_project_client
  return @qa_evaluation_project_client if @qa_evaluation_project_client

  token = ENV['CHAT_QA_EVALUATION_PROJECT_TOKEN_FOR_CI_SCRIPTS_API_USAGE']
  @qa_evaluation_project_client = Gitlab.client(endpoint: GITLAB_COM_API_V4_ENDPOINT, private_token: token)
end
# Directory that input and output paths are resolved against:
# CI_PROJECT_DIR when set, the current directory otherwise.
def base_dir
  ENV.fetch('CI_PROJECT_DIR', "./")
end
# IID of the merge request the pipeline runs for, read from
# CI_MERGE_REQUEST_IID; nil when the variable is not set.
def merge_request_iid
  ENV.fetch('CI_MERGE_REQUEST_IID', nil)
end
# ID of the project the pipeline runs in, read from CI_PROJECT_ID; nil when
# the variable is not set.
def ci_project_id
  ENV.fetch('CI_PROJECT_ID', nil)
end
# Parsed contents of every `tmp/duo_chat/qa*.json` file under `base_dir`,
# concatenated into a single array (array-valued files are flattened one
# level). The result is memoized.
def report_data
  return @report_data if @report_data

  pattern = File.join(base_dir, "tmp/duo_chat/qa*.json")
  @report_data = Dir.glob(pattern).flat_map { |path| JSON.parse(File.read(path)) }
end
def eval_content
report_data
.sort_by { |a| a["question"] }
.map do |data|
<<~MARKDOWN
#{correctness_indicator(data)}
`"#{data['question']}"`
(context: `#{data['resource']}`)
#### Resource
`#{data['resource']}`
#### Answer
#{data['answer']}
#### LLM Evaluation
Tools used: #{data['tools_used']}
#{evalutions(data)}
Model | Grade | Details |