Add latest changes from gitlab-org/gitlab@master

author: GitLab Bot <gitlab-bot@gitlab.com> 2023-11-03 00:10:29 +0300
committer: GitLab Bot <gitlab-bot@gitlab.com> 2023-11-03 00:10:29 +0300
commit: fecb8ece925c48cf64969f1ecced12e4c4497706 (patch)
tree: a25112ce0985b27a390721109354fd89effa9b83 /scripts
parent: eed7260f13c0a3139876e3659603f3d803e8fcd7 (diff)
1 files changed, 266 insertions, 0 deletions
diff --git a/scripts/duo_chat/reporter.rb b/scripts/duo_chat/reporter.rb
new file mode 100755
index 00000000000..f13481cf41b
--- /dev/null
+++ b/scripts/duo_chat/reporter.rb
@@ -0,0 +1,266 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require 'gitlab'
+require 'json'
+require 'time'
+
+# Builds a Markdown report from the GitLab Duo Chat QA evaluation results
+# (`tmp/duo_chat/qa*.json` under `CI_PROJECT_DIR`), saves it as a job
+# artifact and posts it as a note on the merge request of the CI job.
+class Reporter
+  IDENTIFIABLE_NOTE_TAG = 'gitlab-org/ai-powered/ai-framework:duo-chat-qa-evaluation-'
+
+  # Reports longer than this are not posted as a note; a link to the saved
+  # artifact is posted instead.
+  MAX_NOTE_LENGTH = 1_000_000
+
+  GRADE_TO_EMOJI_MAPPING = {
+    correct: ":white_check_mark:",
+    incorrect: ":x:",
+    unexpected: ":warning:"
+  }.freeze
+
+  # Saves the report as an artifact, then updates the latest report note on
+  # the merge request or posts a new one.
+  def run
+    merge_request_iid = ENV['CI_MERGE_REQUEST_IID']
+    ci_project_id = ENV['CI_PROJECT_ID']
+
+    # Always save the complete report: when it is too long to be posted, the
+    # note only links to this artifact.
+    puts "Saving #{artifact_path}"
+    File.write(artifact_path, full_report)
+
+    # Look for an existing note
+    report_notes = com_gitlab_client
+      .merge_request_notes(ci_project_id, merge_request_iid)
+      .auto_paginate
+      .select { |candidate| candidate.body.include?(note_identifier_tag) }
+
+    note = report_notes.max_by { |candidate| Time.parse(candidate.created_at) }
+
+    if note && note.type != 'DiscussionNote'
+      # The latest note has not led to a discussion. Update it.
+      com_gitlab_client.edit_merge_request_note(ci_project_id, merge_request_iid, note.id, report_note)
+
+      puts "Updated comment."
+    else
+      # This is the first note or the latest note has been discussed on the MR.
+      # Don't update, create new note instead.
+      com_gitlab_client.create_merge_request_note(ci_project_id, merge_request_iid, report_note)
+
+      puts "Posted comment."
+    end
+  end
+
+  private
+
+  def report_filename
+    "#{ENV['DUO_RSPEC']}.md"
+  end
+
+  def artifact_path
+    File.join(ENV['CI_PROJECT_DIR'], report_filename)
+  end
+
+  # Marker embedded in every report note so that earlier notes can be found.
+  def note_identifier_tag
+    "#{IDENTIFIABLE_NOTE_TAG}#{ENV['DUO_RSPEC']}"
+  end
+
+  def com_gitlab_client
+    @com_gitlab_client ||= Gitlab.client(
+      endpoint: "https://gitlab.com/api/v4",
+      private_token: ENV['PROJECT_TOKEN_FOR_CI_SCRIPTS_API_USAGE']
+    )
+  end
+
+  # Body of the merge request note: the full report, or a short notice
+  # linking to the artifact when the report exceeds MAX_NOTE_LENGTH.
+  def report_note
+    return full_report if full_report.length <= MAX_NOTE_LENGTH
+
+    <<~MARKDOWN
+    <!-- #{note_identifier_tag} -->
+
+    ## GitLab Duo Chat QA evaluation
+
+    Report generated for "#{ENV['CI_JOB_NAME']}". This report is generated and refreshed automatically. Do not edit.
+
+    **:warning: the evaluation report is too long (> `#{MAX_NOTE_LENGTH}`) and cannot be posted as a note.**
+
+    Please check out the artifact for the CI job "#{ENV['CI_JOB_NAME']}":
+
+    https://gitlab.com/gitlab-org/gitlab/-/jobs/#{ENV['CI_JOB_ID']}/artifacts/file/#{report_filename}
+
+    MARKDOWN
+  end
+
+  # The complete Markdown report, built once and reused.
+  def full_report
+    @full_report ||= <<~MARKDOWN
+    <!-- #{note_identifier_tag} -->
+
+    ## GitLab Duo Chat QA evaluation
+
+    Report generated for "#{ENV['CI_JOB_NAME']}". This report is generated and refreshed automatically. Do not edit.
+
+    LLMs have been asked to evaluate GitLab Duo Chat's answers.
+
+    :white_check_mark: : LLM evaluated the answer as `CORRECT`.
+
+    :x: : LLM evaluated the answer as `INCORRECT`.
+
+    :warning: : LLM did not evaluate correctly or the evaluation request might have failed.
+
+    ### Summary
+
+    - The total number of evaluations: #{summary_numbers[:total]}
+
+    - The number of evaluations in which all LLMs graded `CORRECT`: #{summary_numbers[:correct]} (#{summary_numbers[:correct_ratio]}%)
+
+      - Note: if an evaluation request failed or its response was not parsable, it was ignored. For example, :white_check_mark: :warning: would count as `CORRECT`.
+
+    - The number of evaluations in which all LLMs graded `INCORRECT`: #{summary_numbers[:incorrect]} (#{summary_numbers[:incorrect_ratio]}%)
+
+      - Note: if an evaluation request failed or its response was not parsable, it was ignored. For example, :x: :warning: would count as `INCORRECT`.
+
+    - The number of evaluations in which LLMs disagreed:  #{summary_numbers[:disagreed]} (#{summary_numbers[:disagreed_ratio]}%)
+
+
+    ### Evaluations
+
+    #{eval_content}
+
+
+    MARKDOWN
+  end
+
+  def report_data
+    @report_data ||= Dir[File.join(ENV['CI_PROJECT_DIR'], "tmp/duo_chat/qa*.json")]
+      .map { |file| JSON.parse(File.read(file)) }
+  end
+
+  def eval_content
+    report_data
+      .sort_by { |a| a["question"] }
+      .map do |data|
+        <<~MARKDOWN
+        <details>
+
+        <summary>
+
+        #{correctness_indicator(data)}
+
+        `"#{data['question']}"`
+
+        (context: `#{data['resource']}`)
+
+        </summary>
+
+        #### Resource
+
+        `#{data['resource']}`
+
+        #### Answer
+
+        #{data['answer']}
+
+        #### LLM Evaluation
+
+        #{evaluations_table(data)}
+
+
+        </details>
+
+        MARKDOWN
+      end
+      .join
+  end
+
+  # Counts and percentages shown in the report summary. An evaluation counts
+  # as correct (or incorrect) only when at least one LLM gave that grade and
+  # none gave the opposite one, so an evaluation without any parsable grade
+  # is not counted as both.
+  def summary_numbers
+    @summary_numbers ||= begin
+      graded_evaluations = report_data.map { |data| grades_for(data) }
+
+      total = graded_evaluations.size
+      correct = graded_evaluations.count { |grades| grades.include?(:correct) && !grades.include?(:incorrect) }
+      incorrect = graded_evaluations.count { |grades| grades.include?(:incorrect) && !grades.include?(:correct) }
+      disagreed = graded_evaluations.count { |grades| grades.include?(:correct) && grades.include?(:incorrect) }
+
+      {
+        total: total,
+        correct: correct,
+        correct_ratio: percentage(correct, total),
+        incorrect: incorrect,
+        incorrect_ratio: percentage(incorrect, total),
+        disagreed: disagreed,
+        disagreed_ratio: percentage(disagreed, total)
+      }
+    end
+  end
+
+  # Share of `count` in `total` as a percentage rounded to one decimal;
+  # 0.0 when there is nothing to count, instead of dividing zero by zero.
+  def percentage(count, total)
+    return 0.0 if total.zero?
+
+    (count.to_f / total * 100).round(1)
+  end
+
+  def grades_for(data)
+    data["evaluations"].map { |evaluation| parse_grade(evaluation) }
+  end
+
+  def parse_grade(evaluation)
+    response = evaluation["response"].to_s
+
+    return :correct if response.match?(/Grade: CORRECT/i)
+    return :incorrect if response.match?(/Grade: INCORRECT/i)
+
+    # If the LLM's evaluation includes neither CORRECT nor INCORRECT, flag it.
+    :unexpected
+  end
+
+  def correctness_indicator(data)
+    grades_for(data).map { |grade| GRADE_TO_EMOJI_MAPPING[grade] }.join(' ')
+  end
+
+  # HTML table with one row per evaluating model.
+  def evaluations_table(data)
+    rows = data["evaluations"].map do |evaluation|
+      grade = parse_grade(evaluation)
+
+      <<~MARKDOWN
+      <tr>
+        <td>#{evaluation['model']}</td>
+        <td>
+          #{GRADE_TO_EMOJI_MAPPING[grade]}
+        </td>
+        <td>
+          #{evaluation['response']}
+        </td>
+      </tr>
+
+      MARKDOWN
+    end
+    .join
+
+    <<~MARKDOWN
+    <table>
+      <tr>
+        <td>Model</td>
+        <td>Grade</td>
+        <td>Details</td>
+      </tr>
+      #{rows}
+    </table>
+    MARKDOWN
+  end
+end
+
+Reporter.new.run if $PROGRAM_NAME == __FILE__
author	GitLab Bot <gitlab-bot@gitlab.com>	2023-11-03 00:10:29 +0300
committer	GitLab Bot <gitlab-bot@gitlab.com>	2023-11-03 00:10:29 +0300
commit	fecb8ece925c48cf64969f1ecced12e4c4497706 (patch)
tree	a25112ce0985b27a390721109354fd89effa9b83 /scripts
parent	eed7260f13c0a3139876e3659603f3d803e8fcd7 (diff)