Welcome to mirror list, hosted at ThFree Co, Russian Federation.

parse_commit.go « catfile « git « internal - gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 1b22d9d4a312c749cdc8d29b09156637ca3a51c4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
package catfile

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"strings"

	"gitlab.com/gitlab-org/gitaly/v16/internal/git"
	"gitlab.com/gitlab-org/gitaly/v16/internal/helper"
	"gitlab.com/gitlab-org/gitaly/v16/proto/go/gitalypb"
)

// Commit header keys which introduce a signature. ParseCommit treats both
// keys identically: the remainder of the header line starts a new signature
// and subsequent lines starting with a ' ' (space) continue it.
const (
	gpgSignaturePrefix       = "gpgsig"
	gpgSignaturePrefixSha256 = "gpgsig-sha256"
)

// parseCommitState is a type used to define the current state while parsing
// a commit message.
//
// To understand the state machine of parsing commit messages, we need
// to understand the different sections of a commit message.
//
// # Sections of a commit message
//
// Let's consider a sample commit message
//
//	tree 798e5474fafac9754ee6b82ab17af8d70df4fbd3
//	parent 86f06b3f55e6334abb99fc168e2dd925895c4e49
//	author John doe <bugfixer@email.com> 1699964265 +0100
//	committer John doe <bugfixer@email.com> 1699964265 +0100
//	gpgsig -----BEGIN PGP SIGNATURE-----
//
//	 iHUEABYKAB0WIQReCOKeBZren2AFN0T+9BKLUsDX/wUCZVNlaQAKCRD+9BKLUsDX
//	 /219AP9j8jfQuLieg0Fl8xrOS74eJguYqIsPYI6lPDUvM5XmgQEAkhDUoWFd0ypR
//	 vXTEU/0CxcaXmlco/ThX2rCYwEUT6wA=
//	 =Wt+j
//	 -----END PGP SIGNATURE-----
//
//	Commit subject
//
// With this, we can see that commit messages have
//   - The first section consisting of headers. This is everything before
//     the commit message. In our example this consists of the tree, parent,
//     author, committer and gpgsig.
//   - The headers can also contain the signature. This is either gpgsig
//     or gpgsig-sha256.
//   - After the first line, all lines of the signature start with a ' '
//     space character.
//   - Any headers after the signature are not parsed but will still be
//     considered as part of the signature payload.
//   - After the headers, there is a newline to differentiate the upcoming
//     commit body. The commit body consists of the commit subject and the
//     message.
//
// Using this information we can now write a parser which represents a state
// machine which can be used to parse commits.
type parseCommitState uint

// The states of the commit parser implemented by ParseCommit.
const (
	// parseCommitStateHeader is the initial state. Each line is split at
	// the first ' ' (space) into a header key and value.
	parseCommitStateHeader parseCommitState = iota
	// parseCommitStateSignature is entered once a gpgsig or gpgsig-sha256
	// header has been seen. Lines starting with a ' ' (space) are appended
	// to the current signature; a further signature header starts a new
	// signature.
	parseCommitStateSignature
	// parseCommitStateUnexpected is entered when a line following a
	// signature is neither a continuation line nor another signature
	// header. Lines are only appended to the payload until the empty line
	// separating the headers from the body is found.
	parseCommitStateUnexpected
	// parseCommitStateBody is entered, regardless of the previous state, as
	// soon as a line consisting only of a newline is read. The remainder of
	// the object is then read as the commit body.
	parseCommitStateBody
	// parseCommitStateEnd terminates the parsing loop. It is set after the
	// body has been read or once EOF has been received.
	parseCommitStateEnd
)

// SignatureData holds the raw data used to validate a signed commit.
type SignatureData struct {
	// Signatures refers to the signatures present in the commit. Note that
	// Git only considers the first signature when parsing commits. Each
	// signature has the leading ' ' (space) of its continuation lines and
	// its trailing newline removed.
	Signatures [][]byte
	// Payload refers to the commit data which is signed by the signature,
	// generally this is everything apart from the signature in the commit.
	// Lines present after the signature and before the commit body are not
	// parsed as headers, but they are still included in the payload. The
	// payload always contains the full commit body, even when the body
	// stored in the commit itself has been truncated.
	Payload []byte
}

// Commit wraps the gitalypb.GitCommit structure and includes signature information.
type Commit struct {
	// GitCommit is the parsed commit. It is embedded so that its fields can
	// be accessed directly on Commit.
	*gitalypb.GitCommit
	// SignatureData contains the signatures found in the commit and the
	// payload they were computed over.
	SignatureData SignatureData
}

// ParseCommit implements a state machine to parse the various sections
// of a commit. To understand the state machine, see the definition
// for parseCommitState above.
//
// The goal is to maintain feature parity with how git [1] (see
// parse_buffer_signed_by_header()) itself parses commits. This ensures
// that we throw errors only wherever git does.
//
// The object is read line by line until the empty line separating the headers
// from the body is found, after which the remaining bytes are read as the
// commit body. The returned Commit contains the parsed commit as well as the
// signatures and the signed payload.
//
// An error is returned when reading from the object fails or when the object
// contains more data than announced by its size.
//
// [1]: https://gitlab.com/gitlab-org/git/-/blob/master/commit.c
func (p *parser) ParseCommit(object git.Object) (*Commit, error) {
	commit := &gitalypb.GitCommit{Id: object.ObjectID().String()}
	var payload []byte
	// currentSignatureIndex is the index into signatures that continuation
	// lines get appended to.
	currentSignatureIndex := 0
	signatures := [][]byte{}

	// bytesRemaining tracks how many bytes of the object have not yet been
	// consumed by the line-wise reads below. Once the body is reached it is
	// used to size the buffer the body is read into.
	bytesRemaining := object.ObjectSize()
	p.bufferedReader.Reset(object)

	for state := parseCommitStateHeader; state != parseCommitStateEnd; {
		receivedEOF := false

		// On EOF the line read so far, which then has no trailing newline,
		// is still processed below before the loop is terminated at
		// loopEnd.
		line, err := p.bufferedReader.ReadString('\n')
		if errors.Is(err, io.EOF) {
			receivedEOF = true
		} else if err != nil {
			return nil, fmt.Errorf("parse raw commit: %w", err)
		}
		bytesRemaining -= int64(len(line))

		// If the line only consists of a newline, we can skip
		// the state to commit body. This applies regardless of which
		// state we are currently in.
		if line == "\n" {
			state = parseCommitStateBody
		}

		switch state {
		case parseCommitStateHeader:
			key, value, ok := strings.Cut(line, " ")
			if !ok {
				// TODO: Current tests allow empty commits, we might want
				// to change this behavior.
				//
				// Note that such a line is skipped entirely, it is not
				// added to the payload either.
				goto loopEnd
			}

			// For headers, we trim the newline to make it easier
			// to parse.
			value = strings.TrimSuffix(value, "\n")

			switch key {
			case "parent":
				commit.ParentIds = append(commit.ParentIds, value)
			case "author":
				commit.Author = parseCommitAuthor(value)
			case "committer":
				commit.Committer = parseCommitAuthor(value)
			case "tree":
				commit.TreeId = value
			case "encoding":
				commit.Encoding = value
			case gpgSignaturePrefix, gpgSignaturePrefixSha256:
				// Since Git only considers the first signature, we only
				// capture the first signature's type.
				commit.SignatureType = detectSignatureType(value)

				// The newline trimmed above is added back so that the
				// following continuation lines start on their own line.
				state = parseCommitStateSignature
				signatures = append(signatures, []byte(value+"\n"))

				// The signature itself must not become part of the
				// payload, so we skip the append below.
				goto loopEnd
			}

			// Headers with unknown keys are not parsed, but like all
			// other headers they are part of the payload.
			payload = append(payload, []byte(line)...)

		case parseCommitStateSignature:
			if after, ok := strings.CutPrefix(line, " "); ok {
				// All signature lines, must start with a ' ' (space).
				// The space is stripped, the trailing newline is kept.
				signatures[currentSignatureIndex] = append(signatures[currentSignatureIndex], []byte(after)...)
				goto loopEnd
			} else {
				// The current signature has ended. The index is only
				// used again if another signature gets appended below.
				currentSignatureIndex++

				// Multiple signatures might be present in the commit.
				if key, value, ok := strings.Cut(line, " "); ok {
					if key == gpgSignaturePrefix || key == gpgSignaturePrefixSha256 {
						signatures = append(signatures, []byte(value))
						goto loopEnd
					}
				}

				// If there is no ' ' (space), it means there is some unexpected
				// data.
				//
				// Note that we don't go back to parsing headers. This is because
				// any headers which are present after the signature are not parsed
				// by Git as information. But, they still constitute to the signature
				// payload. So any data after the signature and before the commit body
				// is considered unexpected.
				state = parseCommitStateUnexpected
			}

			// The line which ended the signature is handled as unexpected
			// data right away.
			fallthrough

		case parseCommitStateUnexpected:
			// If the line is only a newline, that means we have reached
			// the commit body. If not, we keep looping till we do.
			if line != "\n" {
				payload = append(payload, []byte(line)...)
				goto loopEnd
			}

			fallthrough

		case parseCommitStateBody:
			// The empty line separating the headers from the body is part
			// of the payload.
			payload = append(payload, []byte(line)...)

			// Everything which has not been consumed yet is the body.
			body := make([]byte, bytesRemaining)
			if _, err := io.ReadFull(p.bufferedReader, body); err != nil {
				return nil, fmt.Errorf("reading commit message: %w", err)
			}

			// After we have copied the body, we must make sure that there really is no
			// additional data. For one, this is to detect bugs in our implementation where we
			// would accidentally have truncated the commit message. On the other hand, we also
			// need to do this such that we observe the EOF, which we must observe in order to
			// unblock reading the next object.
			//
			// This all feels a bit complicated, where it would be much easier to just read into
			// a preallocated `bytes.Buffer`. But this complexity is indeed required to optimize
			// allocations. So if you want to change this, please make sure to execute the
			// `BenchmarkListAllCommits` benchmark.
			if n, err := io.Copy(io.Discard, p.bufferedReader); err != nil {
				return nil, fmt.Errorf("reading commit message: %w", err)
			} else if n != 0 {
				return nil, fmt.Errorf(
					"commit message exceeds expected length %v by %v bytes",
					object.ObjectSize(), n,
				)
			}

			if len(body) > 0 {
				// BodySize always reports the full size of the body, while
				// the body stored in the commit is truncated to the maximum
				// message size. The payload receives the untruncated body.
				commit.Subject = subjectFromBody(body)
				commit.BodySize = int64(len(body))
				commit.Body = body
				if max := helper.MaxCommitOrTagMessageSize; len(body) > max {
					commit.Body = commit.Body[:max]
				}
				payload = append(payload, body...)
			}

			state = parseCommitStateEnd
		}

	loopEnd:
		// Once EOF has been observed there is nothing left to read, so we
		// stop regardless of the state we are in.
		if receivedEOF {
			state = parseCommitStateEnd
		}
	}

	// Signatures are returned without their trailing newline.
	for i, signature := range signatures {
		signatures[i] = bytes.TrimSuffix(signature, []byte("\n"))
	}

	return &Commit{
		GitCommit:     commit,
		SignatureData: SignatureData{Signatures: signatures, Payload: payload},
	}, nil
}