github.com/gohugoio/hugo.git
author     Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>    2018-12-17 22:54:06 +0300
committer  Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>    2018-12-20 22:08:01 +0300
commit     a8853f1c5ace30ae8d256ad374bdb280c95d4228 (patch)
tree       db4bdd65b5ae9bfe7894a4a9bdf687a5d0063381 /parser
parent     4d93aca27dfdebc9e06948ccf37a7922dac09d65 (diff)
parser/pageparser: Split the page lexer into some more files
See #5534
Diffstat (limited to 'parser')
-rw-r--r--  parser/pageparser/pagelexer.go             489
-rw-r--r--  parser/pageparser/pagelexer_intro.go       202
-rw-r--r--  parser/pageparser/pagelexer_shortcode.go   322
3 files changed, 524 insertions, 489 deletions
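
For context: the shortcode lexer now isolated in pagelexer_shortcode.go recognizes two delimiter pairs, {{< ... >}} (no markup) and {{% ... %}} (with markup). Below is a minimal standalone sketch of the prefix check performed by (*pageLexer).isShortCodeStart; the delimiters are copied from the diff, while the helper name isShortcodeStart and the main function are illustrative only, not part of the pageparser package.

    package main

    import (
    	"bytes"
    	"fmt"
    )

    // Shortcode delimiters as declared in pagelexer_shortcode.go.
    var (
    	leftDelimScNoMarkup   = []byte("{{<")
    	leftDelimScWithMarkup = []byte("{{%")
    )

    // isShortcodeStart reports whether input begins with either shortcode
    // left delimiter, mirroring the prefix check in (*pageLexer).isShortCodeStart.
    // (Standalone illustrative helper, not part of the pageparser package.)
    func isShortcodeStart(input []byte) bool {
    	return bytes.HasPrefix(input, leftDelimScWithMarkup) ||
    		bytes.HasPrefix(input, leftDelimScNoMarkup)
    }

    func main() {
    	fmt.Println(isShortcodeStart([]byte(`{{< figure src="a.jpg" >}}`))) // true
    	fmt.Println(isShortcodeStart([]byte(`{{% note %}}`)))               // true
    	fmt.Println(isShortcodeStart([]byte("plain text")))                 // false
    }
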
diff --git a/parser/pageparser/pagelexer.go b/parser/pageparser/pagelexer.go
index 5802c318b..d11e88403 100644
--- a/parser/pageparser/pagelexer.go
+++ b/parser/pageparser/pagelexer.go
@@ -29,18 +29,6 @@ const eof = -1
// returns the next state in scanner.
type stateFunc func(*pageLexer) stateFunc
-type lexerShortcodeState struct {
- currLeftDelimItem ItemType
- currRightDelimItem ItemType
- isInline bool
- currShortcodeName string // is only set when a shortcode is in opened state
- closingState int // > 0 = on its way to be closed
- elementStepNum int // step number in element
- paramElements int // number of elements (name + value = 2) found first
- openShortcodes map[string]bool // set of shortcodes in open state
-
-}
-
type pageLexer struct {
input []byte
stateStart stateFunc
@@ -102,17 +90,6 @@ func (l *pageLexer) run() *pageLexer {
return l
}
-// Shortcode syntax
-var (
- leftDelimSc = []byte("{{")
- leftDelimScNoMarkup = []byte("{{<")
- rightDelimScNoMarkup = []byte(">}}")
- leftDelimScWithMarkup = []byte("{{%")
- rightDelimScWithMarkup = []byte("%}}")
- leftComment = []byte("/*") // comments in this context are used to mark shortcodes as "not really a shortcode"
- rightComment = []byte("*/")
-)
-
// Page syntax
var (
byteOrderMark = '\ufeff'
@@ -293,11 +270,6 @@ func lexMainSection(l *pageLexer) stateFunc {
}
-func (l *pageLexer) isShortCodeStart() bool {
- return l.hasPrefix(leftDelimScWithMarkup) || l.hasPrefix(leftDelimScNoMarkup)
-
-}
-
func (l *pageLexer) posFirstNonWhiteSpace() int {
f := func(c rune) bool {
return !unicode.IsSpace(c)
@@ -305,69 +277,6 @@ func (l *pageLexer) posFirstNonWhiteSpace() int {
return bytes.IndexFunc(l.input[l.pos:], f)
}
-func lexIntroSection(l *pageLexer) stateFunc {
- l.summaryDivider = summaryDivider
-
-LOOP:
- for {
- r := l.next()
- if r == eof {
- break
- }
-
- switch {
- case r == '+':
- return l.lexFrontMatterSection(TypeFrontMatterTOML, r, "TOML", delimTOML)
- case r == '-':
- return l.lexFrontMatterSection(TypeFrontMatterYAML, r, "YAML", delimYAML)
- case r == '{':
- return lexFrontMatterJSON
- case r == '#':
- return lexFrontMatterOrgMode
- case r == byteOrderMark:
- l.emit(TypeIgnore)
- case !isSpace(r) && !isEndOfLine(r):
- if r == '<' {
- l.backup()
- if l.hasPrefix(htmlCommentStart) {
- // This may be commented out front matter, which should
- // still be read.
- l.consumeToNextLine()
- l.isInHTMLComment = true
- l.emit(TypeIgnore)
- continue LOOP
- } else {
- if l.pos > l.start {
- l.emit(tText)
- }
- l.next()
- // This is the start of a plain HTML document with no
- // front matter. It can still contain shortcodes, so we
- // have to keep looking.
- l.emit(TypeHTMLStart)
- }
- }
- break LOOP
- }
- }
-
- // Now move on to the shortcodes.
- return lexMainSection
-}
-
-func lexEndFromtMatterHTMLComment(l *pageLexer) stateFunc {
- l.isInHTMLComment = false
- right := l.index(htmlCommentEnd)
- if right == -1 {
- return l.errorf("starting HTML comment with no end")
- }
- l.pos += right + len(htmlCommentEnd)
- l.emit(TypeIgnore)
-
- // Now move on to the shortcodes.
- return lexMainSection
-}
-
func lexDone(l *pageLexer) stateFunc {
// Done!
@@ -378,385 +287,10 @@ func lexDone(l *pageLexer) stateFunc {
return nil
}
-func lexFrontMatterJSON(l *pageLexer) stateFunc {
- // Include the left delimiter
- l.backup()
-
- var (
- inQuote bool
- level int
- )
-
- for {
-
- r := l.next()
-
- switch {
- case r == eof:
- return l.errorf("unexpected EOF parsing JSON front matter")
- case r == '{':
- if !inQuote {
- level++
- }
- case r == '}':
- if !inQuote {
- level--
- }
- case r == '"':
- inQuote = !inQuote
- case r == '\\':
- // This may be an escaped quote. Make sure it's not marked as a
- // real one.
- l.next()
- }
-
- if level == 0 {
- break
- }
- }
-
- l.consumeCRLF()
- l.emit(TypeFrontMatterJSON)
-
- return lexMainSection
-}
-
-func lexFrontMatterOrgMode(l *pageLexer) stateFunc {
- /*
- #+TITLE: Test File For chaseadamsio/goorgeous
- #+AUTHOR: Chase Adams
- #+DESCRIPTION: Just another golang parser for org content!
- */
-
- l.summaryDivider = summaryDividerOrg
-
- l.backup()
-
- if !l.hasPrefix(delimOrg) {
- return lexMainSection
- }
-
- // Read lines until we no longer see a #+ prefix
-LOOP:
- for {
-
- r := l.next()
-
- switch {
- case r == '\n':
- if !l.hasPrefix(delimOrg) {
- break LOOP
- }
- case r == eof:
- break LOOP
-
- }
- }
-
- l.emit(TypeFrontMatterORG)
-
- return lexMainSection
-
-}
-
func (l *pageLexer) printCurrentInput() {
fmt.Printf("input[%d:]: %q", l.pos, string(l.input[l.pos:]))
}
-// Handle YAML or TOML front matter.
-func (l *pageLexer) lexFrontMatterSection(tp ItemType, delimr rune, name string, delim []byte) stateFunc {
-
- for i := 0; i < 2; i++ {
- if r := l.next(); r != delimr {
- return l.errorf("invalid %s delimiter", name)
- }
- }
-
- // Let front matter start at line 1
- wasEndOfLine := l.consumeCRLF()
- // We don't care about the delimiters.
- l.ignore()
-
- var r rune
-
- for {
- if !wasEndOfLine {
- r = l.next()
- if r == eof {
- return l.errorf("EOF looking for end %s front matter delimiter", name)
- }
- }
-
- if wasEndOfLine || isEndOfLine(r) {
- if l.hasPrefix(delim) {
- l.emit(tp)
- l.pos += 3
- l.consumeCRLF()
- l.ignore()
- break
- }
- }
-
- wasEndOfLine = false
- }
-
- return lexMainSection
-}
-
-func lexShortcodeLeftDelim(l *pageLexer) stateFunc {
- l.pos += len(l.currentLeftShortcodeDelim())
- if l.hasPrefix(leftComment) {
- return lexShortcodeComment
- }
- l.emit(l.currentLeftShortcodeDelimItem())
- l.elementStepNum = 0
- l.paramElements = 0
- return lexInsideShortcode
-}
-
-func lexShortcodeComment(l *pageLexer) stateFunc {
- posRightComment := l.index(append(rightComment, l.currentRightShortcodeDelim()...))
- if posRightComment <= 1 {
- return l.errorf("comment must be closed")
- }
- // we emit all as text, except the comment markers
- l.emit(tText)
- l.pos += len(leftComment)
- l.ignore()
- l.pos += posRightComment - len(leftComment)
- l.emit(tText)
- l.pos += len(rightComment)
- l.ignore()
- l.pos += len(l.currentRightShortcodeDelim())
- l.emit(tText)
- return lexMainSection
-}
-
-func lexShortcodeRightDelim(l *pageLexer) stateFunc {
- l.closingState = 0
- l.pos += len(l.currentRightShortcodeDelim())
- l.emit(l.currentRightShortcodeDelimItem())
- return lexMainSection
-}
-
-// either:
-// 1. param
-// 2. "param" or "param\"
-// 3. param="123" or param="123\"
-// 4. param="Some \"escaped\" text"
-func lexShortcodeParam(l *pageLexer, escapedQuoteStart bool) stateFunc {
-
- first := true
- nextEq := false
-
- var r rune
-
- for {
- r = l.next()
- if first {
- if r == '"' {
- // a positional param with quotes
- if l.paramElements == 2 {
- return l.errorf("got quoted positional parameter. Cannot mix named and positional parameters")
- }
- l.paramElements = 1
- l.backup()
- return lexShortcodeQuotedParamVal(l, !escapedQuoteStart, tScParam)
- }
- first = false
- } else if r == '=' {
- // a named param
- l.backup()
- nextEq = true
- break
- }
-
- if !isAlphaNumericOrHyphen(r) {
- l.backup()
- break
- }
- }
-
- if l.paramElements == 0 {
- l.paramElements++
-
- if nextEq {
- l.paramElements++
- }
- } else {
- if nextEq && l.paramElements == 1 {
- return l.errorf("got named parameter '%s'. Cannot mix named and positional parameters", l.current())
- } else if !nextEq && l.paramElements == 2 {
- return l.errorf("got positional parameter '%s'. Cannot mix named and positional parameters", l.current())
- }
- }
-
- l.emit(tScParam)
- return lexInsideShortcode
-
-}
-
-func lexShortcodeQuotedParamVal(l *pageLexer, escapedQuotedValuesAllowed bool, typ ItemType) stateFunc {
- openQuoteFound := false
- escapedInnerQuoteFound := false
- escapedQuoteState := 0
-
-Loop:
- for {
- switch r := l.next(); {
- case r == '\\':
- if l.peek() == '"' {
- if openQuoteFound && !escapedQuotedValuesAllowed {
- l.backup()
- break Loop
- } else if openQuoteFound {
- // the coming quote is inside
- escapedInnerQuoteFound = true
- escapedQuoteState = 1
- }
- }
- case r == eof, r == '\n':
- return l.errorf("unterminated quoted string in shortcode parameter-argument: '%s'", l.current())
- case r == '"':
- if escapedQuoteState == 0 {
- if openQuoteFound {
- l.backup()
- break Loop
-
- } else {
- openQuoteFound = true
- l.ignore()
- }
- } else {
- escapedQuoteState = 0
- }
-
- }
- }
-
- if escapedInnerQuoteFound {
- l.ignoreEscapesAndEmit(typ)
- } else {
- l.emit(typ)
- }
-
- r := l.next()
-
- if r == '\\' {
- if l.peek() == '"' {
- // ignore the escaped closing quote
- l.ignore()
- l.next()
- l.ignore()
- }
- } else if r == '"' {
- // ignore closing quote
- l.ignore()
- } else {
- // handled by next state
- l.backup()
- }
-
- return lexInsideShortcode
-}
-
-// Inline shortcodes have the form {{< myshortcode.inline >}}
-var inlineIdentifier = []byte("inline ")
-
-// scans an alphanumeric inside shortcode
-func lexIdentifierInShortcode(l *pageLexer) stateFunc {
- lookForEnd := false
-Loop:
- for {
- switch r := l.next(); {
- case isAlphaNumericOrHyphen(r):
- // Allow forward slash inside names to make it possible to create namespaces.
- case r == '/':
- case r == '.':
- l.isInline = l.hasPrefix(inlineIdentifier)
- if !l.isInline {
- return l.errorf("period in shortcode name only allowed for inline identifiers")
- }
- default:
- l.backup()
- word := string(l.input[l.start:l.pos])
- if l.closingState > 0 && !l.openShortcodes[word] {
- return l.errorf("closing tag for shortcode '%s' does not match start tag", word)
- } else if l.closingState > 0 {
- l.openShortcodes[word] = false
- lookForEnd = true
- }
-
- l.closingState = 0
- l.currShortcodeName = word
- l.openShortcodes[word] = true
- l.elementStepNum++
- if l.isInline {
- l.emit(tScNameInline)
- } else {
- l.emit(tScName)
- }
- break Loop
- }
- }
-
- if lookForEnd {
- return lexEndOfShortcode
- }
- return lexInsideShortcode
-}
-
-func lexEndOfShortcode(l *pageLexer) stateFunc {
- l.isInline = false
- if l.hasPrefix(l.currentRightShortcodeDelim()) {
- return lexShortcodeRightDelim
- }
- switch r := l.next(); {
- case isSpace(r):
- l.ignore()
- default:
- return l.errorf("unclosed shortcode")
- }
- return lexEndOfShortcode
-}
-
-// scans the elements inside shortcode tags
-func lexInsideShortcode(l *pageLexer) stateFunc {
- if l.hasPrefix(l.currentRightShortcodeDelim()) {
- return lexShortcodeRightDelim
- }
- switch r := l.next(); {
- case r == eof:
- // eol is allowed inside shortcodes; this may go to end of document before it fails
- return l.errorf("unclosed shortcode action")
- case isSpace(r), isEndOfLine(r):
- l.ignore()
- case r == '=':
- l.ignore()
- return lexShortcodeQuotedParamVal(l, l.peek() != '\\', tScParamVal)
- case r == '/':
- if l.currShortcodeName == "" {
- return l.errorf("got closing shortcode, but none is open")
- }
- l.closingState++
- l.emit(tScClose)
- case r == '\\':
- l.ignore()
- if l.peek() == '"' {
- return lexShortcodeParam(l, true)
- }
- case l.elementStepNum > 0 && (isAlphaNumericOrHyphen(r) || r == '"'): // positional params can have quotes
- l.backup()
- return lexShortcodeParam(l, false)
- case isAlphaNumeric(r):
- l.backup()
- return lexIdentifierInShortcode
- default:
- return l.errorf("unrecognized character in shortcode action: %#U. Note: Parameters with non-alphanumeric args must be quoted", r)
- }
- return lexInsideShortcode
-}
-
// state helpers
func (l *pageLexer) index(sep []byte) int {
@@ -767,29 +301,6 @@ func (l *pageLexer) hasPrefix(prefix []byte) bool {
return bytes.HasPrefix(l.input[l.pos:], prefix)
}
-func (l *pageLexer) currentLeftShortcodeDelimItem() ItemType {
- return l.currLeftDelimItem
-}
-
-func (l *pageLexer) currentRightShortcodeDelimItem() ItemType {
- return l.currRightDelimItem
-}
-
-func (l *pageLexer) currentLeftShortcodeDelim() []byte {
- if l.currLeftDelimItem == tLeftDelimScWithMarkup {
- return leftDelimScWithMarkup
- }
- return leftDelimScNoMarkup
-
-}
-
-func (l *pageLexer) currentRightShortcodeDelim() []byte {
- if l.currRightDelimItem == tRightDelimScWithMarkup {
- return rightDelimScWithMarkup
- }
- return rightDelimScNoMarkup
-}
-
// helper functions
// returns the min index >= 0
diff --git a/parser/pageparser/pagelexer_intro.go b/parser/pageparser/pagelexer_intro.go
new file mode 100644
index 000000000..56dd4224d
--- /dev/null
+++ b/parser/pageparser/pagelexer_intro.go
@@ -0,0 +1,202 @@
+// Copyright 2018 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pageparser provides a parser for Hugo content files (Markdown, HTML etc.) in Hugo.
+// This implementation is highly inspired by the great talk given by Rob Pike called "Lexical Scanning in Go"
+// It's on YouTube, Google it!
+// See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
+package pageparser
+
+func lexIntroSection(l *pageLexer) stateFunc {
+ l.summaryDivider = summaryDivider
+
+LOOP:
+ for {
+ r := l.next()
+ if r == eof {
+ break
+ }
+
+ switch {
+ case r == '+':
+ return l.lexFrontMatterSection(TypeFrontMatterTOML, r, "TOML", delimTOML)
+ case r == '-':
+ return l.lexFrontMatterSection(TypeFrontMatterYAML, r, "YAML", delimYAML)
+ case r == '{':
+ return lexFrontMatterJSON
+ case r == '#':
+ return lexFrontMatterOrgMode
+ case r == byteOrderMark:
+ l.emit(TypeIgnore)
+ case !isSpace(r) && !isEndOfLine(r):
+ if r == '<' {
+ l.backup()
+ if l.hasPrefix(htmlCommentStart) {
+ // This may be commented out front matter, which should
+ // still be read.
+ l.consumeToNextLine()
+ l.isInHTMLComment = true
+ l.emit(TypeIgnore)
+ continue LOOP
+ } else {
+ if l.pos > l.start {
+ l.emit(tText)
+ }
+ l.next()
+ // This is the start of a plain HTML document with no
+ // front matter. It can still contain shortcodes, so we
+ // have to keep looking.
+ l.emit(TypeHTMLStart)
+ }
+ }
+ break LOOP
+ }
+ }
+
+ // Now move on to the shortcodes.
+ return lexMainSection
+}
+
+func lexEndFromtMatterHTMLComment(l *pageLexer) stateFunc {
+ l.isInHTMLComment = false
+ right := l.index(htmlCommentEnd)
+ if right == -1 {
+ return l.errorf("starting HTML comment with no end")
+ }
+ l.pos += right + len(htmlCommentEnd)
+ l.emit(TypeIgnore)
+
+ // Now move on to the shortcodes.
+ return lexMainSection
+}
+
+func lexFrontMatterJSON(l *pageLexer) stateFunc {
+ // Include the left delimiter
+ l.backup()
+
+ var (
+ inQuote bool
+ level int
+ )
+
+ for {
+
+ r := l.next()
+
+ switch {
+ case r == eof:
+ return l.errorf("unexpected EOF parsing JSON front matter")
+ case r == '{':
+ if !inQuote {
+ level++
+ }
+ case r == '}':
+ if !inQuote {
+ level--
+ }
+ case r == '"':
+ inQuote = !inQuote
+ case r == '\\':
+ // This may be an escaped quote. Make sure it's not marked as a
+ // real one.
+ l.next()
+ }
+
+ if level == 0 {
+ break
+ }
+ }
+
+ l.consumeCRLF()
+ l.emit(TypeFrontMatterJSON)
+
+ return lexMainSection
+}
+
+func lexFrontMatterOrgMode(l *pageLexer) stateFunc {
+ /*
+ #+TITLE: Test File For chaseadamsio/goorgeous
+ #+AUTHOR: Chase Adams
+ #+DESCRIPTION: Just another golang parser for org content!
+ */
+
+ l.summaryDivider = summaryDividerOrg
+
+ l.backup()
+
+ if !l.hasPrefix(delimOrg) {
+ return lexMainSection
+ }
+
+ // Read lines until we no longer see a #+ prefix
+LOOP:
+ for {
+
+ r := l.next()
+
+ switch {
+ case r == '\n':
+ if !l.hasPrefix(delimOrg) {
+ break LOOP
+ }
+ case r == eof:
+ break LOOP
+
+ }
+ }
+
+ l.emit(TypeFrontMatterORG)
+
+ return lexMainSection
+
+}
+
+// Handle YAML or TOML front matter.
+func (l *pageLexer) lexFrontMatterSection(tp ItemType, delimr rune, name string, delim []byte) stateFunc {
+
+ for i := 0; i < 2; i++ {
+ if r := l.next(); r != delimr {
+ return l.errorf("invalid %s delimiter", name)
+ }
+ }
+
+ // Let front matter start at line 1
+ wasEndOfLine := l.consumeCRLF()
+ // We don't care about the delimiters.
+ l.ignore()
+
+ var r rune
+
+ for {
+ if !wasEndOfLine {
+ r = l.next()
+ if r == eof {
+ return l.errorf("EOF looking for end %s front matter delimiter", name)
+ }
+ }
+
+ if wasEndOfLine || isEndOfLine(r) {
+ if l.hasPrefix(delim) {
+ l.emit(tp)
+ l.pos += 3
+ l.consumeCRLF()
+ l.ignore()
+ break
+ }
+ }
+
+ wasEndOfLine = false
+ }
+
+ return lexMainSection
+}
diff --git a/parser/pageparser/pagelexer_shortcode.go b/parser/pageparser/pagelexer_shortcode.go
new file mode 100644
index 000000000..fe182459a
--- /dev/null
+++ b/parser/pageparser/pagelexer_shortcode.go
@@ -0,0 +1,322 @@
+// Copyright 2018 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pageparser provides a parser for Hugo content files (Markdown, HTML etc.) in Hugo.
+// This implementation is highly inspired by the great talk given by Rob Pike called "Lexical Scanning in Go"
+// It's on YouTube, Google it!
+// See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
+package pageparser
+
+type lexerShortcodeState struct {
+ currLeftDelimItem ItemType
+ currRightDelimItem ItemType
+ isInline bool
+ currShortcodeName string // is only set when a shortcode is in opened state
+ closingState int // > 0 = on its way to be closed
+ elementStepNum int // step number in element
+ paramElements int // number of elements (name + value = 2) found first
+ openShortcodes map[string]bool // set of shortcodes in open state
+
+}
+
+// Shortcode syntax
+var (
+ leftDelimSc = []byte("{{")
+ leftDelimScNoMarkup = []byte("{{<")
+ rightDelimScNoMarkup = []byte(">}}")
+ leftDelimScWithMarkup = []byte("{{%")
+ rightDelimScWithMarkup = []byte("%}}")
+ leftComment = []byte("/*") // comments in this context are used to mark shortcodes as "not really a shortcode"
+ rightComment = []byte("*/")
+)
+
+func (l *pageLexer) isShortCodeStart() bool {
+ return l.hasPrefix(leftDelimScWithMarkup) || l.hasPrefix(leftDelimScNoMarkup)
+}
+
+func lexShortcodeLeftDelim(l *pageLexer) stateFunc {
+ l.pos += len(l.currentLeftShortcodeDelim())
+ if l.hasPrefix(leftComment) {
+ return lexShortcodeComment
+ }
+ l.emit(l.currentLeftShortcodeDelimItem())
+ l.elementStepNum = 0
+ l.paramElements = 0
+ return lexInsideShortcode
+}
+
+func lexShortcodeComment(l *pageLexer) stateFunc {
+ posRightComment := l.index(append(rightComment, l.currentRightShortcodeDelim()...))
+ if posRightComment <= 1 {
+ return l.errorf("comment must be closed")
+ }
+ // we emit all as text, except the comment markers
+ l.emit(tText)
+ l.pos += len(leftComment)
+ l.ignore()
+ l.pos += posRightComment - len(leftComment)
+ l.emit(tText)
+ l.pos += len(rightComment)
+ l.ignore()
+ l.pos += len(l.currentRightShortcodeDelim())
+ l.emit(tText)
+ return lexMainSection
+}
+
+func lexShortcodeRightDelim(l *pageLexer) stateFunc {
+ l.closingState = 0
+ l.pos += len(l.currentRightShortcodeDelim())
+ l.emit(l.currentRightShortcodeDelimItem())
+ return lexMainSection
+}
+
+// either:
+// 1. param
+// 2. "param" or "param\"
+// 3. param="123" or param="123\"
+// 4. param="Some \"escaped\" text"
+func lexShortcodeParam(l *pageLexer, escapedQuoteStart bool) stateFunc {
+
+ first := true
+ nextEq := false
+
+ var r rune
+
+ for {
+ r = l.next()
+ if first {
+ if r == '"' {
+ // a positional param with quotes
+ if l.paramElements == 2 {
+ return l.errorf("got quoted positional parameter. Cannot mix named and positional parameters")
+ }
+ l.paramElements = 1
+ l.backup()
+ return lexShortcodeQuotedParamVal(l, !escapedQuoteStart, tScParam)
+ }
+ first = false
+ } else if r == '=' {
+ // a named param
+ l.backup()
+ nextEq = true
+ break
+ }
+
+ if !isAlphaNumericOrHyphen(r) {
+ l.backup()
+ break
+ }
+ }
+
+ if l.paramElements == 0 {
+ l.paramElements++
+
+ if nextEq {
+ l.paramElements++
+ }
+ } else {
+ if nextEq && l.paramElements == 1 {
+ return l.errorf("got named parameter '%s'. Cannot mix named and positional parameters", l.current())
+ } else if !nextEq && l.paramElements == 2 {
+ return l.errorf("got positional parameter '%s'. Cannot mix named and positional parameters", l.current())
+ }
+ }
+
+ l.emit(tScParam)
+ return lexInsideShortcode
+
+}
+
+func lexShortcodeQuotedParamVal(l *pageLexer, escapedQuotedValuesAllowed bool, typ ItemType) stateFunc {
+ openQuoteFound := false
+ escapedInnerQuoteFound := false
+ escapedQuoteState := 0
+
+Loop:
+ for {
+ switch r := l.next(); {
+ case r == '\\':
+ if l.peek() == '"' {
+ if openQuoteFound && !escapedQuotedValuesAllowed {
+ l.backup()
+ break Loop
+ } else if openQuoteFound {
+ // the coming quote is inside
+ escapedInnerQuoteFound = true
+ escapedQuoteState = 1
+ }
+ }
+ case r == eof, r == '\n':
+ return l.errorf("unterminated quoted string in shortcode parameter-argument: '%s'", l.current())
+ case r == '"':
+ if escapedQuoteState == 0 {
+ if openQuoteFound {
+ l.backup()
+ break Loop
+
+ } else {
+ openQuoteFound = true
+ l.ignore()
+ }
+ } else {
+ escapedQuoteState = 0
+ }
+
+ }
+ }
+
+ if escapedInnerQuoteFound {
+ l.ignoreEscapesAndEmit(typ)
+ } else {
+ l.emit(typ)
+ }
+
+ r := l.next()
+
+ if r == '\\' {
+ if l.peek() == '"' {
+ // ignore the escaped closing quote
+ l.ignore()
+ l.next()
+ l.ignore()
+ }
+ } else if r == '"' {
+ // ignore closing quote
+ l.ignore()
+ } else {
+ // handled by next state
+ l.backup()
+ }
+
+ return lexInsideShortcode
+}
+
+// Inline shortcodes have the form {{< myshortcode.inline >}}
+var inlineIdentifier = []byte("inline ")
+
+// scans an alphanumeric inside shortcode
+func lexIdentifierInShortcode(l *pageLexer) stateFunc {
+ lookForEnd := false
+Loop:
+ for {
+ switch r := l.next(); {
+ case isAlphaNumericOrHyphen(r):
+ // Allow forward slash inside names to make it possible to create namespaces.
+ case r == '/':
+ case r == '.':
+ l.isInline = l.hasPrefix(inlineIdentifier)
+ if !l.isInline {
+ return l.errorf("period in shortcode name only allowed for inline identifiers")
+ }
+ default:
+ l.backup()
+ word := string(l.input[l.start:l.pos])
+ if l.closingState > 0 && !l.openShortcodes[word] {
+ return l.errorf("closing tag for shortcode '%s' does not match start tag", word)
+ } else if l.closingState > 0 {
+ l.openShortcodes[word] = false
+ lookForEnd = true
+ }
+
+ l.closingState = 0
+ l.currShortcodeName = word
+ l.openShortcodes[word] = true
+ l.elementStepNum++
+ if l.isInline {
+ l.emit(tScNameInline)
+ } else {
+ l.emit(tScName)
+ }
+ break Loop
+ }
+ }
+
+ if lookForEnd {
+ return lexEndOfShortcode
+ }
+ return lexInsideShortcode
+}
+
+func lexEndOfShortcode(l *pageLexer) stateFunc {
+ l.isInline = false
+ if l.hasPrefix(l.currentRightShortcodeDelim()) {
+ return lexShortcodeRightDelim
+ }
+ switch r := l.next(); {
+ case isSpace(r):
+ l.ignore()
+ default:
+ return l.errorf("unclosed shortcode")
+ }
+ return lexEndOfShortcode
+}
+
+// scans the elements inside shortcode tags
+func lexInsideShortcode(l *pageLexer) stateFunc {
+ if l.hasPrefix(l.currentRightShortcodeDelim()) {
+ return lexShortcodeRightDelim
+ }
+ switch r := l.next(); {
+ case r == eof:
+ // eol is allowed inside shortcodes; this may go to end of document before it fails
+ return l.errorf("unclosed shortcode action")
+ case isSpace(r), isEndOfLine(r):
+ l.ignore()
+ case r == '=':
+ l.ignore()
+ return lexShortcodeQuotedParamVal(l, l.peek() != '\\', tScParamVal)
+ case r == '/':
+ if l.currShortcodeName == "" {
+ return l.errorf("got closing shortcode, but none is open")
+ }
+ l.closingState++
+ l.emit(tScClose)
+ case r == '\\':
+ l.ignore()
+ if l.peek() == '"' {
+ return lexShortcodeParam(l, true)
+ }
+ case l.elementStepNum > 0 && (isAlphaNumericOrHyphen(r) || r == '"'): // positional params can have quotes
+ l.backup()
+ return lexShortcodeParam(l, false)
+ case isAlphaNumeric(r):
+ l.backup()
+ return lexIdentifierInShortcode
+ default:
+ return l.errorf("unrecognized character in shortcode action: %#U. Note: Parameters with non-alphanumeric args must be quoted", r)
+ }
+ return lexInsideShortcode
+}
+
+func (l *pageLexer) currentLeftShortcodeDelimItem() ItemType {
+ return l.currLeftDelimItem
+}
+
+func (l *pageLexer) currentRightShortcodeDelimItem() ItemType {
+ return l.currRightDelimItem
+}
+
+func (l *pageLexer) currentLeftShortcodeDelim() []byte {
+ if l.currLeftDelimItem == tLeftDelimScWithMarkup {
+ return leftDelimScWithMarkup
+ }
+ return leftDelimScNoMarkup
+
+}
+
+func (l *pageLexer) currentRightShortcodeDelim() []byte {
+ if l.currRightDelimItem == tRightDelimScWithMarkup {
+ return rightDelimScWithMarkup
+ }
+ return rightDelimScNoMarkup
+}