parser/pageparser: Add front matter etc. support

See #5324
author: Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com> 2018-10-17 14:48:55 +0300
committer: Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com> 2018-10-22 20:57:43 +0300
commit: 2fdc4a24d5450a98cf38a4456e8e0e8e97a3343d (patch)
tree: 409814d04e5b6454abd56a230894bd0e78e3cfb5 /parser/pageparser
parent: f6863e1ef725f654a4c869ef4955f9add6908a46 (diff)
4 files changed, 344 insertions, 67 deletions
diff --git a/parser/pageparser/item.go b/parser/pageparser/item.go
index ae2f6cbc9..f7495c90e 100644
--- a/parser/pageparser/item.go
+++ b/parser/pageparser/item.go
@@ -73,10 +73,10 @@ func (i Item) String() string {
 		return i.Val
 	case i.typ > tKeywordMarker:
 		return fmt.Sprintf("<%s>", i.Val)
-	case len(i.Val) > 20:
-		return fmt.Sprintf("%.20q...", i.Val)
+	case len(i.Val) > 50:
+		return fmt.Sprintf("%v:%.20q...", i.typ, i.Val)
 	}
-	return fmt.Sprintf("[%s]", i.Val)
+	return fmt.Sprintf("%v:[%s]", i.typ, i.Val)
 }
 
 type itemType int
@@ -85,6 +85,15 @@ const (
 	tError itemType = iota
 	tEOF
 
+	// page items
+	tHTMLLead          // <
+	tSummaryDivider    // <!--more-->
+	tSummaryDividerOrg // # more
+	tFrontMatterYAML
+	tFrontMatterTOML
+	tFrontMatterJSON
+	tFrontMatterORG
+
 	// shortcode items
 	tLeftDelimScNoMarkup
 	tRightDelimScNoMarkup
@@ -95,8 +104,7 @@ const (
 	tScParam
 	tScParamVal
 
-	//itemIdentifier
-	tText // plain text, used for everything outside the shortcodes
+	tText // plain text
 
 	// preserved for later - keywords come after this
 	tKeywordMarker
diff --git a/parser/pageparser/pagelexer.go b/parser/pageparser/pagelexer.go
index 5267c5634..0c97becde 100644
--- a/parser/pageparser/pagelexer.go
+++ b/parser/pageparser/pagelexer.go
@@ -44,13 +44,15 @@ type lexerShortcodeState struct {
 }
 
 type pageLexer struct {
-	name    string
-	input   string
-	state   stateFunc
-	pos     pos // input position
-	start   pos // item start position
-	width   pos // width of last element
-	lastPos pos // position of the last item returned by nextItem
+	input      string
+	stateStart stateFunc
+	state      stateFunc
+	pos        pos // input position
+	start      pos // item start position
+	width      pos // width of last element
+	lastPos    pos // position of the last item returned by nextItem
+
+	contentSections int
 
 	lexerShortcodeState
 
@@ -63,18 +65,18 @@ func Parse(s string) *Tokens {
 }
 
 func ParseFrom(s string, from int) *Tokens {
-	lexer := newPageLexer("default", s, pos(from))
+	lexer := newPageLexer(s, pos(from), lexMainSection) // TODO(bep) 2errors
 	lexer.run()
 	return &Tokens{lexer: lexer}
 }
 
 // note: the input position here is normally 0 (start), but
 // can be set if position of first shortcode is known
-func newPageLexer(name, input string, inputPosition pos) *pageLexer {
+func newPageLexer(input string, inputPosition pos, stateStart stateFunc) *pageLexer {
 	lexer := &pageLexer{
-		name:  name,
-		input: input,
-		pos:   inputPosition,
+		input:      input,
+		pos:        inputPosition,
+		stateStart: stateStart,
 		lexerShortcodeState: lexerShortcodeState{
 			currLeftDelimItem:  tLeftDelimScNoMarkup,
 			currRightDelimItem: tRightDelimScNoMarkup,
@@ -88,14 +90,13 @@ func newPageLexer(name, input string, inputPosition pos) *pageLexer {
 
 // main loop
 func (l *pageLexer) run() *pageLexer {
-	for l.state = lexTextOutsideShortcodes; l.state != nil; {
+	for l.state = l.stateStart; l.state != nil; {
 		l.state = l.state(l)
 	}
 	return l
 }
 
-// state functions
-
+// Shortcode syntax
 const (
 	leftDelimScNoMarkup    = "{{<"
 	rightDelimScNoMarkup   = ">}}"
@@ -105,6 +106,12 @@ const (
 	rightComment           = "*/"
 )
 
+// Page syntax
+const (
+	summaryDivider    = "<!--more-->" // lexed into a tSummaryDivider item
+	summaryDividerOrg = "# more"      // lexed into a tSummaryDividerOrg item
+)
+
 func (l *pageLexer) next() rune {
 	if int(l.pos) >= len(l.input) {
 		l.width = 0
@@ -178,11 +185,21 @@ func (l *pageLexer) nextItem() Item {
 	return item
 }
 
-// scans until an opening shortcode opening bracket.
-// if no shortcodes, it will keep on scanning until EOF
-func lexTextOutsideShortcodes(l *pageLexer) stateFunc {
+func (l *pageLexer) consumeCRLF() bool { // skips an optional '\r', then an optional '\n'
+	skipped := false
+	for i := range crLf {
+		if l.next() == crLf[i] {
+			skipped = true
+			continue
+		}
+		l.backup() // not the rune we were looking for: undo the read
+	}
+	return skipped // true when at least one of the two runes was consumed
+}
+
+func lexMainSection(l *pageLexer) stateFunc {
 	for {
-		if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || strings.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup) {
+		if l.isShortCodeStart() {
 			if l.pos > l.start {
 				l.emit(tText)
 			}
@@ -194,12 +211,79 @@ func lexTextOutsideShortcodes(l *pageLexer) stateFunc {
 				l.currRightDelimItem = tRightDelimScNoMarkup
 			}
 			return lexShortcodeLeftDelim
+		}
 
+		if l.contentSections <= 1 {
+			if strings.HasPrefix(l.input[l.pos:], summaryDivider) {
+				if l.pos > l.start {
+					l.emit(tText)
+				}
+				l.contentSections++
+				l.pos += pos(len(summaryDivider))
+				l.emit(tSummaryDivider)
+			} else if strings.HasPrefix(l.input[l.pos:], summaryDividerOrg) {
+				if l.pos > l.start {
+					l.emit(tText)
+				}
+				l.contentSections++
+				l.pos += pos(len(summaryDividerOrg))
+				l.emit(tSummaryDividerOrg)
+			}
 		}
-		if l.next() == eof {
+
+		r := l.next()
+		if r == eof {
 			break
 		}
+
 	}
+
+	return lexDone
+
+}
+
+func (l *pageLexer) isShortCodeStart() bool { // does the unread input begin with a shortcode opener?
+	return strings.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup) || strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup)
+}
+
+// lexIntroSection looks at the first rune that is neither isSpace nor
+// isEndOfLine and dispatches on it: '+' TOML, '-' YAML, '{' JSON, '#' Org mode.
+func lexIntroSection(l *pageLexer) stateFunc {
+LOOP:
+	for {
+		r := l.next()
+		if r == eof {
+			break // input exhausted without reaching any of the cases below
+		}
+		switch {
+		case r == '+':
+			return l.lexFrontMatterSection(tFrontMatterTOML, r, "TOML", "+++")
+		case r == '-':
+			return l.lexFrontMatterSection(tFrontMatterYAML, r, "YAML", "---")
+		case r == '{':
+			return lexFrontMatterJSON
+		case r == '#':
+			return lexFrontMatterOrgMode
+		case !isSpace(r) && !isEndOfLine(r):
+			if r == '<' {
+				l.emit(tHTMLLead)
+				// No need to look further. Hugo treats this as plain HTML,
+				// no front matter, no shortcodes, no nothing.
+				l.pos = pos(len(l.input)) // jump to the end so the rest becomes one text item
+				l.emit(tText)
+				break LOOP
+			}
+			return l.errorf("failed to detect front matter type; got unknown identifier %q", r)
+		}
+	}
+
+	l.contentSections = 1
+	// Now move on to the shortcodes.
+	return lexMainSection
+}
+
+func lexDone(l *pageLexer) stateFunc {
+
 	// Done!
 	if l.pos > l.start {
 		l.emit(tText)
@@ -208,6 +292,122 @@ func lexTextOutsideShortcodes(l *pageLexer) stateFunc {
 	return nil
 }
 
+// lexFrontMatterJSON scans a JSON object used as front matter, starting at the
+// '{' that was just read, up to and including its matching '}' and one
+// optional trailing line break, and emits it all as a tFrontMatterJSON item.
+// Hitting EOF before the object is closed is reported through l.errorf.
+func lexFrontMatterJSON(l *pageLexer) stateFunc {
+	// Step back so that the opening brace becomes part of the item.
+	l.backup()
+
+	depth := 0        // braces currently open outside of strings
+	inString := false // toggled by every double quote that is not escaped
+
+	for {
+		switch r := l.next(); r {
+		case eof:
+			return l.errorf("unexpected EOF parsing JSON front matter")
+		case '{':
+			if !inString {
+				depth++
+			}
+		case '}':
+			if !inString {
+				depth--
+			}
+		case '"':
+			inString = !inString
+		case '\\':
+			// Skip whatever follows the backslash so that an escaped quote
+			// is not mistaken for the end of a string.
+			l.next()
+		}
+
+		// Every brace opened so far has been closed again: the object is complete.
+		if depth == 0 {
+			break
+		}
+	}
+
+	l.consumeCRLF()
+	l.emit(tFrontMatterJSON)
+
+	return lexMainSection
+}
+
+// lexFrontMatterOrgMode scans Org mode front matter: a run of consecutive
+// lines that each begin with "#+", for example
+//
+//	#+TITLE: Test File For chaseadamsio/goorgeous
+//	#+AUTHOR: Chase Adams
+//	#+DESCRIPTION: Just another golang parser for org content!
+//
+// The run is emitted as one tFrontMatterORG item. When the input at the
+// backed-up position does not start with "#+", nothing is emitted and lexing
+// simply continues in lexMainSection.
+func lexFrontMatterOrgMode(l *pageLexer) stateFunc {
+	const prefix = "#+"
+
+	// Step back over the rune read last so that the prefix test can see it.
+	l.backup()
+
+	if !strings.HasPrefix(l.input[l.pos:], prefix) {
+		// TODO(bep) consider error
+		return lexMainSection
+	}
+
+	// Keep reading until a line break is followed by something other than
+	// the prefix, or until the input runs out.
+	for {
+		r := l.next()
+		if r == eof {
+			break
+		}
+		if r == '\n' && !strings.HasPrefix(l.input[l.pos:], prefix) {
+			break
+		}
+	}
+
+	// The line break read last, if any, is kept as part of the item.
+	l.emit(tFrontMatterORG)
+
+	return lexMainSection
+}
+
+// lexFrontMatterSection handles YAML or TOML front matter once the first rune
+// (delimr) of the opening delim line is read. The text between the two delim
+// lines, possibly empty, is emitted as tp; name only appears in error messages.
+func (l *pageLexer) lexFrontMatterSection(tp itemType, delimr rune, name, delim string) stateFunc {
+	for i := 0; i < 2; i++ { // two more delimr runes are expected
+		if r := l.next(); r != delimr {
+			return l.errorf("invalid %s delimiter", name)
+		}
+	}
+	if !l.consumeCRLF() { // a line break must follow the delimiter directly
+		return l.errorf("invalid %s delimiter", name)
+	}
+
+	// We don't care about the delimiters.
+	l.ignore()
+
+	atLineStart := true // the closing delimiter is only recognised right after a line break
+	for {
+		if atLineStart && strings.HasPrefix(l.input[l.pos:], delim) {
+			l.emit(tp)
+			l.pos += pos(len(delim))
+			l.consumeCRLF()
+			l.ignore()
+			break
+		}
+		r := l.next()
+		if r == eof {
+			return l.errorf("EOF looking for end %s front matter delimiter", name)
+		}
+		atLineStart = isEndOfLine(r)
+	}
+	return lexMainSection
+}
+
 func lexShortcodeLeftDelim(l *pageLexer) stateFunc {
 	l.pos += pos(len(l.currentLeftShortcodeDelim()))
 	if strings.HasPrefix(l.input[l.pos:], leftComment) {
@@ -234,14 +434,14 @@ func lexShortcodeComment(l *pageLexer) stateFunc {
 	l.ignore()
 	l.pos += pos(len(l.currentRightShortcodeDelim()))
 	l.emit(tText)
-	return lexTextOutsideShortcodes
+	return lexMainSection
 }
 
 func lexShortcodeRightDelim(l *pageLexer) stateFunc {
 	l.closingState = 0
 	l.pos += pos(len(l.currentRightShortcodeDelim()))
 	l.emit(l.currentRightShortcodeDelimItem())
-	return lexTextOutsideShortcodes
+	return lexMainSection
 }
 
 // either:
@@ -485,6 +685,8 @@ func isAlphaNumericOrHyphen(r rune) bool {
 	return isAlphaNumeric(r) || r == '-'
 }
 
+var crLf = []rune{'\r', '\n'} // the runes consumeCRLF tries to read, in this order
+
 func isEndOfLine(r rune) bool {
 	return r == '\r' || r == '\n'
 }
diff --git a/parser/pageparser/pageparser_intro_test.go b/parser/pageparser/pageparser_intro_test.go
new file mode 100644
index 000000000..3dc08c776
--- /dev/null
+++ b/parser/pageparser/pageparser_intro_test.go
@@ -0,0 +1,103 @@
+// Copyright 2018 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pageparser
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+)
+
+type lexerTest struct { // one table-driven lexer case
+	name  string // label printed in failure messages
+	input string // text handed to the lexer
+	items []Item // items the lexer is expected to produce
+}
+
+var (
+	tstJSON                = `{ "a": { "b": "\"Hugo\"}" } }` // nested object whose string value holds escaped quotes and a '}'
+	tstHTMLLead            = Item{tHTMLLead, 0, "  <"}
+	tstFrontMatterTOML     = Item{tFrontMatterTOML, 0, "foo = \"bar\"\n"}
+	tstFrontMatterYAML     = Item{tFrontMatterYAML, 0, "foo: \"bar\"\n"}
+	tstFrontMatterYAMLCRLF = Item{tFrontMatterYAML, 0, "foo: \"bar\"\r\n"}
+	tstFrontMatterJSON     = Item{tFrontMatterJSON, 0, tstJSON + "\r\n"}
+	tstSomeText            = Item{tText, 0, "\nSome text.\n"}
+	tstSummaryDivider      = Item{tSummaryDivider, 0, "<!--more-->"}
+	tstSummaryDividerOrg   = Item{tSummaryDividerOrg, 0, "# more"}
+
+	tstORG = `
+#+TITLE: T1
+#+AUTHOR: A1
+#+DESCRIPTION: D1
+`
+	tstFrontMatterORG = Item{tFrontMatterORG, 0, tstORG} // value includes tstORG's leading line break
+)
+
+var crLfReplacer = strings.NewReplacer("\r", "#", "\n", "$") // makes line breaks visible in TestFrontMatter failure output
+
+// frontMatterTests is the table run by TestFrontMatter. TODO(bep) a way to toggle ORG mode vs the rest.
+var frontMatterTests = []lexerTest{
+	{"empty", "", []Item{tstEOF}},
+	{"HTML Document", `  <html>  `, []Item{tstHTMLLead, Item{tText, 0, "html>  "}, tstEOF}},
+	{"YAML front matter", "---\nfoo: \"bar\"\n---\n\nSome text.\n", []Item{tstFrontMatterYAML, tstSomeText, tstEOF}},
+	// Note that the bytes inside the front matter are kept as they are ("\r\n" included); only the delimiter lines must cope with CRLF.
+	{"YAML front matter CRLF", "---\r\nfoo: \"bar\"\r\n---\n\nSome text.\n", []Item{tstFrontMatterYAMLCRLF, tstSomeText, tstEOF}},
+	{"TOML front matter", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n", []Item{tstFrontMatterTOML, tstSomeText, tstEOF}},
+	{"JSON front matter", tstJSON + "\r\n\nSome text.\n", []Item{tstFrontMatterJSON, tstSomeText, tstEOF}},
+	{"ORG front matter", tstORG + "\nSome text.\n", []Item{tstFrontMatterORG, tstSomeText, tstEOF}},
+	{"Summary divider ORG", tstORG + "\nSome text.\n# more\nSome text.\n", []Item{tstFrontMatterORG, tstSomeText, tstSummaryDividerOrg, tstSomeText, tstEOF}},
+	{"Summary divider", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n<!--more-->\nSome text.\n", []Item{tstFrontMatterTOML, tstSomeText, tstSummaryDivider, tstSomeText, tstEOF}},
+}
+
+func TestFrontMatter(t *testing.T) { // lexes every frontMatterTests input starting in lexIntroSection
+	t.Parallel()
+	for idx, tc := range frontMatterTests {
+		actual := collect(tc.name, tc.input, false, lexIntroSection)
+		if equal(actual, tc.items) {
+			continue
+		}
+		got, expected := crLfReplacer.Replace(fmt.Sprint(actual)), crLfReplacer.Replace(fmt.Sprint(tc.items))
+		t.Errorf("[%d] %s: got\n\t%v\nexpected\n\t%v", idx, tc.name, got, expected)
+	}
+}
+
+// collect lexes input starting in stateStart and returns every item up to and
+// including the first tEOF or tError. name and skipFrontMatter are unused.
+func collect(name, input string, skipFrontMatter bool, stateStart stateFunc) (items []Item) {
+	lexer := newPageLexer(input, 0, stateStart).run()
+
+	for {
+		next := lexer.nextItem()
+		items = append(items, next)
+		if next.typ == tEOF || next.typ == tError {
+			return items
+		}
+	}
+}
+
+// equal reports whether i1 and i2 have the same length and agree, item by
+// item, on typ and Val. Item positions are not compared, for now.
+func equal(i1, i2 []Item) bool {
+	if len(i1) != len(i2) {
+		return false
+	}
+
+	for idx, left := range i1 {
+		right := i2[idx]
+		if left.typ != right.typ || left.Val != right.Val {
+			return false
+		}
+	}
+	return true
+}
diff --git a/parser/pageparser/pageparser_test.go b/parser/pageparser/pageparser_shortcode_test.go
index ceb439a65..525c7452f 100644
--- a/parser/pageparser/pageparser_test.go
+++ b/parser/pageparser/pageparser_shortcode_test.go
@@ -13,15 +13,7 @@
 
 package pageparser
 
-import (
-	"testing"
-)
-
-type shortCodeLexerTest struct {
-	name  string
-	input string
-	items []Item
-}
+import "testing"
 
 var (
 	tstEOF       = Item{tEOF, 0, ""}
@@ -39,7 +31,7 @@ var (
 	tstVal       = Item{tScParamVal, 0, "Hello World"}
 )
 
-var shortCodeLexerTests = []shortCodeLexerTest{
+var shortCodeLexerTests = []lexerTest{
 	{"empty", "", []Item{tstEOF}},
 	{"spaces", " \t\n", []Item{{tText, 0, " \t\n"}, tstEOF}},
 	{"text", `to be or not`, []Item{{tText, 0, "to be or not"}, tstEOF}},
@@ -159,7 +151,7 @@ var shortCodeLexerTests = []shortCodeLexerTest{
 func TestShortcodeLexer(t *testing.T) {
 	t.Parallel()
 	for i, test := range shortCodeLexerTests {
-		items := collect(&test)
+		items := collect(test.name, test.input, true, lexMainSection)
 		if !equal(items, test.items) {
 			t.Errorf("[%d] %s: got\n\t%v\nexpected\n\t%v", i, test.name, items, test.items)
 		}
@@ -170,38 +162,10 @@ func BenchmarkShortcodeLexer(b *testing.B) {
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		for _, test := range shortCodeLexerTests {
-			items := collect(&test)
+			items := collect(test.name, test.input, true, lexMainSection)
 			if !equal(items, test.items) {
 				b.Errorf("%s: got\n\t%v\nexpected\n\t%v", test.name, items, test.items)
 			}
 		}
 	}
 }
-
-func collect(t *shortCodeLexerTest) (items []Item) {
-	l := newPageLexer(t.name, t.input, 0).run()
-	for {
-		item := l.nextItem()
-		items = append(items, item)
-		if item.typ == tEOF || item.typ == tError {
-			break
-		}
-	}
-	return
-}
-
-// no positional checking, for now ...
-func equal(i1, i2 []Item) bool {
-	if len(i1) != len(i2) {
-		return false
-	}
-	for k := range i1 {
-		if i1[k].typ != i2[k].typ {
-			return false
-		}
-		if i1[k].Val != i2[k].Val {
-			return false
-		}
-	}
-	return true
-}
author	Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>	2018-10-17 14:48:55 +0300
committer	Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>	2018-10-22 20:57:43 +0300
commit	2fdc4a24d5450a98cf38a4456e8e0e8e97a3343d (patch)
tree	409814d04e5b6454abd56a230894bd0e78e3cfb5 /parser/pageparser
parent	f6863e1ef725f654a4c869ef4955f9add6908a46 (diff)