// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package commentparser does a basic parse over a source file and returns all
// of the comments from the code. This is useful for when you want to analyze
// text written in comments (like copyright notices) but not in the code
// itself.
package commentparser

import (
	"bytes"
	"strings"
	"unicode/utf8"

	"github.com/google/licenseclassifier/commentparser/language"
)

// Format strings for lexer error positions ("%d" is the line number).
// NOTE(review): none of these constants are referenced in this file —
// candidates for removal if nothing else in the package uses them.
const (
	eofInString            = "%d:EOF in string"
	eofInSingleLineComment = "%d:EOF in single line comment"
	eofInMultilineComment  = "%d:EOF in multiline comment"
)

// Parse parses the input data and returns the comments.
//
// A nil slice is returned when contents is empty. A trailing newline is
// appended to the input if missing so the lexer's line-based bookkeeping
// always sees a terminated final line.
func Parse(contents []byte, lang language.Language) Comments {
	if len(contents) == 0 {
		return nil
	}
	c := string(contents)
	if !strings.HasSuffix(c, "\n") {
		// Force a terminating newline if one isn't present.
		c += "\n"
	}
	i := &input{
		s:      c,
		lang:   lang,
		offset: 0,
		pos:    position{line: 1, lineRune: []int{0}},
	}
	i.lex()
	return i.comments
}

// Comment is either a single line or multiline comment in a source code file.
// A single line comment has StartLine equal to EndLine. The lines are 1-based.
type Comment struct {
	StartLine int    // 1-based line on which the comment begins.
	EndLine   int    // 1-based line on which the comment ends.
	Text      string // The comment text, without begin/end markers.
}

// Comments allows us to treat a slice of comments as a unit.
type Comments []*Comment

// ChunkIterator returns a read-only channel and generates the comments in a
// goroutine, then closes the channel.
func (c Comments) ChunkIterator() <-chan Comments {
	ch := make(chan Comments)
	go func() {
		// The goroutine owns the channel; closing it here signals the
		// consumer that no more chunks are coming.
		defer close(ch)
		if len(c) == 0 {
			return
		}
		prevChunk := c[0]
		for index := 0; index < len(c); index++ {
			// Accumulate a "chunk": a run of comments whose start
			// lines are at most one line apart from the previous
			// comment in the run.
			var chunk Comments
			for ; index < len(c); index++ {
				if c[index].StartLine > prevChunk.StartLine+1 {
					break
				}
				// NOTE(review): this branch is unreachable — if
				// StartLine == prevChunk.StartLine+2 then the
				// check above (> prevChunk.StartLine+1) has
				// already broken out of the loop. Confirm the
				// intended gap semantics before removing.
				if c[index].StartLine == prevChunk.StartLine+2 {
					if c[index].StartLine != c[index].EndLine || prevChunk.StartLine != prevChunk.EndLine {
						break
					}
				}
				chunk = append(chunk, c[index])
				prevChunk = c[index]
			}
			if len(chunk) == 0 {
				break
			}
			ch <- chunk
			if index >= len(c) {
				break
			}
			// Start the next chunk at the comment that ended this
			// one; the outer loop's index++ undoes the decrement.
			prevChunk = c[index]
			index--
		}
	}()
	return ch
}

// StartLine is the line number (1-based) the first part of the comment block
// starts on. Returns 0 for an empty comment list.
func (c Comments) StartLine() int {
	if len(c) == 0 {
		return 0
	}
	return c[0].StartLine
}

// String creates a string out of the text of the comments. Comment begin and
// end markers are removed.
func (c Comments) String() string {
	var s []string
	for _, cmt := range c {
		s = append(s, cmt.Text)
	}
	return strings.Join(s, "\n")
}

// position records the location of a lexeme.
type position struct {
	line     int   // Line number of input: 1-based
	lineRune []int // Rune offset from beginning of line: 0-based
}

// input holds the current state of the lexer.
type input struct {
	s        string            // Entire input.
	lang     language.Language // Source code language.
	offset   int               // Offset into input.
	pos      position          // Current position in the input.
	comments Comments          // Comments in the source file.
}

// lex is called to obtain the comments. It scans the whole input once,
// skipping over string literals (which might contain comment markers) and
// recording single-line and multiline comments, plus Python docstrings, into
// i.comments.
func (i *input) lex() {
	for {
		c, ok := i.peekRune()
		if !ok {
			break
		}
		switch c {
		case '"', '\'', '`': // String
			// Ignore strings because they could contain comment
			// start or end sequences which we need to ignore.
			if i.lang == language.HTML {
				// Quotes in HTML-like files aren't meaningful,
				// because it's basically plain text
				break
			}
			ok, hasEscape := i.lang.QuoteCharacter(c)
			if !ok {
				break
			}
			var content bytes.Buffer
			isDocString := false
			quote := string(c)
			if i.lang == language.Python {
				// Python triple-quoted strings act as docstrings;
				// capture their text as a comment.
				if c == '\'' && i.match("'''") {
					quote = "'''"
					// Assume module-level docstrings start at the
					// beginning of a line. Function docstrings not
					// supported.
					// (lineRune == 3 means exactly the three quote
					// runes have been consumed on this line.)
					if i.pos.lineRune[len(i.pos.lineRune)-1] == 3 {
						isDocString = true
					}
				} else if c == '"' && i.match(`"""`) {
					quote = `"""`
					if i.pos.lineRune[len(i.pos.lineRune)-1] == 3 {
						isDocString = true
					}
				} else {
					i.readRune() // Eat quote.
				}
			} else {
				i.readRune() // Eat quote.
			}
			startLine := i.pos.line
			// Consume until the matching close quote.
			for {
				c, ok = i.peekRune()
				if !ok {
					return
				}
				if hasEscape && c == '\\' {
					i.readRune() // Eat escape.
				} else if i.match(quote) {
					break
				} else if (i.lang == language.JavaScript || i.lang == language.Perl) && c == '\n' {
					// JavaScript and Perl allow you to
					// specify regexes without quotes, but
					// which contain quotes. So treat the
					// newline as terminating the string.
					break
				}
				// Shadows the outer c deliberately: this is the
				// rune actually consumed this iteration.
				c := i.readRune()
				if isDocString {
					content.WriteRune(c)
				}
				if i.eof() {
					return
				}
			}
			if isDocString {
				i.comments = append(i.comments, &Comment{
					StartLine: startLine,
					EndLine:   i.pos.line,
					Text:      content.String(),
				})
			}
		default:
			// NOTE(review): this startLine is shadowed by the one
			// declared inside the multiline branch below; only the
			// single-line branch uses it.
			startLine := i.pos.line
			var comment bytes.Buffer
			if ok, start, end := i.multiLineComment(); ok { // Multiline comment
				nesting := 0
				startLine := i.pos.line
				for {
					if i.eof() {
						return
					}
					c := i.readRune()
					comment.WriteRune(c)
					if i.lang.NestedComments() && i.match(start) {
						// Allows nested comments.
						comment.WriteString(start)
						nesting++
					}
					if i.match(end) {
						if nesting > 0 {
							comment.WriteString(end)
							nesting--
						} else {
							break
						}
					}
				}
				i.comments = append(i.comments, &Comment{
					StartLine: startLine,
					EndLine:   i.pos.line,
					Text:      comment.String(),
				})
			} else if i.singleLineComment() { // Single line comment
				// Collect up to (but not including) the newline.
				for {
					if i.eof() {
						return
					}
					c = i.readRune()
					if c == '\n' {
						i.unreadRune(c)
						break
					}
					comment.WriteRune(c)
				}
				i.comments = append(i.comments, &Comment{
					StartLine: startLine,
					EndLine:   i.pos.line,
					Text:      comment.String(),
				})
			}
		}
		i.readRune() // Ignore non-comments.
	}
}

// singleLineComment returns 'true' if we've run across a single line comment
// in the given language. On success the comment-start marker has been
// consumed from the input.
func (i *input) singleLineComment() bool {
	if i.match(i.lang.SingleLineCommentStart()) {
		return true
	}
	// Some languages accept a second comment dialect.
	if i.lang == language.SQL {
		return i.match(language.MySQL.SingleLineCommentStart())
	} else if i.lang == language.ObjectiveC {
		return i.match(language.Matlab.SingleLineCommentStart())
	}
	return false
}

// multiLineComment returns 'true' if we've run across a multiline comment in
// the given language, along with the start and end markers of that comment
// style. On success the start marker has been consumed from the input.
func (i *input) multiLineComment() (bool, string, string) {
	if s := i.lang.MultilineCommentStart(); i.match(s) {
		return true, s, i.lang.MultilineCommentEnd()
	}
	// Some languages accept a second comment dialect.
	if i.lang == language.SQL {
		if s := language.MySQL.MultilineCommentStart(); i.match(s) {
			return true, s, language.MySQL.MultilineCommentEnd()
		}
	} else if i.lang == language.ObjectiveC {
		if s := language.Matlab.MultilineCommentStart(); i.match(s) {
			return true, s, language.Matlab.MultilineCommentEnd()
		}
	}
	return false, "", ""
}

// match returns 'true' if the next tokens in the stream match the given
// string. On a full match the runes are consumed; on a partial match all
// consumed runes are pushed back so the input is unchanged.
func (i *input) match(s string) bool {
	if s == "" {
		return false
	}
	saved := s
	var read []rune
	for len(s) > 0 && !i.eof() {
		r, size := utf8.DecodeRuneInString(s)
		if c, ok := i.peekRune(); ok && c == r {
			read = append(read, c)
		} else {
			// No match. Push the tokens we read back onto the stack.
			for idx := len(read) - 1; idx >= 0; idx-- {
				i.unreadRune(read[idx])
			}
			return false
		}
		s = s[size:]
		i.readRune() // Eat token.
	}
	// False here means we hit EOF before matching all of saved; any runes
	// read stay consumed in that case.
	return string(read) == saved
}

// eof reports whether the input has reached the end of the file.
func (i *input) eof() bool {
	return len(i.s) <= i.offset
}

// peekRune returns the next rune in the input without consuming it.
func (i *input) peekRune() (rune, bool) {
	if i.eof() {
		return rune(0), false
	}
	r, _ := utf8.DecodeRuneInString(i.s[i.offset:])
	return r, true
}

// readRune consumes and returns the next rune in the input, updating the
// line/column bookkeeping in i.pos.
func (i *input) readRune() rune {
	r, size := utf8.DecodeRuneInString(i.s[i.offset:])
	if r == '\n' {
		// Start a fresh rune counter for the new line.
		i.pos.line++
		i.pos.lineRune = append(i.pos.lineRune, 0)
	} else {
		i.pos.lineRune[len(i.pos.lineRune)-1]++
	}
	i.offset += size
	return r
}

// unreadRune winds the lexer's state back to before the rune was read. The
// caller must pass the rune that was actually read so offset and position
// can be rewound by the correct amount.
func (i *input) unreadRune(c rune) {
	p := make([]byte, utf8.UTFMax)
	size := utf8.EncodeRune(p, c)
	i.offset -= size
	if c == '\n' {
		i.pos.line--
		if len(i.pos.lineRune) > 1 {
			// Drop the counter for the line we just backed out of.
			i.pos.lineRune = i.pos.lineRune[:len(i.pos.lineRune)-1]
		} else {
			i.pos.lineRune[len(i.pos.lineRune)-1] = 0
		}
	} else {
		i.pos.lineRune[len(i.pos.lineRune)-1]--
	}
}