mattermost/server/channels/app/password/phcparser/parser.go

// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.

// Package phcparser provides a type to parse strings conformant to the PHC string format:
// https://github.com/P-H-C/phc-string-format/blob/master/phc-sf-spec.md
package phcparser

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
)

// PHC represents a PHC string, with all its parts already parsed.
//
// Only Id is mandatory in a PHC string. Every other part is optional, and a
// part that is missing from the parsed string is left empty here.
type PHC struct {
	// Id is the identifier of the hashing function.
	Id string

	// Version is an optional string containing the specific version of the
	// hashing function used. It is the value that follows the "v=" key.
	Version string

	// Params is a map of parameters, containing a set of all parameter names
	// with their corresponding values, both kept as the strings found in the
	// input.
	Params map[string]string

	// Salt is the base64-encoded salt used when hashing the original password.
	// It is stored as found in the input, without being decoded.
	Salt string

	// Hash is the base64-encoded hash generated when hashing the original
	// password with the function specified by all other parameters.
	// It is stored as found in the input, without being decoded.
	Hash string
}

// Parser is a wrapper of a limited bufio.Reader that will parse its input into
// a [PHC]. Use [New] to build one.
type Parser struct {
	// reader is the buffered source the runes are read from; New wraps the
	// caller's reader in an io.LimitReader before buffering it.
	reader *bufio.Reader
}

// MaxRunes is the maximum length allowed for a PHC string. If the string is
// longer, the remainder is ignored.
//
// NOTE: [New] enforces this limit with io.LimitReader, which counts bytes, not
// runes. Every rune accepted by the parser is a single-byte ASCII character,
// so both units coincide for any input that parses successfully.
const MaxRunes = 256

// New returns a [Parser] that reads its input from r.
//
// The input is capped with io.LimitReader, so at most [MaxRunes] bytes of r are
// handed to the parser; anything past that limit is never seen by it.
func New(r io.Reader) *Parser {
	limited := io.LimitReader(r, MaxRunes)
	buffered := bufio.NewReader(limited)

	return &Parser{reader: buffered}
}

// Token represents a minimal unit of meaning in the parsed string.
//
// Each token is a distinct bit, so several of them can be ORed together to
// express a set of tokens.
type Token uint

const (
	// ILLEGAL is a token representing any rune that is neither a separator
	// nor part of the identifier being scanned.
	// Each constant in this block takes the next bit: 1, 2, 4, ...
	ILLEGAL Token = 1 << iota

	// Separator tokens

	// EOF is a token representing the end of the input
	EOF
	// DOLLARSIGN is a token representing a '$'
	DOLLARSIGN
	// COMMA is a token representing a ','
	COMMA
	// EQUALSIGN is a token representing a '='
	EQUALSIGN

	// Literals

	// FUNCTIONID is a token representing a non-empty sequence of any of the
	// following symbols: [a-z0-9-]
	FUNCTIONID
	// PARAMNAME is a token representing a non-empty sequence of any of the
	// following symbols: [a-z0-9-]
	PARAMNAME
	// PARAMVALUE is a token representing a non-empty sequence of any of the
	// following symbols: [a-zA-Z0-9/+.-]
	PARAMVALUE
	// B64ENCODED is a token representing a non-empty sequence of any of the
	// following symbols: [A-Za-z0-9+/]
	B64ENCODED
)

const (
	// IDENT is a generic identifier that represents any of its possibilities:
	// either a FUNCTIONID, a PARAMNAME, a PARAMVALUE or a B64ENCODED.
	// It has the bits of all four set, so ANDing it with any one of them
	// yields a non-zero value.
	IDENT Token = FUNCTIONID | PARAMNAME | PARAMVALUE | B64ENCODED
)

// eof is a constant literal representing EOF. It is the NUL rune, used as a
// sentinel by read when no further rune can be obtained from the reader.
const eof = rune(0)

// isLowercaseLetter reports whether ch is an ASCII lowercase letter: [a-z]
func isLowercaseLetter(ch rune) bool {
	if ch < 'a' {
		return false
	}
	return ch <= 'z'
}

// isLetter reports whether ch is an ASCII letter of either case: [A-Za-z]
func isLetter(ch rune) bool {
	switch {
	case 'a' <= ch && ch <= 'z':
		return true
	case 'A' <= ch && ch <= 'Z':
		return true
	default:
		return false
	}
}

// isDigit reports whether ch is an ASCII decimal digit: [0-9]
func isDigit(ch rune) bool {
	if ch > '9' {
		return false
	}
	return ch >= '0'
}

// isB64 reports whether ch belongs to the base64 alphabet used for salts and
// hashes, which does not include the padding character: [A-Za-z0-9+/]
func isB64(ch rune) bool {
	switch ch {
	case '+', '/':
		return true
	}
	return isLetter(ch) || isDigit(ch)
}

// isSymbol reports whether ch is one of the four symbols that may appear in a
// parameter value: [/+.-]
func isSymbol(ch rune) bool {
	switch ch {
	case '/', '+', '.', '-':
		return true
	default:
		return false
	}
}

// isLowercaseLetterOrDigitOrMinus reports whether ch may be part of a function
// identifier or a parameter name: [a-z0-9-]
func isLowercaseLetterOrDigitOrMinus(ch rune) bool {
	if ch == '-' {
		return true
	}
	return isLowercaseLetter(ch) || isDigit(ch)
}

// isLetterOrDigitOrSymbol reports whether ch may be part of a parameter value:
// [a-zA-Z0-9/+.-]
func isLetterOrDigitOrSymbol(ch rune) bool {
	switch {
	case isLetter(ch), isDigit(ch), isSymbol(ch):
		return true
	default:
		return false
	}
}

// none rejects every rune. It is the predicate to use when no identifier is
// allowed, so that only separators can be scanned.
func none(_ rune) bool {
	return false
}

// read reads a single rune, returning [eof] when no rune can be obtained from
// the reader, be it because the input is exhausted or because reading failed.
//
// A literal NUL rune in the input has the same value as the [eof] sentinel.
// It is reported as U+FFFD instead, so that it is classified as an illegal
// rune rather than silently treated as the end of the input, which would
// accept a string with arbitrary content after the NUL.
func (p *Parser) read() rune {
	ch, _, err := p.reader.ReadRune()
	if err != nil {
		return eof
	}

	if ch == eof {
		// U+FFFD is what ReadRune itself yields for invalid input, and none of
		// the rune classes of the grammar accepts it.
		return '\uFFFD'
	}

	return ch
}

// unread asks the underlying reader to put back the last rune read. The error
// returned by UnreadRune is discarded.
func (p *Parser) unread() {
	_ = p.reader.UnreadRune()
}

// scan returns the next token along with its literal. The token is either an
// identifier, made of runes accepted by isIdentAllowedRune, or one of the
// single-rune separators: EOF $ , =
//
// Any other rune yields ILLEGAL, with the offending rune as the literal.
func (p *Parser) scan(isIdentAllowedRune func(rune) bool) (tok Token, lit string) {
	next := p.read()

	switch {
	case isIdentAllowedRune(next):
		// The rune starts an identifier: put it back and let scanIdent
		// consume the identifier from its first rune.
		p.unread()
		return p.scanIdent(isIdentAllowedRune)
	case next == eof:
		return EOF, ""
	case next == '$':
		return DOLLARSIGN, string(next)
	case next == ',':
		return COMMA, string(next)
	case next == '=':
		return EQUALSIGN, string(next)
	default:
		return ILLEGAL, string(next)
	}
}

// scanIdent consumes a run of contiguous runes accepted by isIdentAllowedRune
// and returns them as a single identifier.
//
// The first rune is consumed without being checked: scan only calls this
// function after verifying that the next rune is allowed.
func (p *Parser) scanIdent(isIdentAllowedRune func(rune) bool) (tok Token, lit string) {
	var ident bytes.Buffer
	_, _ = ident.WriteRune(p.read())

	for next := p.read(); next != eof; next = p.read() {
		if !isIdentAllowedRune(next) {
			// This rune belongs to the next token, so leave it in the reader.
			p.unread()
			break
		}

		_, _ = ident.WriteRune(next)
	}

	// The generic IDENT is returned regardless of the kind of identifier:
	// which runes are valid was already decided by isIdentAllowedRune.
	return IDENT, ident.String()
}

// scanSeparator scans a single token without accepting any identifier rune,
// so the result is one of the separators (EOF, '$', ',' or '=') or ILLEGAL
// for anything else.
func (p *Parser) scanSeparator() (tok Token, lit string) {
	tok, lit = p.scan(none)
	return tok, lit
}

// parseToken scans the next token and returns its literal when it is one of
// the expected tokens, or an error otherwise.
//
// expected can be an ORed expression of different tokens, like
//
//	EOF | DOLLARSIGN | FUNCTIONID
//
// In this case, any of those tokens are allowed, and the literal of the one
// found is returned. When several identifier tokens are ORed together, the
// runes accepted for the identifier are the union of the runes that each of
// them allows.
//
// On a mismatch, the literal that was actually found is returned alongside the
// error, so that callers can report it in their own, more specific, messages.
func (p *Parser) parseToken(expected Token) (string, error) {
	// Identifier runes are accepted only for the identifier tokens present in
	// expected. With none of them present this accepts nothing, and only
	// separators can be scanned.
	allowedRune := func(ch rune) bool {
		switch {
		case expected&(FUNCTIONID|PARAMNAME) != 0 && isLowercaseLetterOrDigitOrMinus(ch):
			return true
		case expected&PARAMVALUE != 0 && isLetterOrDigitOrSymbol(ch):
			return true
		case expected&B64ENCODED != 0 && isB64(ch):
			return true
		}
		return false
	}

	token, literal := p.scan(allowedRune)
	if token&expected == 0 {
		return literal, fmt.Errorf("found %q, expected %s", literal, describeTokens(expected))
	}

	return literal, nil
}

// describeTokens renders the set of tokens in tokens as a human-readable list
// of alternatives, to be used in error messages.
func describeTokens(tokens Token) string {
	known := []struct {
		token Token
		name  string
	}{
		{EOF, "EOF"},
		{DOLLARSIGN, "'$'"},
		{COMMA, "','"},
		{EQUALSIGN, "'='"},
		{FUNCTIONID, "a function identifier"},
		{PARAMNAME, "a parameter name"},
		{PARAMVALUE, "a parameter value"},
		{B64ENCODED, "a base64-encoded string"},
	}

	description := ""
	for _, k := range known {
		if tokens&k.token == 0 {
			continue
		}
		if description != "" {
			description += " or "
		}
		description += k.name
	}

	if description == "" {
		return "a valid token"
	}
	return description
}

// parseFunctionId parses the leading '$' followed by the function identifier,
// and returns the identifier. If either of them is missing, it returns an
// error stating which one was expected.
func (p *Parser) parseFunctionId() (string, error) {
	if found, err := p.parseToken(DOLLARSIGN); err != nil {
		return found, fmt.Errorf("found %q, expected '$'", found)
	}

	id, err := p.parseToken(FUNCTIONID)
	if err != nil {
		return id, fmt.Errorf("found %q, expected a function identifier", id)
	}

	return id, nil
}

// parseHash parses a base64-encoded hash, which must be the last thing in the
// input. It returns the hash as found, without decoding it.
func (p *Parser) parseHash() (string, error) {
	hash, hashErr := p.parseToken(B64ENCODED)
	if hashErr != nil {
		return "", fmt.Errorf("found %q, expected the hash", hash)
	}

	// Nothing is allowed after the hash, so the input must end right here.
	if trailing, eofErr := p.parseToken(EOF); eofErr != nil {
		return "", fmt.Errorf("found %q, expected EOF", trailing)
	}

	return hash, nil
}

// parseParamRHS parses the right-hand side of a parameter, that is, an equal
// sign followed by a parameter value, returning only the parameter value.
func (p *Parser) parseParamRHS() (string, error) {
	literal, err := p.parseToken(EQUALSIGN)
	if err != nil {
		return literal, err
	}

	return p.parseToken(PARAMVALUE)
}

// Parse parses the [Parser]'s reader into a [PHC].
//
// The accepted strings have the following shape, where the parts in brackets
// are optional:
//
//	$<id>[$v=<version>][$<param>=<value>(,<param>=<value>)*][$<salt>[$<hash>]]
//
// This function will return an error along with an empty [PHC] when the provided
// input is not PHC-compliant.
func (p *Parser) Parse() (PHC, error) {
	// Until the separator that follows it is seen, the identifier after a '$'
	// may be the version key, a parameter name or the salt. Parameter names
	// and salts use different alphabets, and neither contains the other, so
	// the identifier is scanned with the union of both and validated against
	// the right one as soon as we know what it is.
	isParamNameOrSaltRune := func(ch rune) bool {
		return isLowercaseLetterOrDigitOrMinus(ch) || isB64(ch)
	}
	consistsOf := func(s string, allowed func(rune) bool) bool {
		for _, ch := range s {
			if !allowed(ch) {
				return false
			}
		}
		return true
	}

	// Initialize the returned PHC and its inner parameters map
	out := PHC{}
	out.Params = make(map[string]string)

	// Start parsing: first, we expect '$functionId'
	id, err := p.parseFunctionId()
	if err != nil {
		return PHC{}, fmt.Errorf("failed to parse function ID: %w", err)
	}
	out.Id = id

	// Now we expect either EOF, or to continue parsing with a '$'
	switch token, literal := p.scanSeparator(); token {
	case EOF:
		// Just a function identifier is valid, according to the spec
		return out, nil
	case DOLLARSIGN:
		// We continue parsing
	default:
		return PHC{}, fmt.Errorf("found %q, expected '$' or EOF", literal)
	}

	// There was a '$', so we expect now another identifier, which can either be:
	// - The version key, "v",
	// - A parameter name
	// - The salt
	identToken, ident := p.scan(isParamNameOrSaltRune)
	if identToken&IDENT == 0 {
		return PHC{}, fmt.Errorf("found %q, expected the version key, 'v', a parameter name or the salt", ident)
	}

	// If it's the version key, then we know now that we are parsing
	// '$v=versionStr', and we expect now '=versionStr'
	if ident == "v" {
		versionStr, err := p.parseParamRHS()
		if err != nil {
			return PHC{}, fmt.Errorf("failed parsing version string: %w", err)
		}
		out.Version = versionStr

		// Now we expect either EOF, or to continue parsing with a '$'
		switch token, literal := p.scanSeparator(); token {
		case EOF:
			// Just a function identifier + version is valid, according to the spec
			return out, nil
		case DOLLARSIGN:
			// We continue parsing
		default:
			return PHC{}, fmt.Errorf("found %q, expected '$' or EOF", literal)
		}

		// Read the next ident into the variable we had before, so we can continue
		// the logic regardless of whether this block was executed or not. The
		// version was already parsed, so only a parameter name or the salt
		// can follow.
		identToken, ident = p.scan(isParamNameOrSaltRune)
		if identToken&IDENT == 0 {
			return PHC{}, fmt.Errorf("found %q, expected a parameter name or the salt", ident)
		}
	}

	// Now, we either didn't have a version key, or we have already parsed it,
	// so we are left with either a parameter name or the salt.
	// We know which one by scanning the next token:
	switch token, literal := p.scanSeparator(); token {
	// If the following token is '=', then it was a parameter name, and we
	// expect now '=value'
	case EQUALSIGN:
		paramName := ident
		// Additional validation for the parameter name not to have the invalid
		// value "v"
		if paramName == "v" {
			return PHC{}, fmt.Errorf("found 'v' as a parameter name, which is only allowed as the version key")
		}
		// The identifier was scanned with a wider alphabet, so make sure it
		// only contains runes valid in a parameter name.
		if !consistsOf(paramName, isLowercaseLetterOrDigitOrMinus) {
			return PHC{}, fmt.Errorf("found %q, expected a parameter name", paramName)
		}
		// Now we parse the value; the '=' was already consumed
		paramValue, err := p.parseToken(PARAMVALUE)
		if err != nil {
			return PHC{}, fmt.Errorf("found %q, expected a value for parameter %q", paramValue, paramName)
		}

		// And we store the parameter
		out.Params[paramName] = paramValue

	// If the following token is '$' or EOF, then it was the salt, so we store it,
	// and optionally parse the hash
	case DOLLARSIGN, EOF:
		salt := ident
		// The identifier was scanned with a wider alphabet, so make sure it
		// only contains runes valid in a salt.
		if !consistsOf(salt, isB64) {
			return PHC{}, fmt.Errorf("found %q, expected the salt", salt)
		}
		out.Salt = salt

		// If the token was '$', then now we expect a hash
		if token == DOLLARSIGN {
			hash, err := p.parseHash()
			if err != nil {
				return PHC{}, err
			}
			out.Hash = hash
		}

		return out, nil
	// Otherwise, we have an error
	default:
		return PHC{}, fmt.Errorf("found %q, expected either '$', or '=' or EOF", literal)
	}

	// If we are here, it means that we just parsed a parameter value, so now we
	// have three possibilities (in a loop):
	// - If we see EOF, then we're done!
	// - If we see a comma, then we expect another name=value pair, and we
	//   restart the loop
	// - If we see '$', then we need to parse 'salt[$hash]', and we finish
	for {
		switch token, literal := p.scanSeparator(); token {
		// We're done!
		case EOF:
			return out, nil
		// Parse a name=value pair, and continue the loop
		case COMMA:
			paramName, err := p.parseToken(PARAMNAME)
			if err != nil {
				return PHC{}, err
			}

			paramValue, err := p.parseParamRHS()
			if err != nil {
				return PHC{}, fmt.Errorf("failed parsing value from parameter %q: %w", paramName, err)
			}
			out.Params[paramName] = paramValue
		// Parse a salt and an optional hash, and finish
		case DOLLARSIGN:
			salt, err := p.parseToken(B64ENCODED)
			if err != nil {
				return PHC{}, err
			}
			out.Salt = salt

			switch token, newLiteral := p.scanSeparator(); token {
			// If what we parsed was a $, then now we expect a $hash
			case DOLLARSIGN:
				hash, err := p.parseHash()
				if err != nil {
					return PHC{}, err
				}
				out.Hash = hash
				return out, nil
			// If what we parsed was an EOF, then we return successfully
			case EOF:
				return out, nil
			// Otherwise, we have an error
			default:
				return PHC{}, fmt.Errorf("found %q, expected either '$', or EOF", newLiteral)
			}
		default:
			return PHC{}, fmt.Errorf("found %q, expected either ',', '$' or EOF", literal)
		}
	}
}