import json

from six import unichr

from ..error import GraphQLSyntaxError

# Necessary for static type checking
if False:  # flake8: noqa
    from typing import Optional, Any, List
    from .source import Source

__all__ = ["Token", "Lexer", "TokenKind", "get_token_desc", "get_token_kind_desc"]


class Token(object):
    """A lexed token: its kind, its start/end offsets in the body, and a value.

    ``value`` is set by the name, number and string readers in this module and
    is left as None for punctuators and EOF.
    """

    __slots__ = "kind", "start", "end", "value"

    def __init__(self, kind, start, end, value=None):
        # type: (int, int, int, Optional[str]) -> None
        self.kind = kind
        self.start = start
        self.end = end
        self.value = value

    def __repr__(self):
        # type: () -> str
        # The template must be non-empty: formatting an empty string silently
        # discards every argument and yields "".
        return u"<Token kind={} at {}..{} value={}>".format(
            get_token_kind_desc(self.kind), self.start, self.end, repr(self.value)
        )

    def __eq__(self, other):
        # type: (Any) -> bool
        return (
            isinstance(other, Token)
            and self.kind == other.kind
            and self.start == other.start
            and self.end == other.end
            and self.value == other.value
        )


class Lexer(object):
    """Produces tokens from a source, remembering where the last token ended."""

    __slots__ = "source", "prev_position"

    def __init__(self, source):
        # type: (Source) -> None
        self.source = source
        self.prev_position = 0

    def next_token(self, reset_position=None):
        # type: (Optional[int]) -> Token
        """Lex and return the next token.

        Lexing starts at ``reset_position`` when given, otherwise at the end of
        the previously returned token. The end of the returned token is
        remembered for the next call.
        """
        if reset_position is None:
            reset_position = self.prev_position
        token = read_token(self.source, reset_position)
        self.prev_position = token.end
        return token


class TokenKind(object):
    """Integer identifiers for every kind of token."""

    EOF = 1
    BANG = 2
    DOLLAR = 3
    PAREN_L = 4
    PAREN_R = 5
    SPREAD = 6
    COLON = 7
    EQUALS = 8
    AT = 9
    BRACKET_L = 10
    BRACKET_R = 11
    BRACE_L = 12
    PIPE = 13
    BRACE_R = 14
    NAME = 15
    VARIABLE = 16
    INT = 17
    FLOAT = 18
    STRING = 19


def get_token_desc(token):
    # type: (Token) -> str
    """Describe a token as its kind, followed by its quoted value if truthy."""
    if token.value:
        return u'{} "{}"'.format(get_token_kind_desc(token.kind), token.value)
    else:
        return get_token_kind_desc(token.kind)


def get_token_kind_desc(kind):
    # type: (int) -> str
    """Return the description registered for a token kind.

    Raises KeyError if ``kind`` is not a key of TOKEN_DESCRIPTION.
    """
    return TOKEN_DESCRIPTION[kind]


TOKEN_DESCRIPTION = {
    TokenKind.EOF: "EOF",
    TokenKind.BANG: "!",
    TokenKind.DOLLAR: "$",
    TokenKind.PAREN_L: "(",
    TokenKind.PAREN_R: ")",
    TokenKind.SPREAD: "...",
    TokenKind.COLON: ":",
    TokenKind.EQUALS: "=",
    TokenKind.AT: "@",
    TokenKind.BRACKET_L: "[",
    TokenKind.BRACKET_R: "]",
    TokenKind.BRACE_L: "{",
    TokenKind.PIPE: "|",
    TokenKind.BRACE_R: "}",
    TokenKind.NAME: "Name",
    TokenKind.VARIABLE: "Variable",
    TokenKind.INT: "Int",
    TokenKind.FLOAT: "Float",
    TokenKind.STRING: "String",
}


def char_code_at(s, pos):
    # type: (str, int) -> Optional[int]
    """Return the code point of ``s[pos]``, or None if ``pos`` is out of range."""
    if 0 <= pos < len(s):
        return ord(s[pos])

    return None


# Single-character punctuators, keyed by character code.
PUNCT_CODE_TO_KIND = {
    ord("!"): TokenKind.BANG,
    ord("$"): TokenKind.DOLLAR,
    ord("("): TokenKind.PAREN_L,
    ord(")"): TokenKind.PAREN_R,
    ord(":"): TokenKind.COLON,
    ord("="): TokenKind.EQUALS,
    ord("@"): TokenKind.AT,
    ord("["): TokenKind.BRACKET_L,
    ord("]"): TokenKind.BRACKET_R,
    ord("{"): TokenKind.BRACE_L,
    ord("|"): TokenKind.PIPE,
    ord("}"): TokenKind.BRACE_R,
}


def print_char_code(code):
    # type: (Optional[int]) -> str
    """Render a character code for use in an error message.

    None (no character, i.e. end of input) is rendered as ``<EOF>``; codes
    below 0x7F are rendered as a JSON string; anything else as a quoted
    ``\\uXXXX`` escape.
    """
    if code is None:
        # An empty string here would produce messages such as
        # "Unexpected character ." with nothing identifying the problem.
        return "<EOF>"

    if code < 0x007F:
        return json.dumps(unichr(code))

    return '"\\u%04X"' % code


def read_token(source, from_position):
    # type: (Source, int) -> Token
    """Gets the next token from the source starting at the given position.

    This skips over whitespace and comments until it finds the next lexable
    token, then lexes punctuators immediately or calls the appropriate
    helper function for more complicated tokens.

    Raises GraphQLSyntaxError on a character that cannot start a token."""
    body = source.body
    body_length = len(body)

    position = position_after_whitespace(body, from_position)

    if position >= body_length:
        return Token(TokenKind.EOF, position, position)

    code = char_code_at(body, position)
    # Compare against None rather than relying on truthiness: a NUL character
    # has code 0 and must still go through the control-character check.
    if code is not None:
        if code < 0x0020 and code not in (0x0009, 0x000A, 0x000D):
            raise GraphQLSyntaxError(
                source, position, u"Invalid character {}.".format(print_char_code(code))
            )

        kind = PUNCT_CODE_TO_KIND.get(code)
        if kind is not None:
            return Token(kind, position, position + 1)

        if code == 46:  # .
            if (
                char_code_at(body, position + 1)
                == char_code_at(body, position + 2)
                == 46
            ):
                return Token(TokenKind.SPREAD, position, position + 3)
        elif 65 <= code <= 90 or code == 95 or 97 <= code <= 122:
            # A-Z, _, a-z
            return read_name(source, position)
        elif code == 45 or 48 <= code <= 57:  # -, 0-9
            return read_number(source, position, code)
        elif code == 34:  # "
            return read_string(source, position)

    raise GraphQLSyntaxError(
        source, position, u"Unexpected character {}.".format(print_char_code(code))
    )


ignored_whitespace_characters = frozenset(
    [
        # BOM
        0xFEFF,
        # White Space
        0x0009,  # tab
        0x0020,  # space
        # Line Terminator
        0x000A,  # new line
        0x000D,  # carriage return
        # Comma
        0x002C,
    ]
)


def position_after_whitespace(body, start_position):
    # type: (str, int) -> int
    """Reads from body starting at start_position until it finds a character
    that is neither ignored whitespace nor part of a ``#`` comment, then
    returns the position of that character for lexing. Returns a position at
    or past the end of body if no such character exists."""
    body_length = len(body)
    position = start_position
    while position < body_length:
        code = char_code_at(body, position)
        if code in ignored_whitespace_characters:
            position += 1

        elif code == 35:  # #, skip comments
            position += 1
            # A comment runs until a line terminator or any control character
            # other than tab.
            while position < body_length:
                code = char_code_at(body, position)
                if not (
                    code is not None
                    and (code > 0x001F or code == 0x0009)
                    and code not in (0x000A, 0x000D)
                ):
                    break

                position += 1
        else:
            break
    return position


def read_number(source, start, first_code):
    # type: (Source, int, Optional[int]) -> Token
    r"""Reads a number token from the source file, either a float
    or an int depending on whether a decimal point or exponent appears.

    Int:   -?(0|[1-9][0-9]*)
    Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?

    Raises GraphQLSyntaxError on a digit following a leading zero or when a
    required digit is missing."""
    code = first_code
    body = source.body
    position = start
    is_float = False

    if code == 45:  # -
        position += 1
        code = char_code_at(body, position)

    if code == 48:  # 0
        position += 1
        code = char_code_at(body, position)

        if code is not None and 48 <= code <= 57:
            raise GraphQLSyntaxError(
                source,
                position,
                u"Invalid number, unexpected digit after 0: {}.".format(
                    print_char_code(code)
                ),
            )
    else:
        position = read_digits(source, position, code)
        code = char_code_at(body, position)

    if code == 46:  # .
        is_float = True

        position += 1
        code = char_code_at(body, position)
        position = read_digits(source, position, code)
        code = char_code_at(body, position)

    if code in (69, 101):  # E e
        is_float = True
        position += 1
        code = char_code_at(body, position)
        if code in (43, 45):  # + -
            position += 1
            code = char_code_at(body, position)

        position = read_digits(source, position, code)

    return Token(
        TokenKind.FLOAT if is_float else TokenKind.INT,
        start,
        position,
        body[start:position],
    )


def read_digits(source, start, first_code):
    # type: (Source, int, Optional[int]) -> int
    """Consume one or more digits beginning at ``start``.

    ``first_code`` is the character code at ``start``. Returns the position of
    the first non-digit; raises GraphQLSyntaxError if ``first_code`` is not a
    digit."""
    body = source.body
    position = start
    code = first_code

    if code is not None and 48 <= code <= 57:  # 0 - 9
        while True:
            position += 1
            code = char_code_at(body, position)
            if not (code is not None and 48 <= code <= 57):
                break

        return position

    raise GraphQLSyntaxError(
        source,
        position,
        u"Invalid number, expected digit but got: {}.".format(print_char_code(code)),
    )


# Single-character escapes, keyed by the code of the character that follows
# the backslash.
ESCAPED_CHAR_CODES = {
    34: '"',
    47: "/",
    92: "\\",
    98: "\b",
    102: "\f",
    110: "\n",
    114: "\r",
    116: "\t",
}


def read_string(source, start):
    # type: (Source, int) -> Token
    r"""Reads a string token from the source file.

    "([^"\\\u000A\u000D\u2028\u2029]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"

    ``start`` is the position of the opening quote. Raises GraphQLSyntaxError
    on an invalid control character, an invalid escape sequence, or a string
    with no closing quote.
    """
    body = source.body
    body_length = len(body)

    position = start + 1
    chunk_start = position
    value = []  # type: List[str]
    append = value.append

    while position < body_length:
        code = char_code_at(body, position)
        if code in (
            None,
            # LineTerminator
            0x000A,
            0x000D,
            # Quote
            34,
        ):
            break

        if code < 0x0020 and code != 0x0009:  # type: ignore
            raise GraphQLSyntaxError(
                source,
                position,
                u"Invalid character within String: {}.".format(print_char_code(code)),
            )

        position += 1
        if code == 92:  # \
            # Flush the literal text preceding the backslash.
            append(body[chunk_start : position - 1])

            code = char_code_at(body, position)
            if code is None:
                # The body ends right after the backslash; without this guard
                # unichr(None) below would raise TypeError.
                raise GraphQLSyntaxError(source, position, "Unterminated string")

            escaped = ESCAPED_CHAR_CODES.get(code)
            if escaped is not None:
                append(escaped)

            elif code == 117:  # u
                # Missing characters are passed as 0, which is not a hex digit
                # and therefore makes the result negative.
                char_code = uni_char_code(
                    char_code_at(body, position + 1) or 0,
                    char_code_at(body, position + 2) or 0,
                    char_code_at(body, position + 3) or 0,
                    char_code_at(body, position + 4) or 0,
                )

                if char_code < 0:
                    raise GraphQLSyntaxError(
                        source,
                        position,
                        u"Invalid character escape sequence: \\u{}.".format(
                            body[position + 1 : position + 5]
                        ),
                    )

                append(unichr(char_code))
                position += 4
            else:
                raise GraphQLSyntaxError(
                    source,
                    position,
                    u"Invalid character escape sequence: \\{}.".format(unichr(code)),
                )

            position += 1
            chunk_start = position

    # Inspect the character actually at ``position`` instead of the last code
    # seen in the loop: after an escape such as \" at the very end of the body
    # the last code is a quote even though no closing quote exists.
    if char_code_at(body, position) != 34:  # Quote (")
        raise GraphQLSyntaxError(source, position, "Unterminated string")

    append(body[chunk_start:position])
    return Token(TokenKind.STRING, start, position + 1, u"".join(value))


def uni_char_code(a, b, c, d):
    # type: (int, int, int, int) -> int
    """Converts four hexadecimal character codes to the integer that the
    four-character string represents. For example,
    uni_char_code(48, 48, 48, 102) (the codes of '000f') returns 15, and
    uni_char_code(48, 48, 102, 102) (the codes of '00ff') returns 255.

    Returns a negative number on error, if a char was invalid.

    This is implemented by noting that char2hex() returns -1 on error,
    which means the result of ORing the char2hex() will also be negative.
    """
    return char2hex(a) << 12 | char2hex(b) << 8 | char2hex(c) << 4 | char2hex(d)


def char2hex(a):
    # type: (int) -> int
    """Converts the code of a hex character to its integer value.
    '0' becomes 0, '9' becomes 9
    'A' becomes 10, 'F' becomes 15
    'a' becomes 10, 'f' becomes 15

    Returns -1 on error."""
    if 48 <= a <= 57:  # 0-9
        return a - 48
    elif 65 <= a <= 70:  # A-F
        return a - 55
    elif 97 <= a <= 102:  # a-f
        return a - 87
    return -1


def read_name(source, position):
    # type: (Source, int) -> Token
    """Reads an alphanumeric + underscore name from the source.

    [_A-Za-z][_0-9A-Za-z]*

    The character at ``position`` is assumed to have been validated by the
    caller; only the following characters are checked here."""
    body = source.body
    body_length = len(body)
    end = position + 1

    while end < body_length:
        code = char_code_at(body, end)
        if not (
            code is not None
            and (
                code == 95  # _
                or 48 <= code <= 57  # 0-9
                or 65 <= code <= 90  # A-Z
                or 97 <= code <= 122  # a-z
            )
        ):
            break

        end += 1

    return Token(TokenKind.NAME, position, end, body[position:end])