|
- package depends.extractor.python.union;
-
- import org.antlr.v4.runtime.CharStream;
- import org.antlr.v4.runtime.CommonToken;
- import org.antlr.v4.runtime.Lexer;
- import org.antlr.v4.runtime.Token;
-
- import depends.extractor.python.PythonLexer;
-
- import java.util.Stack;
-
- public abstract class PythonLexerBase extends Lexer {
- public static int TabSize = 8;
-
- // The amount of opened braces, brackets and parenthesis.
- private int _opened;
-
- // The stack that keeps track of the indentation level.
- private Stack<Integer> _indents = new Stack<>();
-
- // A circular buffer where extra tokens are pushed on (see the NEWLINE and WS lexer rules).
- private int _firstTokensInd;
- private int _lastTokenInd;
- private Token[] _buffer = new Token[32];
- private Token _lastToken;
-
- protected PythonLexerBase(CharStream input) {
- super(input);
- }
-
- @Override
- public void emit(Token token) {
- super.setToken(token);
-
- if (_buffer[_firstTokensInd] != null)
- {
- _lastTokenInd = IncTokenInd(_lastTokenInd);
-
- if (_lastTokenInd == _firstTokensInd)
- {
- // Enlarge buffer
- Token[] newArray = new Token[_buffer.length * 2];
- int destInd = newArray.length - (_buffer.length - _firstTokensInd);
-
- System.arraycopy(_buffer, 0, newArray, 0, _firstTokensInd);
- System.arraycopy(_buffer, _firstTokensInd, newArray, destInd, _buffer.length - _firstTokensInd);
-
- _firstTokensInd = destInd;
- _buffer = newArray;
- }
- }
-
- _buffer[_lastTokenInd] = token;
- _lastToken = token;
- }
-
- @Override
- public Token nextToken() {
- // Check if the end-of-file is ahead and there are still some DEDENTS expected.
- if (_input.LA(1) == EOF && _indents.size() > 0)
- {
- if (_buffer[_lastTokenInd] == null || _buffer[_lastTokenInd].getType() != PythonLexer.LINE_BREAK)
- {
- // First emit an extra line break that serves as the end of the statement.
- emit(PythonLexer.LINE_BREAK);
- }
-
- // Now emit as much DEDENT tokens as needed.
- while (_indents.size() != 0)
- {
- emit(PythonLexer.DEDENT);
- _indents.pop();
- }
- }
-
- Token next = super.nextToken();
-
- if (_buffer[_firstTokensInd] == null)
- {
- return next;
- }
-
- Token result = _buffer[_firstTokensInd];
- _buffer[_firstTokensInd] = null;
-
- if (_firstTokensInd != _lastTokenInd)
- {
- _firstTokensInd = IncTokenInd(_firstTokensInd);
- }
-
- return result;
- }
-
- protected void HandleNewLine() {
- emit(PythonLexer.NEWLINE, HIDDEN, getText());
-
- char next = (char) _input.LA(1);
-
- // Process whitespaces in HandleSpaces
- if (next != ' ' && next != '\t' && IsNotNewLineOrComment(next))
- {
- ProcessNewLine(0);
- }
- }
-
- protected void HandleSpaces() {
- char next = (char) _input.LA(1);
-
- if ((_lastToken == null || _lastToken.getType() == PythonLexer.NEWLINE) && IsNotNewLineOrComment(next))
- {
- // Calculates the indentation of the provided spaces, taking the
- // following rules into account:
- //
- // "Tabs are replaced (from left to right) by one to eight spaces
- // such that the total number of characters up to and including
- // the replacement is a multiple of eight [...]"
- //
- // -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
-
- int indent = 0;
- String text = getText();
-
- for (int i = 0; i < text.length(); i++) {
- indent += text.charAt(i) == '\t' ? TabSize - indent % TabSize : 1;
- }
-
- ProcessNewLine(indent);
- }
-
- emit(PythonLexer.WS, HIDDEN, getText());
- }
-
- protected void IncIndentLevel() {
- _opened++;
- }
-
- protected void DecIndentLevel() {
- if (_opened > 0) {
- --_opened;
- }
- }
-
- private boolean IsNotNewLineOrComment(char next) {
- return _opened == 0 && next != '\r' && next != '\n' && next != '\f' && next != '#';
- }
-
- private void ProcessNewLine(int indent) {
- emit(PythonLexer.LINE_BREAK);
-
- int previous = _indents.size() == 0 ? 0 : _indents.peek();
-
- if (indent > previous)
- {
- _indents.push(indent);
- emit(PythonLexer.INDENT);
- }
- else
- {
- // Possibly emit more than 1 DEDENT token.
- while (_indents.size() != 0 && _indents.peek() > indent)
- {
- emit(PythonLexer.DEDENT);
- _indents.pop();
- }
- }
- }
-
- private int IncTokenInd(int ind) {
- return (ind + 1) % _buffer.length;
- }
-
- private void emit(int tokenType) {
- emit(tokenType, DEFAULT_TOKEN_CHANNEL, "");
- }
-
- private void emit(int tokenType, int channel, String text) {
- int charIndex = getCharIndex();
- CommonToken token = new CommonToken(_tokenFactorySourcePair, tokenType, channel, charIndex - text.length(), charIndex);
- token.setLine(getLine());
- token.setCharPositionInLine(getCharPositionInLine());
- token.setText(text);
-
- emit(token);
- }
- }
|