update to Python 3.13.2 #4409

Closed
wants to merge 1 commit into from
2 changes: 2 additions & 0 deletions python/python2_7_18/CSharp/AssemblyInfo.cs
@@ -0,0 +1,2 @@
[assembly: CLSCompliant(true)]

24 changes: 11 additions & 13 deletions python/python2_7_18/CSharp/PythonLexerBase.cs
@@ -34,10 +34,10 @@ THE SOFTWARE.
public abstract class PythonLexerBase : Lexer
{
// A stack that keeps track of the indentation lengths
private Stack<int> indentLengthStack;
private Stack<int> indentLengthStack = new();
// A list where tokens are waiting to be loaded into the token stream
private LinkedList<IToken> pendingTokens;
private LinkedList<IToken> pendingTokens = new();

// last pending token types
private int previousPendingTokenType;
private int lastPendingTokenTypeFromDefaultChannel;
@@ -49,26 +49,24 @@ public abstract class PythonLexerBase : Lexer
private bool wasTabIndentation;
private bool wasIndentationMixedWithSpacesAndTabs;

private IToken curToken; // current (under processing) token
private IToken ffgToken; // following (look ahead) token
private IToken curToken = null!; // current (under processing) token
private IToken ffgToken = null!; // following (look ahead) token

private const int INVALID_LENGTH = -1;
private const string ERR_TXT = " ERROR: ";

protected PythonLexerBase(ICharStream input) : base(input)
{
this.Init();
}

protected PythonLexerBase(ICharStream input, TextWriter output, TextWriter errorOutput) : base(input, output, errorOutput)
{
this.Init();
}

public override IToken NextToken() // reads the input stream until an EOF token is returned
{
this.CheckNextToken();
IToken firstPendingToken = this.pendingTokens.First.Value;
IToken firstPendingToken = this.pendingTokens.First!.Value;
this.pendingTokens.RemoveFirst();
return firstPendingToken; // add the queued token to the token stream
}
@@ -78,11 +76,11 @@ public override void Reset()
this.Init();
base.Reset();
}

private void Init()
{
this.indentLengthStack = new Stack<int>();
this.pendingTokens = new LinkedList<IToken>();
this.indentLengthStack = new();
this.pendingTokens = new();
this.previousPendingTokenType = 0;
this.lastPendingTokenTypeFromDefaultChannel = 0;
this.opened = 0;
@@ -180,7 +178,7 @@ private void InsertLeadingIndentToken()
{
if (this.previousPendingTokenType == PythonLexer.WS)
{
var prevToken = this.pendingTokens.Last.Value;
var prevToken = this.pendingTokens.Last!.Value;
if (this.GetIndentationLength(prevToken.Text) != 0) // there is an "indentation" before the first statement
{
const string errMsg = "first statement indented";
@@ -302,7 +300,7 @@ private void HideAndAddPendingToken(IToken tkn)
this.AddPendingToken(ctkn);
}

private void CreateAndAddPendingToken(int ttype, int channel, string text, IToken sampleToken)
private void CreateAndAddPendingToken(int ttype, int channel, string? text, IToken sampleToken)
{
CommonToken ctkn = new CommonToken(sampleToken);
ctkn.Type = ttype;
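
As a reading aid for the indentation handling that PythonLexerBase implements (the indentLengthStack and the pendingTokens queue above), here is a minimal Python 2 input annotated with the tokens the lexer base is expected to synthesize. This is a hypothetical sketch of the usual INDENT/DEDENT behaviour, not output captured from this pull request.

x = True
if x:        # ... COLON NEWLINE
    print 1  # an INDENT token is queued in pendingTokens before this line's first default-channel token
    print 2  # same indentation length as the top of indentLengthStack: no extra token
print 3      # the indentation shrinks, so a DEDENT token is queued before this line's first token
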
5 changes: 0 additions & 5 deletions python/python2_7_18/Python3/README.md

This file was deleted.

29 changes: 0 additions & 29 deletions python/python2_7_18/Python3/transformGrammar.py

This file was deleted.

115 changes: 62 additions & 53 deletions python/python2_7_18/PythonLexer.g4
@@ -28,46 +28,17 @@ THE SOFTWARE.
*/

lexer grammar PythonLexer;

options { superClass=PythonLexerBase; }
tokens { INDENT, DEDENT } // https://docs.python.org/2.7/reference/lexical_analysis.html#indentation

tokens {
INDENT, DEDENT // https://docs.python.org/2.7/reference/lexical_analysis.html#indentation
}

/*
* lexer rules // https://docs.python.org/2.7/library/tokenize.html
*/

// https://docs.python.org/2.7/reference/lexical_analysis.html#keywords
AND : 'and';
AS : 'as';
ASSERT : 'assert';
BREAK : 'break';
CLASS : 'class';
CONTINUE : 'continue';
DEF : 'def';
DEL : 'del';
ELIF : 'elif';
ELSE : 'else';
EXCEPT : 'except';
EXEC : 'exec';
FINALLY : 'finally';
FOR : 'for';
FROM : 'from';
GLOBAL : 'global';
IF : 'if';
IMPORT : 'import';
IN : 'in';
IS : 'is';
LAMBDA : 'lambda';
NOT : 'not';
OR : 'or';
PASS : 'pass';
PRINT : 'print';
RAISE : 'raise';
RETURN : 'return';
TRY : 'try';
WHILE : 'while';
WITH : 'with';
YIELD : 'yield';

// https://docs.python.org/2.7/library/token.html#token.OP
LPAR : '('; // OPEN_PAREN
LSQB : '['; // OPEN_BRACK
@@ -115,6 +86,38 @@ DOUBLESLASH : '//';
DOUBLESLASHEQUAL : '//=';
AT : '@';

// https://docs.python.org/2.7/reference/lexical_analysis.html#keywords
AND : 'and';
AS : 'as';
ASSERT : 'assert';
BREAK : 'break';
CLASS : 'class';
CONTINUE : 'continue';
DEF : 'def';
DEL : 'del';
ELIF : 'elif';
ELSE : 'else';
EXCEPT : 'except';
EXEC : 'exec';
FINALLY : 'finally';
FOR : 'for';
FROM : 'from';
GLOBAL : 'global';
IF : 'if';
IMPORT : 'import';
IN : 'in';
IS : 'is';
LAMBDA : 'lambda';
NOT : 'not';
OR : 'or';
PASS : 'pass';
PRINT : 'print';
RAISE : 'raise';
RETURN : 'return';
TRY : 'try';
WHILE : 'while';
WITH : 'with';
YIELD : 'yield';

// https://docs.python.org/2.7/reference/lexical_analysis.html#identifiers
NAME : IDENTIFIER;
@@ -134,15 +137,16 @@ STRING : STRING_LITERAL;
NEWLINE : '\r'? '\n'; // Unix, Windows

// https://docs.python.org/2.7/reference/lexical_analysis.html#comments
COMMENT : '#' ~[\r\n]* -> channel(HIDDEN);
COMMENT : '#' ~[\r\n]* -> channel(HIDDEN);

// https://docs.python.org/2.7/reference/lexical_analysis.html#whitespace-between-tokens
WS : [ \t\f]+ -> channel(HIDDEN);
WS : [ \t\f]+ -> channel(HIDDEN);

// https://docs.python.org/2.7/reference/lexical_analysis.html#explicit-line-joining
EXPLICIT_LINE_JOINING : '\\' NEWLINE -> channel(HIDDEN);
EXPLICIT_LINE_JOINING : BACKSLASH_NEWLINE -> channel(HIDDEN);

ERRORTOKEN : . ; // catch unrecognized characters and redirect these errors to the parser
// catch the unrecognized character(s)
ERRORTOKEN : . ; // the PythonLexerBase class will report an error for this token (the ERRORTOKEN will also cause an error in the parser)


/*
@@ -153,30 +157,35 @@

// https://docs.python.org/2.7/reference/lexical_analysis.html#string-literals
fragment STRING_LITERAL : STRING_PREFIX? (SHORT_STRING | LONG_STRING);
fragment STRING_PREFIX : 'r' | 'u' | 'ur' | 'R' | 'U' | 'UR' | 'Ur' | 'uR' | 'b' | 'B' | 'br' | 'Br' | 'bR' | 'BR';

// 'r' | 'u' | 'ur' | 'R' | 'U' | 'UR' | 'Ur' | 'uR' | 'b' | 'B' | 'br' | 'Br' | 'bR' | 'BR';
fragment STRING_PREFIX options { caseInsensitive=true; } : 'r' | 'u' | 'ur' | 'b' | 'br';

fragment SHORT_STRING
: '\'' SHORT_STRING_ITEM_FOR_SINGLE_QUOTE* '\''
| '"' SHORT_STRING_ITEM_FOR_DOUBLE_QUOTE* '"'
;
: ['] SHORT_STRING_ITEM_FOR_SINGLE_QUOTE* [']
| ["] SHORT_STRING_ITEM_FOR_DOUBLE_QUOTE* ["]
;

fragment LONG_STRING
: '\'\'\'' LONG_STRING_ITEM*? '\'\'\''
| '"""' LONG_STRING_ITEM*? '"""'
;
: ['][']['] LONG__STRING_ITEM*? ['][']['] // non-greedy
| ["]["]["] LONG__STRING_ITEM*? ["]["]["] // non-greedy
;

fragment SHORT_STRING_ITEM_FOR_SINGLE_QUOTE : SHORT_STRING_CHAR_NO_SINGLE_QUOTE | ESCAPE_SEQ;
fragment SHORT_STRING_ITEM_FOR_DOUBLE_QUOTE : SHORT_STRING_CHAR_NO_DOUBLE_QUOTE | ESCAPE_SEQ;

fragment LONG_STRING_ITEM : LONG_STRING_CHAR | ESCAPE_SEQ;
fragment LONG__STRING_ITEM : LONG_STRING_CHAR | ESCAPE_SEQ;

fragment SHORT_STRING_CHAR_NO_SINGLE_QUOTE : ~[\\\r\n']; // <any source character except "\" or newline or single quote>
fragment SHORT_STRING_CHAR_NO_DOUBLE_QUOTE : ~[\\\r\n"]; // <any source character except "\" or newline or double quote>
fragment LONG_STRING_CHAR : ~'\\'; // <any source character except "\">
fragment ESCAPE_SEQ // https://docs.python.org/2.7/reference/lexical_analysis.html#string-literals
: '\\' '\r' '\n' // for the two-character Windows line break: \<newline> escape sequence (string literal line continuation)
| '\\' [\u0000-\u007F] // "\" <any ASCII character>
;
fragment SHORT_STRING_CHAR_NO_SINGLE_QUOTE : ~[\\\r\n']; // <any source character except "\" or newline or single quote>
fragment SHORT_STRING_CHAR_NO_DOUBLE_QUOTE : ~[\\\r\n"]; // <any source character except "\" or newline or double quote>
fragment LONG_STRING_CHAR : ~'\\'; // <any source character except "\">

// https://docs.python.org/2.7/reference/lexical_analysis.html#string-literals
fragment ESCAPE_SEQ : ESCAPE_SEQ_NEWLINE | '\\' [\u0000-\u007F]; // "\" <any ASCII character>

fragment ESCAPE_SEQ_NEWLINE : BACKSLASH_NEWLINE; // a kind of line continuation inside string literals (Python ignores the backslash and the newline when evaluating the literal)

fragment BACKSLASH_NEWLINE : '\\' NEWLINE;

// https://docs.python.org/2.7/reference/lexical_analysis.html#integer-and-long-integer-literals
fragment LONG_INTEGER : INTEGER ('l' | 'L');
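
To illustrate the EXPLICIT_LINE_JOINING rule above with a concrete input (a hypothetical Python 2 snippet, not taken from this diff): a backslash immediately followed by a newline outside any string literal joins two physical lines into one logical line, and the grammar hides that pair from the parser.

# Hypothetical example; the annotations describe the expected lexing,
# not output captured from this change.
total = 1 + \
        2      # the backslash+newline above is matched by EXPLICIT_LINE_JOINING
               # and sent to the HIDDEN channel, so the parser sees one logical line
print total    # prints 3
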
5 changes: 4 additions & 1 deletion python/python2_7_18/PythonParser.g4
@@ -26,8 +26,11 @@ THE SOFTWARE.
* Developed by : Robert Einhorn
*/

parser grammar PythonParser; // https://docs.python.org/2.7/reference/grammar.html
// https://docs.python.org/2.7/reference/grammar.html
parser grammar PythonParser;

options { tokenVocab=PythonLexer; }

// ANTLR4 grammar for Python

// Start symbols for the grammar:
11 changes: 6 additions & 5 deletions python/python2_7_18/README.md
@@ -1,13 +1,14 @@
# Python 2.7.18 parser

### About files:
- PythonParser.g4
- PythonParser.g4
is the ANTLR4 parser grammar that is based on the last official [Python 2 grammar](https://docs.python.org/2.7/reference/grammar.html)

- PythonLexerBase
handles the Python indentations

- Example files: [Python 2.7.18 Standard Lib](https://www.python.org/downloads/release/python-2718/)
- PythonLexerBase:
- handles the Python indentations
- and manages many other things

- Example files from: [Python 2.7.18 Standard Lib](https://www.python.org/downloads/release/python-2718/)

### Related link:
[ANTLR4-parser-for-Python-2.7.18](https://github.com/RobEin/ANTLR4-parser-for-Python-2.7.18)
3 changes: 3 additions & 0 deletions python/python2_7_18/changes.md
@@ -0,0 +1,3 @@
# Sept. 05, 2024
- Line continuation for string literals (backslash followed by a newline) is no longer resolved.
(backslash+newline is no longer removed from string literals)
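
A hypothetical sketch of what this change means for the STRING token text (the example and the described before/after behaviour are illustrative, based on the note above, not captured lexer output):

# For this source (note the backslash at the end of the first line):
s = "first \
second"
# Before this change the lexer stripped the backslash+newline, so the STRING
# token text was "first second"; after it, the backslash and the newline are
# kept verbatim in the token text. CPython still resolves the escape when
# evaluating the literal, so at runtime s == "first second" in both cases.
print s
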
4 changes: 0 additions & 4 deletions python/python2_7_18/changes.txt

This file was deleted.

Empty file.
10 changes: 0 additions & 10 deletions python/python2_7_18/tests/test_error_first_statement_indented.py

This file was deleted.

10 changes: 0 additions & 10 deletions python/python2_7_18/tests/test_error_inconsistent_dedent.py

This file was deleted.

8 changes: 0 additions & 8 deletions python/python2_7_18/tests/test_error_not_indented.py

This file was deleted.

This file was deleted.

9 changes: 0 additions & 9 deletions python/python2_7_18/tests/test_error_unexpected_indent.py

This file was deleted.

9 changes: 0 additions & 9 deletions python/python2_7_18/tests/test_explicit_line_joining.py

This file was deleted.

6 changes: 0 additions & 6 deletions python/python2_7_18/tests/test_formfeed_as_separator.py

This file was deleted.

6 changes: 0 additions & 6 deletions python/python2_7_18/tests/test_formfeed_at_start_of_line.py

This file was deleted.
