Coverage for C:\Repos\leo-editor\leo\plugins\importers\javascript.py: 79%
265 statements
« prev ^ index » next coverage.py v6.4, created at 2022-05-24 10:21 -0500
1#@+leo-ver=5-thin
2#@+node:ekr.20140723122936.18144: * @file ../plugins/importers/javascript.py
3"""The @auto importer for JavaScript."""
4import re
5import textwrap
6import unittest
7from typing import List
8from leo.core import leoGlobals as g
9from leo.plugins.importers import linescanner
10Importer = linescanner.Importer
11Target = linescanner.Target
12#@+others
13#@+node:ekr.20140723122936.18049: ** class JS_Importer
class JS_Importer(Importer):
    """The importer for the javascript language."""

    def __init__(self, importCommands, force_at_others=False, **kwargs):
        """The ctor for the JS_ImportController class."""
        # Init the base class.
        super().__init__(
            importCommands,
            gen_refs=False,  # Fix #639.
            language='javascript',
            state_class=JS_ScanState,
        )

    #@+others
    #@+node:ekr.20180123051226.1: *3* js_i.post_pass & helpers
    def post_pass(self, parent):
        """
        Optional Stage 2 of the javascript pipeline.

        All substages **must** use the API for setting body text. Changing
        p.b directly will cause asserts to fail later in i.finish().
        """
        self.clean_all_headlines(parent)
        self.remove_singleton_at_others(parent)
        self.clean_all_nodes(parent)
        self.move_trailing_comments(parent)
    #@+node:ekr.20180123051401.1: *4* js_i.remove_singleton_at_others
    # Matches a line consisting only of an @others directive.
    at_others = re.compile(r'^\s*@others\b')

    def remove_singleton_at_others(self, parent):
        """
        Replace @others by the body of a singleton child node.

        Return True if any replacement was made.
        """
        found = False
        for p in parent.subtree():
            if p.numberOfChildren() == 1:
                child = p.firstChild()
                lines = self.get_lines(p)
                matches = [i for i, s in enumerate(lines) if self.at_others.match(s)]
                # Replace only when exactly one @others line exists.
                if len(matches) == 1:
                    found = True
                    i = matches[0]
                    child_lines = self.get_lines(child)
                    lines = lines[:i] + child_lines + lines[i + 1 :]
                    self.set_lines(p, lines)
                    # Delete child later. Is this enough???
                    self.set_lines(child, [])
        return found
    #@+node:ekr.20180123060307.1: *4* js_i.remove_organizer_nodes
    def remove_organizer_nodes(self, parent):
        """Removed all organizer nodes created by i.delete_all_empty_nodes."""
        # Careful: Restart this loop whenever we find an organizer,
        # because promote/doDelete invalidate the subtree iteration.
        found = True
        while found:
            found = False
            for p in parent.subtree():
                lines = self.get_lines(p)
                if p.h.lower() == 'organizer' and not lines:
                    p.promote()
                    p.doDelete()
                    found = True  # Restart the loop.
    #@+node:ekr.20200202071105.1: *4* js_i.clean_all_nodes
    def clean_all_nodes(self, parent):
        """Remove common leading whitespace from all nodes."""
        for p in parent.subtree():
            lines = self.get_lines(p)
            s = textwrap.dedent(''.join(lines))
            self.set_lines(p, g.splitLines(s))
    #@+node:ekr.20200202091613.1: *4* js_i.move_trailing_comments & helper (new)
    def move_trailing_comments(self, parent):
        """Move all trailing comments to the start of the next node."""
        for p in parent.subtree():
            next = p.next()
            if next:
                lines = self.get_lines(p)
                head_lines, tail_lines = self.get_trailing_comments(lines)
                if tail_lines:
                    self.set_lines(p, head_lines)
                    next_lines = self.get_lines(next)
                    self.set_lines(next, tail_lines + next_lines)
    #@+node:ekr.20200202092332.1: *5* js_i.get_trailing_comments
    def get_trailing_comments(self, lines):
        """
        Return the trailing comments of p.
        Return (head_lines, tail_lines).
        """
        s = ''.join(lines)
        head: List[str] = []
        tail: List[str] = []
        # Bug fix: was `if not s.strip:` -- the bound method is always
        # truthy, so the early return for blank bodies never fired.
        if not s.strip():
            return head, tail
        in_block_comment = False
        head = lines
        for i, line in enumerate(lines):
            s = line.strip()
            if in_block_comment:
                tail.append(line)
                if s.startswith('*/'):
                    in_block_comment = False
            elif s.startswith('/*'):
                in_block_comment = True
                head = lines[:i]
                tail = [line]
            elif s.startswith('//'):
                head = lines[:i]
                tail = [line]
            elif s:  # Clear any previous comments.
                head = lines
                tail = []
        return head, tail
    #@+node:ekr.20161105140842.5: *3* js_i.scan_line (rewritten)
    def scan_line(self, s, prev_state):
        """
        Update the scan state at the *end* of the line.
        Return JS_ScanState({'context':context, 'curlies':curlies, 'parens':parens})

        This code uses JsLex to scan the tokens, which scans strings and regexs properly.

        This code also handles *partial* tokens: tokens continued from the
        previous line or continued to the next line.
        """
        context = prev_state.context
        curlies, parens = prev_state.curlies, prev_state.parens
        # Scan tokens, updating context and counts.
        prev_val = None
        for kind, val in JsLexer().lex(s):
            # g.trace(f"context: {context:2} kind: {kind:10} val: {val!r}")
            if context:
                # Inside a string or block comment: look only for the closer.
                if context in ('"', "'") and kind in ('other', 'punct') and val == context:
                    context = ''
                elif (
                    context == '/*'
                    and kind in ('other', 'punct')
                    and prev_val == '*'
                    and val == '/'
                ):
                    context = ''
            elif kind in ('other', 'punct') and val in ('"', "'"):
                context = val
            elif kind in ('other', 'punct') and val == '*' and prev_val == '/':
                context = '/*'
            elif kind in ('other', 'punct'):
                # Note: the '/*' opener was consumed by the elif just above,
                # so only bracket bookkeeping remains here.
                if val == '{':
                    curlies += 1
                elif val == '}':
                    curlies -= 1
                elif val == '(':
                    parens += 1
                elif val == ')':
                    parens -= 1
            prev_val = val
        d = {'context': context, 'curlies': curlies, 'parens': parens}
        state = JS_ScanState(d)
        return state
    #@+node:ekr.20171224145755.1: *3* js_i.starts_block
    # Patterns that mark the start of a JS block: arrow functions,
    # classes, and the various `function` forms.
    func_patterns = [
        re.compile(r'.*?\)\s*=>\s*\{'),
        re.compile(r'\s*class\b'),
        re.compile(r'\s*function\b'),
        re.compile(r'.*?[(=,]\s*function\b'),
    ]

    def starts_block(self, i, lines, new_state, prev_state):
        """True if the new state starts a block."""
        if new_state.level() <= prev_state.level():
            return False
        # Remove strings and regexs from the line before applying the patterns,
        # so that e.g. `a = "function"` does not look like a definition.
        cleaned_line = []
        for kind, val in JsLexer().lex(lines[i]):
            if kind not in ('string', 'regex'):
                cleaned_line.append(val)
        # Search for any of the patterns.
        line = ''.join(cleaned_line)
        for pattern in self.func_patterns:
            if pattern.match(line) is not None:
                return True
        return False
    #@+node:ekr.20200131193217.1: *3* js_i.ends_block
    def ends_block(self, line, new_state, prev_state, stack):
        """True if line ends the block."""
        # Comparing new_state against prev_state does not work for python.
        top = stack[-1]
        return new_state.level() < top.state.level()
    #@+node:ekr.20161101183354.1: *3* js_i.clean_headline
    clean_regex_list1 = [
        # (function name (
        re.compile(r'\s*\(?(function\b\s*[\w]*)\s*\('),
        # name: (function (
        re.compile(r'\s*(\w+\s*\:\s*\(*\s*function\s*\()'),
        # const|let|var name = .* =>
        re.compile(r'\s*(?:const|let|var)\s*(\w+\s*(?:=\s*.*)=>)'),
    ]
    clean_regex_list2 = [
        re.compile(r'(.*\=)(\s*function)'),  # .* = function
    ]
    clean_regex_list3 = [
        re.compile(r'(.*\=\s*new\s*\w+)\s*\(.*(=>)'),  # .* = new name .* =>
        re.compile(r'(.*)\=\s*\(.*(=>)'),  # .* = ( .* =>
        re.compile(r'(.*)\((\s*function)'),  # .* ( function
        re.compile(r'(.*)\(.*(=>)'),  # .* ( .* =>
        re.compile(r'(.*)(\(.*\,\s*function)'),  # .* \( .*, function
    ]
    clean_regex_list4 = [
        re.compile(r'(.*)\(\s*(=>)'),  # .* ( =>
    ]

    def clean_headline(self, s, p=None, trace=False):
        """Return a cleaned up headline s."""
        # pylint: disable=arguments-differ
        s = s.strip()
        # Don't clean a headline twice.
        if s.endswith('>>') and s.startswith('<<'):
            return s
        for ch in '{(=':
            if s.endswith(ch):
                s = s[:-1].strip()
        # First regex cleanup. Use \1.
        for pattern in self.clean_regex_list1:
            m = pattern.match(s)
            if m:
                s = m.group(1)
                break
        # Second regex cleanup. Use \1 + \2
        for pattern in self.clean_regex_list2:
            m = pattern.match(s)
            if m:
                s = m.group(1) + m.group(2)
                break
        # Third regex cleanup. Use \1 + ' ' + \2
        for pattern in self.clean_regex_list3:
            m = pattern.match(s)
            if m:
                s = m.group(1) + ' ' + m.group(2)
                break
        # Fourth cleanup. Use \1 + ' ' + \2 again
        for pattern in self.clean_regex_list4:
            m = pattern.match(s)
            if m:
                s = m.group(1) + ' ' + m.group(2)
                break
        # Final whitespace cleanups.
        # Bug fix: collapse double spaces. The previous code replaced a
        # single space with a single space, a no-op.
        s = s.replace('  ', ' ')
        s = s.replace(' (', '(')
        return g.truncate(s, 100)
    #@-others
258#@+node:ekr.20161105092745.1: ** class JS_ScanState
class JS_ScanState:
    """Holds the context and bracket depths of the javascript line-oriented scan."""

    def __init__(self, d=None):
        """Create a scan state, optionally seeded from the dict `d`."""
        if not d:
            # Fresh state: no context, zero bracket depth.
            self.context = ''
            self.curlies = self.parens = 0
        else:
            # Seed from the 'context'/'curlies'/'parens' keys of d.
            self.context = d.get('context')
            self.curlies = d.get('curlies')
            self.parens = d.get('parens')

    def __repr__(self):
        """Show the context and both bracket counts."""
        return 'JS_ScanState context: %r curlies: %s parens: %s' % (
            self.context, self.curlies, self.parens)

    __str__ = __repr__

    #@+others
    #@+node:ekr.20161119115505.1: *3* js_state.level
    def level(self):
        """Return the nesting depths as a (curlies, parens) tuple."""
        return self.curlies, self.parens
    #@+node:ekr.20161119051049.1: *3* js_state.update
    def update(self, data):
        """
        Update the state using the 6-tuple returned by i.scan_line.
        Return i = data[1]
        """
        context, i, delta_c, delta_p, _delta_s, _bs_nl = data
        self.context = context
        self.curlies += delta_c
        self.parens += delta_p
        return i
    #@-others
301#@+node:ekr.20200131110322.2: ** JsLexer...
302# JsLex: a lexer for Javascript
303# Written by Ned Batchelder. Used by permission.
304#
305# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
306# For details: https://bitbucket.org/ned/jslex/src/default/NOTICE.txt
307#@+node:ekr.20200131110322.4: *3* class Tok
class Tok:
    """A specification for one token class used by a Lexer state table."""

    # Class-level counter: every Tok instance gets a unique id,
    # later used to build the named regex group for that token.
    num = 0

    def __init__(self, name, regex, next=None):
        """Record the token's name, its pattern, and an optional follow-state."""
        self.id, Tok.num = Tok.num, Tok.num + 1
        self.name = name
        self.regex = regex
        self.next = next
319#@+node:ekr.20200131110322.7: *3* class Lexer
class Lexer:
    """A generic multi-state regex-based lexer."""

    #@+others
    #@+node:ekr.20200131110322.8: *4* Lexer.__init__
    def __init__(self, states, first):
        """
        Build one compiled regex per lexer state.

        `states` maps a state name to a list of Tok objects; `first` is the
        name of the initial state.
        """
        # Map: state name -> compiled alternation of that state's token patterns.
        self.regexes = {}
        # Map: named-group id ("t<n>") -> the Tok that produced it.
        self.toks = {}
        for state, rules in states.items():
            parts = []
            for tok in rules:
                # Each token becomes a named group so the winning
                # alternative can be recovered via match.lastgroup.
                groupid = "t%d" % tok.id
                self.toks[groupid] = tok
                parts.append("(?P<%s>%s)" % (groupid, tok.regex))
            self.regexes[state] = re.compile("|".join(parts), re.MULTILINE | re.VERBOSE)  # |re.UNICODE)
        self.state = first
    #@+node:ekr.20200131110322.9: *4* Lexer.lex
    def lex(self, text):
        """Lexically analyze `text`.

        Yields pairs (`name`, `tokentext`).
        """
        end = len(text)
        state = self.state
        regexes = self.regexes
        toks = self.toks
        start = 0
        # NOTE: `start` advances by token length, which assumes each match
        # begins exactly where the previous one ended (tokens are contiguous).
        while start < end:
            for match in regexes[state].finditer(text, start):
                # g.trace(state, start, text, match)
                # g.printObj(regexes[state])
                name = match.lastgroup
                tok = toks[name]
                toktext = match.group(name)
                start += len(toktext)
                yield(tok.name, toktext)
                if tok.next:
                    # Switch states (e.g. 'div' <-> 'reg') and restart the
                    # scan, because the active regex has changed.
                    state = tok.next
                    break
        # Remember the final state so a later lex() call resumes correctly.
        self.state = state
    #@-others
362#@+node:ekr.20200131110322.6: *3* function: literals
def literals(choices, prefix="", suffix=""):
    """
    Build a regex alternation from the space-separated literals in `choices`.

    Each literal is regex-escaped, then wrapped with `prefix` and `suffix`
    individually before being joined with '|'.
    """
    alternatives = []
    for choice in choices.split():
        alternatives.append(prefix + re.escape(choice) + suffix)
    return "|".join(alternatives)
373#@+node:ekr.20200131110322.10: *3* class JsLexer(Lexer)
class JsLexer(Lexer):
    """A Javascript lexer

    >>> lexer = JsLexer()
    >>> list(lexer.lex("a = 1"))
    [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]

    This doesn't properly handle non-Ascii characters in the Javascript source.
    """
    #@+<< constants >>
    #@+node:ekr.20200131190707.1: *4* << constants >> (JsLexer)

    # Because these tokens are matched as alternatives in a regex, longer possibilities
    # must appear in the list before shorter ones, for example, '>>' before '>'.
    #
    # Note that we don't have to detect malformed Javascript, only properly lex
    # correct Javascript, so much of this is simplified.

    # Details of Javascript lexical structure are taken from
    # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf

    # A useful explanation of automatic semicolon insertion is at
    # http://inimino.org/~inimino/blog/javascript_semicolons

    # See https://stackoverflow.com/questions/6314614/match-any-unicode-letter

    # Tokens shared by both the 'div' and 'reg' lexer states.
    # A token's `next` argument names the state to switch to after it matches.
    both_before = [
        Tok("comment", r"/\*(.|\n)*?\*/"),
        Tok("linecomment", r"//.*?$"),
        Tok("ws", r"\s+"),
        Tok("keyword", literals("""
            async await
            break case catch class const continue debugger
            default delete do else enum export extends
            finally for function if import in instanceof new
            return super switch this throw try typeof var
            void while with
            """, suffix=r"\b"), next='reg'),
        Tok("reserved", literals("null true false", suffix=r"\b"), next='div'),
        #
        # EKR: This would work if patterns were compiled with the re.UNICODE flag.
        # However, \w is not the same as valid JS characters.
        # In any case, the JS importer doesn't need to handle id's carefully.
        #
        # Tok("id", r"""([\w$])([\w\d]*)""", next='div'),
        #
        # NOTE(review): in the first-char class below, `[0-9a-fA-Z]` looks like
        # a typo for `[0-9a-fA-F]`, and the class contains a literal space --
        # confirm against upstream JsLex before changing.
        Tok("id", r"""
            ([a-zA-Z_$ ]|\\u[0-9a-fA-Z]{4})     # first char
            ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})*  # rest chars
            """, next='div'),
        Tok("hnum", r"0[xX][0-9a-fA-F]+", next='div'),
        Tok("onum", r"0[0-7]+"),
        Tok("dnum", r"""
            (   (0|[1-9][0-9]*)         # DecimalIntegerLiteral
                \.                      # dot
                [0-9]*                  # DecimalDigits-opt
                ([eE][-+]?[0-9]+)?      # ExponentPart-opt
            |
                \.                      # dot
                [0-9]+                  # DecimalDigits
                ([eE][-+]?[0-9]+)?      # ExponentPart-opt
            |
                (0|[1-9][0-9]*)         # DecimalIntegerLiteral
                ([eE][-+]?[0-9]+)?      # ExponentPart-opt
            )
            """, next='div'),
        Tok("punct", literals("""
            >>>= === !== >>> <<= >>= <= >= == != << >> &&
            || += -= *= %= &= |= ^=
            """), next="reg"),
        Tok("punct", literals("++ -- ) ]"), next='div'),
        Tok("punct", literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
        Tok("string", r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
        Tok("string", r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
    ]

    # Fallback: matched last, consumes any single character.
    both_after = [
        Tok("other", r"."),
    ]

    # The two lexer states differ only in how a slash is interpreted.
    states = {
        'div':  # slash will mean division
            both_before + [
                Tok("punct", literals("/= /"), next='reg'),
            ] + both_after,
        'reg':  # slash will mean regex
            both_before + [
                Tok("regex",
                    r"""
                    /                       # opening slash
                    # First character is..
                    (   [^*\\/[]            # anything but * \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                        (   [^\]\\]         # anything but \ or ]
                        |   \\.             # or an escape sequence
                        )*                  # many times
                        \]
                    )
                    # Following characters are same, except for excluding a star
                    (   [^\\/[]             # anything but \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                        (   [^\]\\]         # anything but \ or ]
                        |   \\.             # or an escape sequence
                        )*                  # many times
                        \]
                    )*                      # many times
                    /                       # closing slash
                    [a-zA-Z0-9]*            # trailing flags
                    """, next='div'),
            ] + both_after,
    }
    #@-<< constants >>

    #@+others
    #@+node:ekr.20200131110322.11: *4* JsLexer.__init__
    def __init__(self):
        """Create a two-state lexer, starting in the 'reg' state."""
        super().__init__(self.states, 'reg')
    #@-others
497#@+node:ekr.20200131070055.1: ** class TestJSImporter (importers/javascript.py)
class TestJSImporter(unittest.TestCase):
    # Unit tests for the javascript importer's self-contained helpers.
    #@+others
    #@+node:ekr.20200202093420.1: *3* test_get_trailing_comments
    def test_get_trailing_comments(self):
        # Each entry: (source text, expected number of trailing comment lines).
        table = (
            # Test 1
            ("""\
head
// tail""", 1),

            # Test 2
            ("""\
head
/* comment 1
* comment 2
*/""", 3),

            # Test 3
            ("""\
head
/* comment 1
* comment 2
*/
tail""", 0),  # no tail

            # Test 4
            ("""\
head
// comment
tail""", 0),  # no tail

        )  # End table.
        for s, expected_length in table:
            x = JS_Importer(None)
            s = textwrap.dedent(s)
            lines = g.splitLines(s)
            head, tail = x.get_trailing_comments(lines)
            # The tail must be exactly the last `expected_length` lines.
            expected_lines = lines[-expected_length :] if expected_length else []
            assert tail == expected_lines, (repr(tail), repr(expected_lines))
    #@+node:ekr.20200202104932.1: *3* test_JsLex
    def test_JsLex(self):
        # Each entry: (expected kind, inputs that should lex to that kind only).
        table = (
            ('id', ('f_', '$', 'A1', 'abc')),
            ('other', ('ÁÁ',)),  # Unicode strings are not handled by JsLex.
            ('keyword', ('async', 'await', 'if')),
            ('punct', ('(', ')', '{', '}', ',', ':', ';')),
            # ('num', ('9', '2')),  # This test doesn't matter at present.
        )
        for kind, data in table:
            for contents in data:
                for name, tok in JsLexer().lex(contents):
                    assert name == kind, f"expected {kind!s} got {name!s} {tok!r} {contents}"
                    # print(f"{kind!s:10} {tok!r:10}")
    #@+node:ekr.20200203051839.1: *3* test_starts_block
    def test_starts_block(self):
        # Each entry: (expected truthiness of starts_block, input line).
        table = (
            (1, 'xx) => {}'),
            (1, 'class c1'),
            (1, 'function f1'),
            (1, 'xx(function f2'),
            (1, 'xx = function f3'),
            (1, 'xx, function f4'),
            (0, 'a = "function"'),  # string: must not look like a definition
            (0, 'a = /function/'),  # regex: must not look like a definition
        )
        for expected, line in table:
            x = JS_Importer(None)
            lines = [line]
            # starts_block requires new_state.level() > prev_state.level().
            new_state = JS_ScanState()
            new_state.curlies += 1
            prev_state = JS_ScanState()
            results = x.starts_block(0, lines, new_state, prev_state)
            # if expected != results: x.scan_line(line, prev_state
            assert expected == results, f"expected: {expected} got: {int(results)} {line!r}\n"
    #@+node:ekr.20200203060718.1: *3* test_scan_line
    def test_scan_line(self):
        # Each entry: ((curlies, parens, context) at end of line, prev context, line).
        table = (
            # result       prev_context   s
            ((0, 0, '"'),  "",            r'"string'),
            ((0, 0, '/*'), "",            r'/* line 1'),
            ((0, 0, '/*'), "/*",          r'line 2'),  # New.
            ((0, 0, ''),   "/*",          r'line 3 */'),  # New.
            ((0, 0, ''),   "",            r'a + b // /*'),
            ((0, 1, ''),   "",            r'(function'),
            ((1, 1, ''),   "",            r'(function(a) {'),
            ((0, 0, ''),   "",            r'var x = /abc/'),
            ((0, 0, ''),   "",            r'var x = /a"c/'),
            ((0, 0, ''),   "",            r'var x = /a\//'),
            ((0, 0, ''),   "",            r'var x = /a\//'),
            ((0, 1, ''),   "",            r'var x = (0,'),
        )
        for result, prev_context, s in table:
            importer = JS_Importer(None)
            prev_state = JS_ScanState()
            prev_state.context = prev_context
            new_state = importer.scan_line(s, prev_state)
            curlies, parens, context = result
            ok = (
                new_state.curlies == curlies and
                new_state.parens == parens and
                new_state.context == context)
            assert ok, (
                f"\n"
                f" expected: curlies: {curlies}, parens: {parens}, context: {context!r}\n"
                f"new_state: {new_state}\n"
                f"        s: {s!r}")
    #@-others
610#@-others
# Registration table read by Leo's importer machinery.
importer_dict = {
    # NOTE(review): do_import is defined in the Importer base class (not
    # visible here); presumably it returns the import callback -- confirm.
    'func': JS_Importer.do_import(),
    'extensions': ['.js',],  # File extensions handled by this importer.
}
if __name__ == '__main__':
    unittest.main()
#@@language python
#@@tabwidth -4
#@-leo