Coverage for C:\leo.repo\leo-editor\leo\plugins\importers\xml.py: 91%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#@+leo-ver=5-thin
2#@+node:ekr.20140723122936.18137: * @file ../plugins/importers/xml.py
3"""The @auto importer for the xml language."""
4import re
5from leo.core import leoGlobals as g
6from leo.plugins.importers import linescanner
7Importer = linescanner.Importer
8Target = linescanner.Target
9#@+others
10#@+node:ekr.20161121204146.3: ** class Xml_Importer
class Xml_Importer(Importer):
    """The importer for the xml language."""

    #@+others
    #@+node:ekr.20161122124109.1: *3* xml_i.__init__
    def __init__(self, importCommands, tags_setting='import_xml_tags', **kwargs):
        """
        Init the xml importer.

        importCommands: passed through to the base Importer class.
        tags_setting: the name of the setting that lists the tags that
                      start blocks (see add_tags).
        kwargs: accepted for call compatibility; not used by this method.
        """
        # Init the base class.
        super().__init__(
            importCommands,
            language='xml',
            state_class=Xml_ScanState,
            strict=False,
        )
        self.tags_setting = tags_setting
        self.start_tags = self.add_tags()
        # A closing tag decrements state.tag_level only if the top is an opening tag.
        self.stack = []  # Stack of tags.
        # Tags that end_tag pops from the stack when it sees a plain '>'.
        # scan_tag pushes tag names *without* the leading '<', so the xml
        # declaration appears on the stack as '?xml'. The legacy '<?xml'
        # entry can never equal a stack entry; it is kept only so that code
        # inspecting this list still finds it.
        self.void_tags = [
            '<?xml',
            '?xml',
            '!doctype',
        ]
        # True: a structure error has been detected.
        # NOTE(review): nothing in this class sets or reads this flag.
        self.tag_warning_given = False
    #@+node:ekr.20161121204918.1: *3* xml_i.add_tags
    def add_tags(self):
        """
        Return the list of block-starting tags, lowercased.

        The tags come from c.config.getData(self.tags_setting); a falsy
        result yields an empty list.
        """
        tags = self.c.config.getData(self.tags_setting) or []
        return [z.lower() for z in tags]
    #@+node:ekr.20170416082422.1: *3* xml_i.clean_headline
    def clean_headline(self, s, p=None):
        """
        xml and html: Return a cleaned up headline s.

        If s starts (after optional whitespace) with a complete '<...>' tag,
        return just that tag; otherwise return s stripped of whitespace.
        p is unused here.
        """
        m = re.match(r'\s*(<[^>]+>)', s)
        return m.group(1) if m else s.strip()
    #@+node:ekr.20161123003732.1: *3* xml_i.error
    def error(self, s):
        """Issue an error, but do *not* cause a unit test to fail."""
        g.es_print('\nin %s' % self.root.h)
        g.es_print(s)
        # Tell i.check to strip lws.
        self.ws_error = True
    #@+node:ekr.20161122073505.1: *3* xml_i.scan_line & helpers
    def scan_line(self, s, prev_state):
        """
        Update the xml scan state by scanning line s.

        Start from prev_state's context and tag_level and return a new
        Xml_ScanState describing the state at the end of s.
        """
        context, tag_level = prev_state.context, prev_state.tag_level
        i = 0
        while i < len(s):
            progress = i
            if context:
                context, i = self.scan_in_context(context, i, s)
            else:
                context, i, tag_level = self.scan_out_context(i, s, tag_level)
            # Every helper must consume at least one character.
            assert progress < i, (repr(s[i]), '***', repr(s))
        d = {'context': context, 'tag_level': tag_level}
        return Xml_ScanState(d)
    #@+node:ekr.20161122073937.1: *4* xml_i.scan_in_context
    def scan_in_context(self, context, i, s):
        """
        Scan s from i, within the given context.
        Return (context, i)

        The context is cleared when its terminator is found at i:
        '"' ends a string and '-->' ends a comment.
        """
        assert context in ('"', '<!--'), repr(context)
        # Only double-quoted strings are valid strings in xml/html.
        if context == '"' and self.match(s, i, '"'):
            context = ''
            i += 1
        elif context == '<!--' and self.match(s, i, '-->'):
            context = ''
            i += 3
        else:
            i += 1
        return context, i
    #@+node:ekr.20161122073938.1: *4* xml_i.scan_out_context & helpers
    def scan_out_context(self, i, s, tag_level):
        """
        Scan s from i, outside any context.
        Return (context, i, tag_level)
        """
        context = ''
        if self.match(s, i, '"'):
            context = '"'  # Only double-quoted strings are xml/html strings.
            i += 1
        elif self.match(s, i, '<!--'):
            # Must be tested before the plain '<' case below.
            context = '<!--'
            i += 4
        elif self.match(s, i, '<'):
            # xml/html tags do *not* start contexts.
            i, tag_level = self.scan_tag(s, i, tag_level)
        elif self.match(s, i, '/>'):
            i += 2
            tag_level = self.end_tag(s, tag='/>', tag_level=tag_level)
        elif self.match(s, i, '>'):
            tag_level = self.end_tag(s, tag='>', tag_level=tag_level)
            i += 1
        else:
            i += 1
        return context, i, tag_level
    #@+node:ekr.20161122084808.1: *5* xml_i.end_tag
    def end_tag(self, s, tag, tag_level):
        """
        Handle the ">" or "/>" that ends an element.

        Ignore ">" except for void tags.

        "/>" pops the top of the stack and decrements tag_level if the
        popped tag is a start tag. With an empty stack, "/>" only produces
        a warning. Return the possibly-updated tag_level.
        """
        if not self.stack:
            if tag == '/>':
                g.es_print("Warning: ignoring dubious /> in...")
                g.es_print(repr(s))
            return tag_level
        if tag == '/>':
            top = self.stack.pop()
            if top in self.start_tags:
                tag_level -= 1
        elif self.stack[-1] in self.void_tags:
            # A plain '>' completes a void tag: it has no separate closing tag.
            self.stack.pop()
        return tag_level
    #@+node:ekr.20161122080143.1: *5* xml_i.scan_tag & helper
    ch_pattern = re.compile(r'([\!\?]?[\w\_\.\:\-]+)', re.UNICODE)

    def scan_tag(self, s, i, tag_level):
        """
        Scan an xml tag starting with "<" or "</".

        Adjust the stack as appropriate:
        - "<" adds the tag to the stack.
        - "</" removes the top of the stack if it matches.

        Tag names are lowercased and do not include the leading "<".
        Return (i, tag_level), with i just past the tag name.
        """
        assert s[i] == '<', repr(s[i])
        end_tag = self.match(s, i, '</')
        # Scan the tag.
        i += (2 if end_tag else 1)
        m = self.ch_pattern.match(s, i)
        if m:
            tag = m.group(0).lower()
            i += len(m.group(0))
        else:
            # All other '<' characters should have had xml/html escapes applied to them.
            self.error('missing tag in position %s of %r' % (i, s))
            g.es_print(repr(s))
            # i is already past the '<', so the caller still makes progress.
            return i, tag_level
        if end_tag:
            self.pop_to_tag(tag, s)
            if tag in self.start_tags:
                tag_level -= 1
        else:
            self.stack.append(tag)
            if tag in self.start_tags:
                tag_level += 1
        return i, tag_level
    #@+node:ekr.20170416043508.1: *6* xml_i.pop_to_tag
    def pop_to_tag(self, tag, s):
        """
        Attempt to pop tag from the top of the stack.

        If the stack is empty, report an error.
        If the top doesn't match, attempt to recover by popping entries up
        to and including the topmost occurrence of tag. If tag is not on
        the stack at all, leave the stack unchanged.
        """
        if not self.stack:
            self.error('Empty tag stack: %s' % tag)
            g.es_print(repr(s))
            return
        if self.stack[-1] == tag:
            self.stack.pop()
            return
        # Mismatch: no warning is issued here; just attempt a recovery.
        if tag in self.stack:
            # The membership test guarantees this loop ends before the stack empties.
            while self.stack.pop() != tag:
                pass
    #@+node:ekr.20161121210839.1: *3* xml_i.starts_block
    def starts_block(self, i, lines, new_state, prev_state):
        """
        True if the line starts an xml block, that is, if scanning the line
        increased the tag level. i and lines are unused here.
        """
        return new_state.tag_level > prev_state.tag_level
    #@+node:ekr.20161121212858.1: *3* xml_i.is_ws_line
    # Warning: base Importer class defines ws_pattern.
    xml_ws_pattern = re.compile(r'\s*(<!--([^-]|-[^-])*-->\s*)*$')

    def is_ws_line(self, s):
        """True if s is nothing but whitespace or single-line comments."""
        return bool(self.xml_ws_pattern.match(s))
    #@+node:ekr.20161123005742.1: *3* xml_i.undent
    def undent(self, p):
        """
        Regularize lws before @others, but preserve lws for all other lines.
        This is needed to handle embedded brython code properly.

        Return the list of resulting lines for p. Whitespace-only lines
        become a bare newline.
        """
        result, w = [], self.tab_width
        # A negative tab width means "use abs(w) spaces"; otherwise use a tab.
        indent = ' ' * abs(w) if w < 0 else '\t'
        for s in self.get_lines(p):
            ls = '\n' if s.isspace() else s.lstrip()
            if ls.startswith('@others'):
                if p == self.root:
                    result.append(ls)
                else:
                    result.append(indent + ls)
            else:
                # Fix #479: Preserve brython indentation when importing .html files.
                result.append('\n' if s.isspace() else s)
        return result
    #@-others
216#@+node:ekr.20161121204146.7: ** class class Xml_ScanState
class Xml_ScanState:
    """
    The state of the line-oriented xml scan at the end of a line.

    context:   '' outside any context, otherwise the opening delimiter of
               the construct the scan is inside.
    tag_level: the current tag nesting level.
    """

    def __init__(self, d=None):
        """
        Init the state from dict d, using its 'context' and 'tag_level'
        entries. A missing or empty d gives the initial state ('', 0).
        """
        if not d:
            self.context, self.tag_level = '', 0
            return
        self.context = d.get('context')
        self.tag_level = d.get('tag_level')

    def __repr__(self):
        """Return a one-line description of the state."""
        fields = (self.context, self.tag_level)
        return "Xml_ScanState context: %r tag_level: %s" % fields

    __str__ = __repr__

    #@+others
    #@+node:ekr.20161121204146.8: *3* xml_state.level
    def level(self):
        """Return the tag level recorded in this state."""
        return self.tag_level
    #@-others
242#@-others
# Module-level description of this importer: the import function and the
# file extensions it handles.
# NOTE(review): do_import is not defined in this file; presumably it is
# inherited from linescanner.Importer and returns the callable -- confirm.
importer_dict = dict(
    func=Xml_Importer.do_import(),
    extensions=['.xml'],
)
247#@@language python
248#@@tabwidth -4
250#@-leo