1 /** |
|
2 * WordFilter.js |
|
3 * |
|
4 * Copyright, Moxiecode Systems AB |
|
5 * Released under LGPL License. |
|
6 * |
|
7 * License: http://www.tinymce.com/license |
|
8 * Contributing: http://www.tinymce.com/contributing |
|
9 */ |
|
10 |
|
/**
 * This class parses Microsoft Word HTML into proper TinyMCE markup.
 *
 * @class tinymce.pasteplugin.WordFilter
 * @private
 */
|
17 define("tinymce/pasteplugin/WordFilter", [ |
|
18 "tinymce/util/Tools", |
|
19 "tinymce/html/DomParser", |
|
20 "tinymce/html/Schema", |
|
21 "tinymce/html/Serializer", |
|
22 "tinymce/html/Node", |
|
23 "tinymce/pasteplugin/Utils" |
|
24 ], function(Tools, DomParser, Schema, Serializer, Node, Utils) { |
|
/**
 * Tells whether the specified content looks like it was copied from MS Word,
 * Office 365 or Google Docs by searching it for markers of those sources.
 *
 * @param {String} content HTML content to inspect.
 * @return {Boolean} true if one of the known markers is found, otherwise false.
 */
function isWordContent(content) {
    // Word specific font tags, Mso classes, mso- style properties or the w:WordDocument element
    var msoMarkers = /<font face="Times New Roman"|class="?Mso|style="[^"]*\bmso-|style='[^'']*\bmso-|w:WordDocument/i;

    // Class name starting with OutlineElement (matched case sensitively)
    var outlineElementMarker = /class="OutlineElement/;

    // Id starting with docs-internal-guid-
    var docsInternalGuidMarker = /id="?docs\-internal\-guid\-/;

    if (msoMarkers.test(content)) {
        return true;
    }

    if (outlineElementMarker.test(content)) {
        return true;
    }

    return docsInternalGuidMarker.test(content);
}
|
35 |
|
/**
 * Checks if the specified text starts with "1. " or "a. " etc.
 *
 * Leading spaces and non-breaking spaces are ignored. The marker has to be
 * followed by a space or a non-breaking space to count.
 *
 * @param {String} text Text to check, for example the text contents of a paragraph.
 * @return {Boolean} true if the text starts with an ordered list marker, otherwise false.
 */
function isNumericList(text) {
    var i, patterns;

    patterns = [
        /^[IVXLMCD]{1,2}\.[ \u00a0]/, // Roman upper case
        /^[ivxlmcd]{1,2}\.[ \u00a0]/, // Roman lower case
        /^[a-z]{1,2}[\.\)][ \u00a0]/, // Alphabetical a-z
        /^[A-Z]{1,2}[\.\)][ \u00a0]/, // Alphabetical A-Z
        /^[0-9]+\.[ \u00a0]/, // Numeric lists
        /^[\u3007\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d]+\.[ \u00a0]/, // Japanese
        /^[\u58f1\u5f10\u53c2\u56db\u4f0d\u516d\u4e03\u516b\u4e5d\u62fe]+\.[ \u00a0]/ // Chinese
    ];

    // Indentation in front of the marker is not part of it
    text = text.replace(/^[\u00a0 ]+/, '');

    for (i = 0; i < patterns.length; i++) {
        if (patterns[i].test(text)) {
            return true;
        }
    }

    // Always answer with a real boolean, just like isBulletList does
    return false;
}
|
63 |
|
/**
 * Checks if the specified text starts with a bullet character, optionally
 * preceded by white space.
 *
 * @param {String} text Text to check.
 * @return {Boolean} true if the text starts with a bullet, otherwise false.
 */
function isBulletList(text) {
    var bulletStart = /^[\s\u00a0]*[\u2022\u00b7\u00a7\u25CF]\s*/;

    return bulletStart.test(text);
}
|
67 |
|
/**
 * Registers the Word paste filter on the specified editor.
 *
 * A BeforePastePreProcess handler is attached to the editor. When the pasted HTML is
 * detected as Word content (see isWordContent) the handler marks the event with
 * e.wordContent, strips Word specific markup, optionally converts fake lists into real
 * OL/UL lists and writes the cleaned up HTML back to e.content.
 *
 * Settings read from editor.settings: paste_retain_style_properties,
 * paste_enable_default_filters, paste_word_valid_elements and paste_convert_word_fake_lists.
 *
 * @constructor
 * @param {Object} editor Editor instance; its settings, dom and on members are used.
 */
function WordFilter(editor) {
    var settings = editor.settings;

    editor.on('BeforePastePreProcess', function(e) {
        var content = e.content, retainStyleProperties, validStyles;

        // Remove google docs internal guid markers
        content = content.replace(/<b[^>]+id="?docs-internal-[^>]*>/gi, '');
        content = content.replace(/<br class="?Apple-interchange-newline"?>/gi, '');

        // Build a lookup of the style properties the user wants to keep
        retainStyleProperties = settings.paste_retain_style_properties;
        if (retainStyleProperties) {
            validStyles = Tools.makeMap(retainStyleProperties.split(/[, ]/));
        }

        /**
         * Converts fake bullet and numbered lists to real semantic OL/UL.
         *
         * @param {tinymce.html.Node} node Root node to convert children of.
         */
        function convertFakeListsToProperLists(node) {
            var currentListNode, prevListNode, lastLevel = 1;
            var elements = [], child, element, nodeText, matches, start, i;

            /**
             * Returns the concatenated values of all text nodes (type 3) found in the node.
             */
            function getText(node) {
                var txt = '';

                if (node.type === 3) {
                    return node.value;
                }

                if ((node = node.firstChild)) {
                    do {
                        txt += getText(node);
                    } while ((node = node.next));
                }

                return txt;
            }

            /**
             * Removes the first match of regExp from the first text node whose value matches.
             *
             * @return {Boolean} false once a text node has been trimmed (stops the search),
             * true if nothing matched in this subtree.
             */
            function trimListStart(node, regExp) {
                if (node.type === 3) {
                    if (regExp.test(node.value)) {
                        node.value = node.value.replace(regExp, '');
                        return false;
                    }
                }

                if ((node = node.firstChild)) {
                    do {
                        if (!trimListStart(node, regExp)) {
                            return false;
                        }
                    } while ((node = node.next));
                }

                return true;
            }

            /**
             * Removes all nodes in the subtree that filterStyles flagged with _listIgnore.
             */
            function removeIgnoredNodes(node) {
                if (node._listIgnore) {
                    node.remove();
                    return;
                }

                if ((node = node.firstChild)) {
                    do {
                        removeIgnoredNodes(node);
                    } while ((node = node.next));
                }
            }

            /**
             * Turns the paragraph into a LI and places it in a list of the specified type.
             *
             * @param {tinymce.html.Node} paragraphNode Paragraph to convert.
             * @param {String} listName Name of the list element, "ol" or "ul".
             * @param {Number} start Optional start number, only output when greater than 1.
             */
            function convertParagraphToLi(paragraphNode, listName, start) {
                var level = paragraphNode._listLevel || lastLevel;

                // Handle list nesting
                if (level !== lastLevel) {
                    if (level < lastLevel) {
                        // Move to parent list
                        if (currentListNode) {
                            currentListNode = currentListNode.parent.parent;
                        }
                    } else {
                        // Create new list
                        prevListNode = currentListNode;
                        currentListNode = null;
                    }
                }

                if (!currentListNode || currentListNode.name !== listName) {
                    prevListNode = prevListNode || currentListNode;
                    currentListNode = new Node(listName, 1);

                    if (start > 1) {
                        currentListNode.attr('start', '' + start);
                    }

                    paragraphNode.wrap(currentListNode);
                } else {
                    currentListNode.append(paragraphNode);
                }

                paragraphNode.name = 'li';

                // Append list to previous list if it exists
                if (level > lastLevel && prevListNode) {
                    prevListNode.lastChild.append(currentListNode);
                }

                lastLevel = level;

                // Remove start of list item "1. " or "· " etc
                removeIgnoredNodes(paragraphNode);
                trimListStart(paragraphNode, /^\u00a0+/);
                trimListStart(paragraphNode, /^\s*([\u2022\u00b7\u00a7\u25CF]|\w+\.)/);
                trimListStart(paragraphNode, /^\u00a0+/);
            }

            // Build a list of all root level elements before we start altering them in
            // the loop below. The direct children are reached through the sibling chain,
            // so nothing has to be dereferenced once the end of the chain is reached.
            for (child = node.firstChild; child; child = child.next) {
                elements.push(child);
            }

            for (i = 0; i < elements.length; i++) {
                element = elements[i];

                if (element.name === 'p' && element.firstChild) {
                    // Get the complete text contents of the paragraph
                    nodeText = getText(element);

                    // Detect unordered lists look for bullets
                    if (isBulletList(nodeText)) {
                        convertParagraphToLi(element, 'ul');
                        continue;
                    }

                    // Detect ordered lists 1., a. or ixv.
                    if (isNumericList(nodeText)) {
                        // Parse OL start number from the leading marker only, a number
                        // followed by a dot later in the text is not a start number
                        matches = /^[\u00a0 ]*([0-9]+)\./.exec(nodeText);
                        start = 1;
                        if (matches) {
                            start = parseInt(matches[1], 10);
                        }

                        convertParagraphToLi(element, 'ol', start);
                        continue;
                    }

                    // Convert paragraphs marked as lists but doesn't look like anything
                    if (element._listLevel) {
                        convertParagraphToLi(element, 'ul', 1);
                        continue;
                    }

                    currentListNode = null;
                } else {
                    // If the root level element isn't a p tag which can be
                    // processed by convertParagraphToLi, it interrupts the
                    // lists, causing a new list to start instead of having
                    // elements from the next list inserted above this tag.
                    prevListNode = currentListNode;
                    currentListNode = null;
                }
            }
        }

        /**
         * Filters the style attribute value of a node. MS specific properties are converted
         * to their W3C counterparts or dropped, list information is stored on the node as
         * _listLevel/_listIgnore, comment nodes are removed and bold/italic styles are
         * converted into b/i wrapper elements.
         *
         * @param {tinymce.html.Node} node Node that has the style attribute.
         * @param {String} styleValue Raw value of the style attribute.
         * @return {String} Serialized styles to keep or null if nothing is left.
         */
        function filterStyles(node, styleValue) {
            var outputStyles = {}, matches, nodeRemoved = false, styles = editor.dom.parseStyle(styleValue);

            Tools.each(styles, function(value, name) {
                // Convert various MS styles to W3C styles
                switch (name) {
                    case 'mso-list':
                        // Parse out list indent level for lists. Only the mso-list value is
                        // inspected since other properties like "margin:0in 0in 10pt" would
                        // match the pattern as well and produce a bogus level.
                        matches = /\w+ \w+([0-9]+)/i.exec(value);
                        if (matches) {
                            node._listLevel = parseInt(matches[1], 10);
                        }

                        // Remove these nodes <span style="mso-list:Ignore">o</span>
                        // Since the span gets removed we mark the text node and the span
                        if (/Ignore/i.test(value) && node.firstChild) {
                            node._listIgnore = true;
                            node.firstChild._listIgnore = true;
                        }

                        break;

                    case "horiz-align":
                        name = "text-align";
                        break;

                    case "vert-align":
                        name = "vertical-align";
                        break;

                    case "font-color":
                    case "mso-foreground":
                        name = "color";
                        break;

                    case "mso-background":
                    case "mso-highlight":
                        name = "background";
                        break;

                    case "font-weight":
                    case "font-style":
                        // Kept regardless of the retain setting, converted to b/i below
                        if (value !== "normal") {
                            outputStyles[name] = value;
                        }
                        return;

                    case "mso-element":
                        // Remove track changes code
                        if (/^(comment|comment-list)$/i.test(value)) {
                            node.remove();
                            nodeRemoved = true;
                            return false; // Stop iterating, the node is gone
                        }

                        break;
                }

                if (name.indexOf('mso-comment') === 0) {
                    node.remove();
                    nodeRemoved = true;
                    return false; // Stop iterating, the node is gone
                }

                // Never allow mso- prefixed names
                if (name.indexOf('mso-') === 0) {
                    return;
                }

                // Output only valid styles
                if (retainStyleProperties === "all" || (validStyles && validStyles[name])) {
                    outputStyles[name] = value;
                }
            });

            // A removed node must not be wrapped in b/i elements and has no styles to keep
            if (nodeRemoved) {
                return null;
            }

            // Convert bold style to "b" element
            if (/(bold)/i.test(outputStyles["font-weight"])) {
                delete outputStyles["font-weight"];
                node.wrap(new Node("b", 1));
            }

            // Convert italic style to "i" element
            if (/(italic)/i.test(outputStyles["font-style"])) {
                delete outputStyles["font-style"];
                node.wrap(new Node("i", 1));
            }

            // Serialize the styles and see if there is something left to keep
            outputStyles = editor.dom.serializeStyle(outputStyles, node.name);
            if (outputStyles) {
                return outputStyles;
            }

            return null;
        }

        if (settings.paste_enable_default_filters === false) {
            return;
        }

        // Detect if the content is Word junk HTML
        if (isWordContent(e.content)) {
            e.wordContent = true; // Mark it for other processors

            // Remove basic Word junk
            content = Utils.filter(content, [
                // Word comments like conditional comments etc
                /<!--[\s\S]+?-->/gi,

                // Remove comments, scripts (e.g., msoShowComment), XML tag, VML content,
                // MS Office namespaced tags, and a few other tags
                /<(!|script[^>]*>.*?<\/script(?=[>\s])|\/?(\?xml(:\w+)?|img|meta|link|style|\w:\w+)(?=[\s\/>]))[^>]*>/gi,

                // Convert <s> into <strike> for line-through
                [/<(\/?)s>/gi, "<$1strike>"],

                // Replace nbsp entities to char since it's easier to handle
                [/&nbsp;/gi, "\u00a0"],

                // Convert <span style="mso-spacerun:yes">___</span> to a string of alternating
                // breaking/non-breaking spaces. Every character of the run is first turned
                // into a plain space, then the second half of the run is joined with
                // non-breaking spaces.
                [/<span\s+style\s*=\s*"\s*mso-spacerun\s*:\s*yes\s*;?\s*"\s*>([\s\u00a0]*)<\/span>/gi,
                    function(str, spaces) {
                        return (spaces.length > 0) ?
                            spaces.replace(/[\s\S]/g, " ").slice(Math.floor(spaces.length / 2)).split("").join("\u00a0") : "";
                    }
                ]
            ]);

            var validElements = settings.paste_word_valid_elements;
            if (!validElements) {
                validElements = (
                    '-strong/b,-em/i,-u,-span,-p,-ol,-ul,-li,-h1,-h2,-h3,-h4,-h5,-h6,' +
                    '-p/div,-a[href|name],sub,sup,strike,br,del,table[width],tr,' +
                    'td[colspan|rowspan|width],th[colspan|rowspan|width],thead,tfoot,tbody'
                );
            }

            // Setup strict schema
            var schema = new Schema({
                valid_elements: validElements,
                valid_children: '-li[p]'
            });

            // Add style/class attribute to all element rules since the user might have removed them from
            // paste_word_valid_elements config option and we need to check them for properties
            Tools.each(schema.elements, function(rule) {
                /*eslint dot-notation:0*/
                if (!rule.attributes["class"]) {
                    rule.attributes["class"] = {};
                    rule.attributesOrder.push("class");
                }

                if (!rule.attributes.style) {
                    rule.attributes.style = {};
                    rule.attributesOrder.push("style");
                }
            });

            // Parse HTML into DOM structure
            var domParser = new DomParser({}, schema);

            // Filter styles to remove "mso" specific styles and convert some of them
            domParser.addAttributeFilter('style', function(nodes) {
                var i = nodes.length, node;

                while (i--) {
                    node = nodes[i];
                    node.attr('style', filterStyles(node, node.attr('style')));

                    // Remove pointless spans
                    if (node.name === 'span' && node.parent && !node.attributes.length) {
                        node.unwrap();
                    }
                }
            });

            // Check the class attribute for comments or del items and remove those
            domParser.addAttributeFilter('class', function(nodes) {
                var i = nodes.length, node, className;

                while (i--) {
                    node = nodes[i];

                    className = node.attr('class');
                    if (/^(MsoCommentReference|MsoCommentText|msoDel)$/i.test(className)) {
                        // The node is gone, there is no attribute left to clean up
                        node.remove();
                        continue;
                    }

                    node.attr('class', null);
                }
            });

            // Remove all del elements since we don't want the track changes code in the editor
            domParser.addNodeFilter('del', function(nodes) {
                var i = nodes.length;

                while (i--) {
                    nodes[i].remove();
                }
            });

            // Keep some of the links and anchors
            domParser.addNodeFilter('a', function(nodes) {
                var i = nodes.length, node, href, name;

                while (i--) {
                    node = nodes[i];
                    href = node.attr('href');
                    name = node.attr('name');

                    // Links to Word comments are dropped completely
                    if (href && href.indexOf('#_msocom_') !== -1) {
                        node.remove();
                        continue;
                    }

                    // Reduce local file links to their fragment, if they have one
                    if (href && href.indexOf('file://') === 0) {
                        href = href.split('#')[1];
                        if (href) {
                            href = '#' + href;
                        }
                    }

                    if (!href && !name) {
                        node.unwrap();
                    } else {
                        // Remove all named anchors that aren't specific to TOC, Footnotes or Endnotes
                        if (name && !/^_?(?:toc|edn|ftn)/i.test(name)) {
                            node.unwrap();
                            continue;
                        }

                        node.attr({
                            href: href,
                            name: name
                        });
                    }
                }
            });

            // Parse into DOM structure
            var rootNode = domParser.parse(content);

            // Process DOM
            if (settings.paste_convert_word_fake_lists !== false) {
                convertFakeListsToProperLists(rootNode);
            }

            // Serialize DOM back to HTML
            e.content = new Serializer({}, schema).serialize(rootNode);
        }
    });
}
|
494 |
|
495 WordFilter.isWordContent = isWordContent; |
|
496 |
|
497 return WordFilter; |
|
498 }); |
|