src/pyams_skin/resources/js/ext/tinymce/dev/plugins/paste/classes/WordFilter.js
changeset 566 a1707c607eec
parent 565 318533413200
child 567 bca1726b1d85
equal deleted inserted replaced
565:318533413200 566:a1707c607eec
     1 /**
       
     2  * WordFilter.js
       
     3  *
       
     4  * Copyright, Moxiecode Systems AB
       
     5  * Released under LGPL License.
       
     6  *
       
     7  * License: http://www.tinymce.com/license
       
     8  * Contributing: http://www.tinymce.com/contributing
       
     9  */
       
    10 
       
    11 /**
       
    12  * This class parses word HTML into proper TinyMCE markup.
       
    13  *
       
    14  * @class tinymce.pasteplugin.WordFilter
       
    15  * @private
       
    16  */
       
    17 define("tinymce/pasteplugin/WordFilter", [
       
    18 	"tinymce/util/Tools",
       
    19 	"tinymce/html/DomParser",
       
    20 	"tinymce/html/Schema",
       
    21 	"tinymce/html/Serializer",
       
    22 	"tinymce/html/Node",
       
    23 	"tinymce/pasteplugin/Utils"
       
    24 ], function(Tools, DomParser, Schema, Serializer, Node, Utils) {
       
    25 	/**
       
    26 	 * Checks if the specified content is from any of the following sources: MS Word/Office 365/Google docs.
       
    27 	 */
       
    28 	function isWordContent(content) {
       
    29 		return (
       
    30 			(/<font face="Times New Roman"|class="?Mso|style="[^"]*\bmso-|style='[^'']*\bmso-|w:WordDocument/i).test(content) ||
       
    31 			(/class="OutlineElement/).test(content) ||
       
    32 			(/id="?docs\-internal\-guid\-/.test(content))
       
    33 		);
       
    34 	}
       
    35 
       
    36 	/**
       
    37 	 * Checks if the specified text starts with "1. " or "a. " etc.
       
    38 	 */
       
    39 	function isNumericList(text) {
       
    40 		var found, patterns;
       
    41 
       
    42 		patterns = [
       
    43 			/^[IVXLMCD]{1,2}\.[ \u00a0]/,  // Roman upper case
       
    44 			/^[ivxlmcd]{1,2}\.[ \u00a0]/,  // Roman lower case
       
    45 			/^[a-z]{1,2}[\.\)][ \u00a0]/,  // Alphabetical a-z
       
    46 			/^[A-Z]{1,2}[\.\)][ \u00a0]/,  // Alphabetical A-Z
       
    47 			/^[0-9]+\.[ \u00a0]/,          // Numeric lists
       
    48 			/^[\u3007\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d]+\.[ \u00a0]/, // Japanese
       
    49 			/^[\u58f1\u5f10\u53c2\u56db\u4f0d\u516d\u4e03\u516b\u4e5d\u62fe]+\.[ \u00a0]/  // Chinese
       
    50 		];
       
    51 
       
    52 		text = text.replace(/^[\u00a0 ]+/, '');
       
    53 
       
    54 		Tools.each(patterns, function(pattern) {
       
    55 			if (pattern.test(text)) {
       
    56 				found = true;
       
    57 				return false;
       
    58 			}
       
    59 		});
       
    60 
       
    61 		return found;
       
    62 	}
       
    63 
       
    64 	function isBulletList(text) {
       
    65 		return /^[\s\u00a0]*[\u2022\u00b7\u00a7\u25CF]\s*/.test(text);
       
    66 	}
       
    67 
       
	/**
	 * Registers a BeforePastePreProcess handler on the specified editor. The handler
	 * rewrites e.content when the pasted markup is detected as Word content by
	 * isWordContent and the default paste filters haven't been disabled.
	 *
	 * @param {Object} editor Editor instance to bind the handler to; its settings
	 * and dom properties are read by the handler.
	 */
	function WordFilter(editor) {
		var settings = editor.settings;

		editor.on('BeforePastePreProcess', function(e) {
			// Work on a local copy; e.content is only written back for Word content
			var content = e.content, retainStyleProperties, validStyles;

			// Remove google docs internal guid markers
			content = content.replace(/<b[^>]+id="?docs-internal-[^>]*>/gi, '');
			content = content.replace(/<br class="?Apple-interchange-newline"?>/gi, '');

			// Build a lookup map of style names to retain (comma or space separated setting)
			retainStyleProperties = settings.paste_retain_style_properties;
			if (retainStyleProperties) {
				validStyles = Tools.makeMap(retainStyleProperties.split(/[, ]/));
			}
       
    82 
       
    83 			/**
       
    84 			 * Converts fake bullet and numbered lists to real semantic OL/UL.
       
    85 			 *
       
    86 			 * @param {tinymce.html.Node} node Root node to convert children of.
       
    87 			 */
       
    88 			function convertFakeListsToProperLists(node) {
       
    89 				var currentListNode, prevListNode, lastLevel = 1;
       
    90 
       
    91 				function getText(node) {
       
    92 					var txt = '';
       
    93 
       
    94 					if (node.type === 3) {
       
    95 						return node.value;
       
    96 					}
       
    97 
       
    98 					if ((node = node.firstChild)) {
       
    99 						do {
       
   100 							txt += getText(node);
       
   101 						} while ((node = node.next));
       
   102 					}
       
   103 
       
   104 					return txt;
       
   105 				}
       
   106 
       
   107 				function trimListStart(node, regExp) {
       
   108 					if (node.type === 3) {
       
   109 						if (regExp.test(node.value)) {
       
   110 							node.value = node.value.replace(regExp, '');
       
   111 							return false;
       
   112 						}
       
   113 					}
       
   114 
       
   115 					if ((node = node.firstChild)) {
       
   116 						do {
       
   117 							if (!trimListStart(node, regExp)) {
       
   118 								return false;
       
   119 							}
       
   120 						} while ((node = node.next));
       
   121 					}
       
   122 
       
   123 					return true;
       
   124 				}
       
   125 
       
   126 				function removeIgnoredNodes(node) {
       
   127 					if (node._listIgnore) {
       
   128 						node.remove();
       
   129 						return;
       
   130 					}
       
   131 
       
   132 					if ((node = node.firstChild)) {
       
   133 						do {
       
   134 							removeIgnoredNodes(node);
       
   135 						} while ((node = node.next));
       
   136 					}
       
   137 				}
       
   138 
       
   139 				function convertParagraphToLi(paragraphNode, listName, start) {
       
   140 					var level = paragraphNode._listLevel || lastLevel;
       
   141 
       
   142 					// Handle list nesting
       
   143 					if (level != lastLevel) {
       
   144 						if (level < lastLevel) {
       
   145 							// Move to parent list
       
   146 							if (currentListNode) {
       
   147 								currentListNode = currentListNode.parent.parent;
       
   148 							}
       
   149 						} else {
       
   150 							// Create new list
       
   151 							prevListNode = currentListNode;
       
   152 							currentListNode = null;
       
   153 						}
       
   154 					}
       
   155 
       
   156 					if (!currentListNode || currentListNode.name != listName) {
       
   157 						prevListNode = prevListNode || currentListNode;
       
   158 						currentListNode = new Node(listName, 1);
       
   159 
       
   160 						if (start > 1) {
       
   161 							currentListNode.attr('start', '' + start);
       
   162 						}
       
   163 
       
   164 						paragraphNode.wrap(currentListNode);
       
   165 					} else {
       
   166 						currentListNode.append(paragraphNode);
       
   167 					}
       
   168 
       
   169 					paragraphNode.name = 'li';
       
   170 
       
   171 					// Append list to previous list if it exists
       
   172 					if (level > lastLevel && prevListNode) {
       
   173 						prevListNode.lastChild.append(currentListNode);
       
   174 					}
       
   175 
       
   176 					lastLevel = level;
       
   177 
       
   178 					// Remove start of list item "1. " or "&middot; " etc
       
   179 					removeIgnoredNodes(paragraphNode);
       
   180 					trimListStart(paragraphNode, /^\u00a0+/);
       
   181 					trimListStart(paragraphNode, /^\s*([\u2022\u00b7\u00a7\u25CF]|\w+\.)/);
       
   182 					trimListStart(paragraphNode, /^\u00a0+/);
       
   183 				}
       
   184 
       
   185 				// Build a list of all root level elements before we start
       
   186 				// altering them in the loop below.
       
   187 				var elements = [], child = node.firstChild;
       
   188 				while (typeof child !== 'undefined' && child !== null) {
       
   189 					elements.push(child);
       
   190 
       
   191 					child = child.walk();
       
   192 					if (child !== null) {
       
   193 						while (typeof child !== 'undefined' && child.parent !== node) {
       
   194 							child = child.walk();
       
   195 						}
       
   196 					}
       
   197 				}
       
   198 
       
   199 				for (var i = 0; i < elements.length; i++) {
       
   200 					node = elements[i];
       
   201 
       
   202 					if (node.name == 'p' && node.firstChild) {
       
   203 						// Find first text node in paragraph
       
   204 						var nodeText = getText(node);
       
   205 
       
   206 						// Detect unordered lists look for bullets
       
   207 						if (isBulletList(nodeText)) {
       
   208 							convertParagraphToLi(node, 'ul');
       
   209 							continue;
       
   210 						}
       
   211 
       
   212 						// Detect ordered lists 1., a. or ixv.
       
   213 						if (isNumericList(nodeText)) {
       
   214 							// Parse OL start number
       
   215 							var matches = /([0-9]+)\./.exec(nodeText);
       
   216 							var start = 1;
       
   217 							if (matches) {
       
   218 								start = parseInt(matches[1], 10);
       
   219 							}
       
   220 
       
   221 							convertParagraphToLi(node, 'ol', start);
       
   222 							continue;
       
   223 						}
       
   224 
       
   225 						// Convert paragraphs marked as lists but doesn't look like anything
       
   226 						if (node._listLevel) {
       
   227 							convertParagraphToLi(node, 'ul', 1);
       
   228 							continue;
       
   229 						}
       
   230 
       
   231 						currentListNode = null;
       
   232 					} else {
       
   233 						// If the root level element isn't a p tag which can be
       
   234 						// processed by convertParagraphToLi, it interrupts the
       
   235 						// lists, causing a new list to start instead of having
       
   236 						// elements from the next list inserted above this tag.
       
   237 						prevListNode = currentListNode;
       
   238 						currentListNode = null;
       
   239 					}
       
   240 				}
       
   241 			}
       
   242 
       
			/**
			 * Filters the style attribute value of a node. Some MS specific style names are
			 * converted to W3C style names, mso- prefixed styles are dropped and the remaining
			 * styles are only kept if they are allowed by the paste_retain_style_properties
			 * setting. Non "normal" font-weight/font-style values are always picked up and
			 * bold/italic are converted into b/i wrapper elements.
			 *
			 * Side effects on the node: _listLevel/_listIgnore marks may be set for mso-list
			 * styles, the node is removed for comment related styles and it may get wrapped
			 * in b/i elements.
			 *
			 * @param {tinymce.html.Node} node Node that the style attribute belongs to.
			 * @param {String} styleValue Style attribute value to filter.
			 * @return {String} Serialized styles to keep or null if there is nothing left.
			 */
			function filterStyles(node, styleValue) {
				var outputStyles = {}, matches, styles = editor.dom.parseStyle(styleValue);

				Tools.each(styles, function(value, name) {
					// Convert various MS styles to W3C styles
					switch (name) {
						case 'mso-list':
							// Parse out list indent level for lists. Only the mso-list value
							// is inspected: matching the whole style attribute could pick up
							// digits from unrelated declarations such as "margin:0in 10pt".
							matches = /\w+ \w+([0-9]+)/i.exec(value);
							if (matches) {
								node._listLevel = parseInt(matches[1], 10);
							}

							// Remove these nodes <span style="mso-list:Ignore">o</span>
							// Since the span gets removed we mark the text node and the span
							if (/Ignore/i.test(value) && node.firstChild) {
								node._listIgnore = true;
								node.firstChild._listIgnore = true;
							}

							break;

						case "horiz-align":
							name = "text-align";
							break;

						case "vert-align":
							name = "vertical-align";
							break;

						case "font-color":
						case "mso-foreground":
							name = "color";
							break;

						case "mso-background":
						case "mso-highlight":
							name = "background";
							break;

						case "font-weight":
						case "font-style":
							// Kept regardless of the retain setting so that bold/italic can
							// be converted into elements below
							if (value != "normal") {
								outputStyles[name] = value;
							}
							return;

						case "mso-element":
							// Remove track changes code
							if (/^(comment|comment-list)$/i.test(value)) {
								node.remove();
								return;
							}

							break;
					}

					if (name.indexOf('mso-comment') === 0) {
						node.remove();
						return;
					}

					// Never allow mso- prefixed names
					if (name.indexOf('mso-') === 0) {
						return;
					}

					// Output only valid styles
					if (retainStyleProperties == "all" || (validStyles && validStyles[name])) {
						outputStyles[name] = value;
					}
				});

				// Convert bold style to "b" element
				if (/(bold)/i.test(outputStyles["font-weight"])) {
					delete outputStyles["font-weight"];
					node.wrap(new Node("b", 1));
				}

				// Convert italic style to "i" element
				if (/(italic)/i.test(outputStyles["font-style"])) {
					delete outputStyles["font-style"];
					node.wrap(new Node("i", 1));
				}

				// Serialize the styles and see if there is something left to keep
				outputStyles = editor.dom.serializeStyle(outputStyles, node.name);
				if (outputStyles) {
					return outputStyles;
				}

				return null;
			}
       
   336 
       
   337 			if (settings.paste_enable_default_filters === false) {
       
   338 				return;
       
   339 			}
       
   340 
       
   341 			// Detect is the contents is Word junk HTML
       
   342 			if (isWordContent(e.content)) {
       
   343 				e.wordContent = true; // Mark it for other processors
       
   344 
       
   345 				// Remove basic Word junk
       
   346 				content = Utils.filter(content, [
       
   347 					// Word comments like conditional comments etc
       
   348 					/<!--[\s\S]+?-->/gi,
       
   349 
       
   350 					// Remove comments, scripts (e.g., msoShowComment), XML tag, VML content,
       
   351 					// MS Office namespaced tags, and a few other tags
       
   352 					/<(!|script[^>]*>.*?<\/script(?=[>\s])|\/?(\?xml(:\w+)?|img|meta|link|style|\w:\w+)(?=[\s\/>]))[^>]*>/gi,
       
   353 
       
   354 					// Convert <s> into <strike> for line-though
       
   355 					[/<(\/?)s>/gi, "<$1strike>"],
       
   356 
       
   357 					// Replace nsbp entites to char since it's easier to handle
       
   358 					[/&nbsp;/gi, "\u00a0"],
       
   359 
       
   360 					// Convert <span style="mso-spacerun:yes">___</span> to string of alternating
       
   361 					// breaking/non-breaking spaces of same length
       
   362 					[/<span\s+style\s*=\s*"\s*mso-spacerun\s*:\s*yes\s*;?\s*"\s*>([\s\u00a0]*)<\/span>/gi,
       
   363 						function(str, spaces) {
       
   364 							return (spaces.length > 0) ?
       
   365 								spaces.replace(/./, " ").slice(Math.floor(spaces.length / 2)).split("").join("\u00a0") : "";
       
   366 						}
       
   367 					]
       
   368 				]);
       
   369 
       
   370 				var validElements = settings.paste_word_valid_elements;
       
   371 				if (!validElements) {
       
   372 					validElements = (
       
   373 						'-strong/b,-em/i,-u,-span,-p,-ol,-ul,-li,-h1,-h2,-h3,-h4,-h5,-h6,' +
       
   374 						'-p/div,-a[href|name],sub,sup,strike,br,del,table[width],tr,' +
       
   375 						'td[colspan|rowspan|width],th[colspan|rowspan|width],thead,tfoot,tbody'
       
   376 					);
       
   377 				}
       
   378 
       
   379 				// Setup strict schema
       
   380 				var schema = new Schema({
       
   381 					valid_elements: validElements,
       
   382 					valid_children: '-li[p]'
       
   383 				});
       
   384 
       
   385 				// Add style/class attribute to all element rules since the user might have removed them from
       
   386 				// paste_word_valid_elements config option and we need to check them for properties
       
   387 				Tools.each(schema.elements, function(rule) {
       
   388 					/*eslint dot-notation:0*/
       
   389 					if (!rule.attributes["class"]) {
       
   390 						rule.attributes["class"] = {};
       
   391 						rule.attributesOrder.push("class");
       
   392 					}
       
   393 
       
   394 					if (!rule.attributes.style) {
       
   395 						rule.attributes.style = {};
       
   396 						rule.attributesOrder.push("style");
       
   397 					}
       
   398 				});
       
   399 
       
   400 				// Parse HTML into DOM structure
       
   401 				var domParser = new DomParser({}, schema);
       
   402 
       
   403 				// Filter styles to remove "mso" specific styles and convert some of them
       
   404 				domParser.addAttributeFilter('style', function(nodes) {
       
   405 					var i = nodes.length, node;
       
   406 
       
   407 					while (i--) {
       
   408 						node = nodes[i];
       
   409 						node.attr('style', filterStyles(node, node.attr('style')));
       
   410 
       
   411 						// Remove pointess spans
       
   412 						if (node.name == 'span' && node.parent && !node.attributes.length) {
       
   413 							node.unwrap();
       
   414 						}
       
   415 					}
       
   416 				});
       
   417 
       
   418 				// Check the class attribute for comments or del items and remove those
       
   419 				domParser.addAttributeFilter('class', function(nodes) {
       
   420 					var i = nodes.length, node, className;
       
   421 
       
   422 					while (i--) {
       
   423 						node = nodes[i];
       
   424 
       
   425 						className = node.attr('class');
       
   426 						if (/^(MsoCommentReference|MsoCommentText|msoDel)$/i.test(className)) {
       
   427 							node.remove();
       
   428 						}
       
   429 
       
   430 						node.attr('class', null);
       
   431 					}
       
   432 				});
       
   433 
       
   434 				// Remove all del elements since we don't want the track changes code in the editor
       
   435 				domParser.addNodeFilter('del', function(nodes) {
       
   436 					var i = nodes.length;
       
   437 
       
   438 					while (i--) {
       
   439 						nodes[i].remove();
       
   440 					}
       
   441 				});
       
   442 
       
   443 				// Keep some of the links and anchors
       
   444 				domParser.addNodeFilter('a', function(nodes) {
       
   445 					var i = nodes.length, node, href, name;
       
   446 
       
   447 					while (i--) {
       
   448 						node = nodes[i];
       
   449 						href = node.attr('href');
       
   450 						name = node.attr('name');
       
   451 
       
   452 						if (href && href.indexOf('#_msocom_') != -1) {
       
   453 							node.remove();
       
   454 							continue;
       
   455 						}
       
   456 
       
   457 						if (href && href.indexOf('file://') === 0) {
       
   458 							href = href.split('#')[1];
       
   459 							if (href) {
       
   460 								href = '#' + href;
       
   461 							}
       
   462 						}
       
   463 
       
   464 						if (!href && !name) {
       
   465 							node.unwrap();
       
   466 						} else {
       
   467 							// Remove all named anchors that aren't specific to TOC, Footnotes or Endnotes
       
   468 							if (name && !/^_?(?:toc|edn|ftn)/i.test(name)) {
       
   469 								node.unwrap();
       
   470 								continue;
       
   471 							}
       
   472 
       
   473 							node.attr({
       
   474 								href: href,
       
   475 								name: name
       
   476 							});
       
   477 						}
       
   478 					}
       
   479 				});
       
   480 
       
   481 				// Parse into DOM structure
       
   482 				var rootNode = domParser.parse(content);
       
   483 
       
   484 				// Process DOM
       
   485 				if (settings.paste_convert_word_fake_lists !== false) {
       
   486 					convertFakeListsToProperLists(rootNode);
       
   487 				}
       
   488 
       
   489 				// Serialize DOM back to HTML
       
   490 				e.content = new Serializer({}, schema).serialize(rootNode);
       
   491 			}
       
   492 		});
       
   493 	}
       
   494 
       
   495 	WordFilter.isWordContent = isWordContent;
       
   496 
       
   497 	return WordFilter;
       
   498 });