1. Project Clover database Tue Dec 20 2016 21:24:09 CET
  2. Package org.htmlcleaner

File XWikiDOMSerializer.java

 

Coverage histogram

../../img/srcFileCovDistChart8.png
54% of files have more coverage

Code metrics

38
86
6
1
322
156
27
0.31
14.33
6
4.5

Classes

Class Line # Actions
XWikiDOMSerializer 57 86 0% 27 34
0.73846155 73.8%
 

Contributing tests

This file is covered by 109 tests.

Source view

1    /*
2    * See the NOTICE file distributed with this work for additional
3    * information regarding copyright ownership.
4    *
5    * This is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU Lesser General Public License as
7    * published by the Free Software Foundation; either version 2.1 of
8    * the License, or (at your option) any later version.
9    *
10    * This software is distributed in the hope that it will be useful,
11    * but WITHOUT ANY WARRANTY; without even the implied warranty of
12    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13    * Lesser General Public License for more details.
14    *
15    * You should have received a copy of the GNU Lesser General Public
16    * License along with this software; if not, write to the Free
17    * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
18    * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
19    */
20    package org.htmlcleaner;
21   
22    import java.util.Iterator;
23    import java.util.List;
24    import java.util.Map;
25    import java.util.regex.Matcher;
26    import java.util.regex.Pattern;
27   
28    import javax.xml.parsers.DocumentBuilder;
29    import javax.xml.parsers.ParserConfigurationException;
30   
31    import org.apache.commons.lang3.StringEscapeUtils;
32    import org.apache.commons.lang3.StringUtils;
33    import org.w3c.dom.Comment;
34    import org.w3c.dom.DOMImplementation;
35    import org.w3c.dom.Document;
36    import org.w3c.dom.DocumentType;
37    import org.w3c.dom.Element;
38   
39    /**
40    * Generate a W3C Document from a SF's HTML Cleaner TagNode.
41    *
42    * Some code has been copy-pasted from SF's HTML Cleaner code (which is under a BSD license, see
43    * http://htmlcleaner.sourceforge.net/license.php). Our goal is to remove this class completely if we can get SF's HTML
44    * Cleaner to support the CDATA-related use cases that force us to have this class.
45    *
46    * Remove when the following issues have been fixed:
47    * <ul>
48    * <li>https://sourceforge.net/p/htmlcleaner/bugs/169/</li>
49    * </ul>
50    *
51    * Note: Even though in a public package this code is not meant to be a public API. We've had to put it under the {@code
52    * org.htmlcleaner} package because of https://sourceforge.net/p/htmlcleaner/bugs/167/.
53    *
54    * @version $Id: 124eceb29fd098c392e9dcffa5c21bfef5cecb8e $
55    * @since 1.8.2
56    */
 
57    public class XWikiDOMSerializer
58    {
59    /**
60    * The Regex Pattern to recognize a CDATA block.
61    */
62    private static final Pattern CDATA_PATTERN = Pattern.compile("<!\\[CDATA\\[.*(\\]\\]>|<!\\[CDATA\\[)",
63    Pattern.DOTALL);
64   
65    private static final String CSS_COMMENT_START = "/*";
66   
67    private static final String CSS_COMMENT_END = "*/";
68   
69    private static final String JS_COMMENT = "//";
70   
71    private static final String NEW_LINE = "\n";
72   
73    private static final String SCRIPT_TAG_NAME = "script";
74   
75    private static final String STYLE_TAG_NAME = "style";
76   
77    private static final String HTML_TAG_NAME = "html";
78   
79    /**
80    * The HTML Cleaner properties set by the user to control the HTML cleaning.
81    */
82    private CleanerProperties props;
83   
84    /**
85    * Whether XML entities should be escaped or not.
86    */
87    private boolean escapeXml;
88   
89    /**
90    * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
91    * @param escapeXml if true then escape XML entities
92    */
 
93  4986 toggle public XWikiDOMSerializer(CleanerProperties props, boolean escapeXml)
94    {
95  4986 this.props = props;
96  4986 this.escapeXml = escapeXml;
97    }
98   
99    /**
100    * @param documentDocumentBuilder the {@link DocumentBuilder} instance to use, DocumentBuilder is not garantied to
101    * be thread safe so at most the safe instance should be used only in the same thread
102    * @param rootNode the HTML Cleaner root node to serialize
103    * @return the W3C Document object
104    * @throws ParserConfigurationException if there's an error during serialization
105    */
 
106  4986 toggle public Document createDOM(DocumentBuilder documentDocumentBuilder, TagNode rootNode)
107    throws ParserConfigurationException
108    {
109  4986 DOMImplementation impl = documentDocumentBuilder.getDOMImplementation();
110   
111    // Copied from the source code of HTML Cleaner.
112   
113  4986 Document document;
114   
115    //
116    // Where a DOCTYPE is supplied in the input, ensure that this is in the output DOM. See issue #27
117    //
118    // Note that we may want to fix incorrect DOCTYPEs in future; there are some fairly
119    // common patterns for errors with the older HTML4 doctypes.
120    //
121  4986 if (rootNode.getDocType() != null) {
122  4986 String qualifiedName = rootNode.getDocType().getPart1();
123  4986 String publicId = rootNode.getDocType().getPublicId();
124  4986 String systemId = rootNode.getDocType().getSystemId();
125   
126    //
127    // If there is no qualified name, set it to html. See bug #153.
128    //
129  4986 if (qualifiedName == null) {
130  0 qualifiedName = HTML_TAG_NAME;
131    }
132   
133  4986 DocumentType documentType = impl.createDocumentType(qualifiedName, publicId, systemId);
134   
135    //
136    // While the qualified name is "HTML" for some DocTypes, we want the actual document root name to be "html".
137    // See bug #116
138    //
139  4986 if (qualifiedName.equals("HTML")) {
140  0 qualifiedName = HTML_TAG_NAME;
141    }
142  4986 document = impl.createDocument(rootNode.getNamespaceURIOnPath(""), qualifiedName, documentType);
143    } else {
144  0 document = documentDocumentBuilder.newDocument();
145  0 Element rootElement = document.createElement(rootNode.getName());
146  0 document.appendChild(rootElement);
147    }
148   
149    //
150    // Copy across root node attributes - see issue 127. Thanks to rasifiel for the patch
151    //
152  4986 Map<String, String> attributes = rootNode.getAttributes();
153  4986 Iterator<Map.Entry<String, String>> entryIterator = attributes.entrySet().iterator();
154  4986 while (entryIterator.hasNext()) {
155  0 Map.Entry<String, String> entry = entryIterator.next();
156  0 String attrName = entry.getKey();
157  0 String attrValue = entry.getValue();
158  0 if (escapeXml) {
159  0 attrValue = Utils.escapeXml(attrValue, props, true);
160    }
161   
162  0 document.getDocumentElement().setAttribute(attrName, attrValue);
163   
164    //
165    // Flag the attribute as an ID attribute if appropriate. Thanks to Chris173
166    //
167  0 if (attrName.equalsIgnoreCase("id")) {
168  0 document.getDocumentElement().setIdAttribute(attrName, true);
169    }
170    }
171   
172  4986 createSubnodes(document, document.getDocumentElement(), rootNode.getAllChildren());
173   
174  4986 return document;
175    }
176   
177    /**
178    * Perform CDATA transformations if the user has specified to use CDATA inside scripts and style elements.
179    *
180    * @param document the W3C Document to use for creating new DOM elements
181    * @param element the W3C element to which we'll add the text content to
182    * @param bufferedContent the buffered text content on which we need to perform the CDATA transformations
183    * @param item the current HTML Cleaner node being processed
184    */
 
185  92443 toggle private void flushContent(Document document, Element element, StringBuffer bufferedContent, Object item)
186    {
187  92443 if (bufferedContent.length() > 0 && !(item instanceof ContentNode)) {
188    // Flush the buffered content
189  25499 boolean specialCase = this.props.isUseCdataForScriptAndStyle() && isScriptOrStyle(element);
190  25499 String content = bufferedContent.toString();
191   
192  25499 if (this.escapeXml && !specialCase) {
193  0 content = Utils.escapeXml(content, this.props, true);
194  25499 } else if (specialCase) {
195  194 content = processCDATABlocks(content);
196    }
197   
198    // Generate a javascript comment in front on the CDATA block so that it works in IE and when
199    // serving XHTML under a mimetype of HTML.
200  25499 if (specialCase) {
201  194 if (SCRIPT_TAG_NAME.equalsIgnoreCase(element.getNodeName())) {
202    // JS
203  188 element.appendChild(document.createTextNode(JS_COMMENT));
204  188 element.appendChild(document.createCDATASection(NEW_LINE + content + NEW_LINE + JS_COMMENT));
205    } else {
206    // CSS
207  6 element.appendChild(document.createTextNode(CSS_COMMENT_START));
208  6 element.appendChild(document.createCDATASection(CSS_COMMENT_END + StringUtils.chomp(content)
209    + NEW_LINE + CSS_COMMENT_START));
210  6 element.appendChild(document.createTextNode(CSS_COMMENT_END));
211    }
212    } else {
213  25305 element.appendChild(document.createTextNode(content));
214    }
215   
216  25499 bufferedContent.setLength(0);
217    }
218    }
219   
220    /**
221    * Remove any existing CDATA section and unencode HTML entities that are not inside a CDATA block.
222    *
223    * @param content the text input to transform
224    * @return the transformed content that will be wrapped inside a CDATA block
225    */
 
226  194 toggle private String processCDATABlocks(String content)
227    {
228  194 StringBuffer result = new StringBuffer();
229  194 Matcher matcher = CDATA_PATTERN.matcher(content);
230  194 int cursor = 0;
231  194 while (matcher.find()) {
232  0 result.append(StringEscapeUtils.unescapeHtml4(content.substring(cursor, matcher.start())));
233  0 result.append(content.substring(matcher.start() + 9, matcher.end() - matcher.group(1).length()));
234  0 cursor = matcher.end() - matcher.group(1).length() + 3;
235    }
236    // Copy the remaining text data in the result buffer
237  194 if (cursor < content.length()) {
238  194 result.append(StringEscapeUtils.unescapeHtml4(content.substring(cursor)));
239    }
240    // Ensure ther's no invalid <![CDATA[ or ]]> remaining.
241  194 String contentResult = result.toString().replace("<![CDATA[", "").replace("]]>", "");
242   
243  194 return contentResult;
244    }
245   
246    /**
247    * @param element the element to check
248    * @return true if the passed element is a script or style element
249    */
 
250  25499 toggle protected boolean isScriptOrStyle(Element element)
251    {
252  25499 String tagName = element.getNodeName();
253  25499 return SCRIPT_TAG_NAME.equalsIgnoreCase(tagName) || STYLE_TAG_NAME.equalsIgnoreCase(tagName);
254    }
255   
256    /**
257    * Serialize a given SF HTML Cleaner node.
258    *
259    * @param document the W3C Document to use for creating new DOM elements
260    * @param element the W3C element to which we'll add the subnodes to
261    * @param tagChildren the SF HTML Cleaner nodes to serialize for that node
262    */
 
263  35754 toggle private void createSubnodes(Document document, Element element, List<? extends BaseToken> tagChildren)
264    {
265    // We've modified the original implementation based in SF's HTML Cleaner to better handle CDATA.
266    // More specifically we want to handle the following 3 use cases:
267    //
268    // Use case 1: useCdata = true && input is:
269    // <script>...<![CDATA[...]]>...</script>
270    // In this case we must make sure to have only one CDATA block.
271    //
272    // Use case 2: useCdata = true && input is:
273    // <script>...entities not encoded (e.g. "<")...</script>
274    // We must generate a CDATA block around the whole content (the HTML Tokenizer split
275    // ContentToken on "<" character so we need to join them before creating the CDATA block.
276    // We must also unencode any entities (i.e. transform "&lt;" into "<") since we'll be
277    // wrapping them in a CDATA section.
278    //
279    // Use case 3: useCData = false
280    // Simply group all ContentToken together.
281   
282  35754 StringBuffer bufferedContent = new StringBuffer();
283   
284  35754 if (tagChildren != null) {
285  35754 for (Object item : tagChildren) {
286    // Flush content tokens
287  56689 flushContent(document, element, bufferedContent, item);
288   
289  56689 if (item instanceof CommentNode) {
290  42 CommentNode commentToken = (CommentNode) item;
291  42 Comment comment = document.createComment(commentToken.getContent());
292  42 element.appendChild(comment);
293  56647 } else if (item instanceof ContentNode) {
294  25874 ContentNode contentToken = (ContentNode) item;
295  25874 bufferedContent.append(contentToken.getContent());
296  30773 } else if (item instanceof TagNode) {
297  30768 TagNode subTagNode = (TagNode) item;
298  30768 Element subelement = document.createElement(subTagNode.getName());
299  30768 Map<String, String> attributes = subTagNode.getAttributes();
300  30768 for (Map.Entry<String, String> entry : attributes.entrySet()) {
301  31126 String attrName = entry.getKey();
302  31126 String attrValue = entry.getValue();
303  31126 if (this.escapeXml) {
304  0 attrValue = Utils.escapeXml(attrValue, this.props, true);
305    }
306  31126 subelement.setAttribute(attrName, attrValue);
307    }
308   
309    // recursively create subnodes
310  30768 createSubnodes(document, subelement, subTagNode.getAllChildren());
311   
312  30768 element.appendChild(subelement);
313  5 } else if (item instanceof List<?>) {
314  0 @SuppressWarnings("unchecked")
315    List<BaseToken> sublist = (List<BaseToken>) item;
316  0 createSubnodes(document, element, sublist);
317    }
318    }
319  35754 flushContent(document, element, bufferedContent, null);
320    }
321    }
322    }