1. Project Clover database Tue Dec 20 2016 21:24:09 CET
  2. Package org.xwiki.xml.html

File HTMLUtils.java

 

Coverage histogram

../../../../img/srcFileCovDistChart9.png
38% of files have more coverage

Code metrics

36
78
11
2
313
168
30
0.38
7.09
5.5
2.73

Classes

Class Line # Actions
HTMLUtils 46 37 0% 15 5
0.91935486 91.9%
HTMLUtils.XWikiXMLOutputter 71 41 0% 15 12
0.8095238 81%
 

Contributing tests

This file is covered by 92 tests.

Source view

1    /*
2    * See the NOTICE file distributed with this work for additional
3    * information regarding copyright ownership.
4    *
5    * This is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU Lesser General Public License as
7    * published by the Free Software Foundation; either version 2.1 of
8    * the License, or (at your option) any later version.
9    *
10    * This software is distributed in the hope that it will be useful,
11    * but WITHOUT ANY WARRANTY; without even the implied warranty of
12    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13    * Lesser General Public License for more details.
14    *
15    * You should have received a copy of the GNU Lesser General Public
16    * License along with this software; if not, write to the Free
17    * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
18    * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
19    */
20    package org.xwiki.xml.html;
21   
22    import java.io.IOException;
23    import java.io.Writer;
24    import java.util.Arrays;
25    import java.util.List;
26    import java.util.regex.Matcher;
27    import java.util.regex.Pattern;
28   
29    import org.apache.commons.lang3.StringUtils;
30    import org.jdom.DocType;
31    import org.jdom.Element;
32    import org.jdom.input.DOMBuilder;
33    import org.jdom.output.Format;
34    import org.jdom.output.XMLOutputter;
35    import org.w3c.dom.Document;
36    import org.w3c.dom.Node;
37    import org.w3c.dom.NodeList;
38   
39    /**
40    * HTML Utility methods.
41    *
42    * @version $Id: cc7ab52deafe00eed701a7c3e06bfbc67791776f $
43    * @since 1.8.3
44    */
45    // TODO: Create a separate class for each HTML version (XHTML 1.0, HTML5, etc...)
 
46    public final class HTMLUtils
47    {
48    /**
49    * In HTML5, some elements must be expanded (for example {@code <span></span>} instead of {@code <span />}), and
50    * some others must not (for example {@code <br />} instead of {@code <br></br>}. Thus for the list of elements
51    * below we need special handling (not expanding).
52    */
53    private static final List<String> OMIT_ELEMENT_EXPANDING_SET = Arrays.asList(
54    "area", "base", "br", "col", "hr", "img", "input", "link", "meta", "param");
55   
56    /**
57    * JDOM's XMLOutputter class converts reserved XML characters ({@code <, >, ' , &, \r and \n}) into their entity
58    * format {@code &lt;, &gt; &apos; &amp; &#xD; and \r\n}. However since we're using HTML Cleaner
59    * (http://htmlcleaner.sourceforge.net/) and since it's buggy for character escapes we have turned off character
60    * escaping for it and thus we need to perform selective escaping here.
61    * <p>
62    * Moreover, since we support HTML5, we need to
63    * expand empty elements on some elements and not on the others. For example: {@code <span></span>} is valid
64    * meanwhile:
65    * <pre>{@code
66    * <br>
67    * </br>}</pre>
68    * is not. See {@code OMIT_ELEMENT_EXPANDING_SET} for the list of elements to not expand.
69    */
70    // TODO: Remove the complex escaping code when SF HTML Cleaner will do proper escaping
 
71    public static class XWikiXMLOutputter extends XMLOutputter
72    {
73    /**
74    * Regex to recognize a XML Entity.
75    */
76    private static final Pattern ENTITY = Pattern.compile("&[a-z]+;|&#[0-9a-zA-Z]+;");
77   
78    /**
79    * Ampersand character.
80    */
81    private static final String AMPERSAND = "&";
82   
83    private static final String[] REPLACE_ELEMENTS_SEARCH = new String[] { "<", ">" };
84   
85    private static final String[] REPLACE_ELEMENTS_RESULT = new String[] { "&lt;", "&gt;" };
86   
87    /**
88    * Whether to omit the document type when printing the W3C Document or not.
89    */
90    private boolean omitDocType;
91   
92    /**
93    * @param format the JDOM class used to control output formats, see {@link org.jdom.output.Format}
94    * @param omitDocType if true then omit the document type when printing the W3C Document
95    * @see XMLOutputter#XMLOutputter(Format)
96    */
 
97  4789 toggle public XWikiXMLOutputter(Format format, boolean omitDocType)
98    {
99  4789 super(format);
100  4789 this.omitDocType = omitDocType;
101    }
102   
 
103  56444 toggle @Override
104    public String escapeElementEntities(String text)
105    {
106  56444 if (text.length() == 0) {
107  331 return text;
108    }
109   
110  56113 String result;
111  56113 int pos1 = text.indexOf("<![CDATA[");
112  56113 if (pos1 > -1) {
113  0 int pos2 = text.indexOf("]]>", pos1 + 9);
114  0 if (pos2 + 3 == text.length()) {
115  0 return text;
116    }
117  0 result = escapeElementEntities(text.substring(0, pos1));
118  0 if (pos2 + 3 == text.length()) {
119  0 result = result + text.substring(pos1);
120    } else {
121  0 result = result + text.substring(pos1, pos2 + 3) + escapeElementEntities(text.substring(pos2 + 3));
122    }
123    } else {
124  56113 result = escapeAmpersand(text);
125  56113 StringUtils.replaceEach(text, REPLACE_ELEMENTS_SEARCH, REPLACE_ELEMENTS_RESULT);
126    }
127   
128  56113 return result;
129    }
130   
 
131  31117 toggle @Override
132    public String escapeAttributeEntities(String text)
133    {
134  31117 String result = escapeElementEntities(text);
135   
136    // Attribute values must have quotes escaped since attributes are defined with quotes...
137  31117 result = StringUtils.replace(result, "\"", "&quot;");
138   
139  31117 return result;
140    }
141   
142    /**
143    * Escape ampersand when it's not defining an entity.
144    *
145    * @param text the text to escape
146    * @return the escaped text
147    */
 
148  56113 toggle private String escapeAmpersand(String text)
149    {
150  56113 StringBuilder buffer = new StringBuilder(text);
151    // find all occurrences of &
152  56113 int pos = buffer.indexOf(AMPERSAND);
153  62464 while (pos > -1 && pos < buffer.length()) {
154    // Check if the & is an entity
155  6351 Matcher matcher = ENTITY.matcher(buffer.substring(pos));
156  6351 if (matcher.lookingAt()) {
157    // We've found an entity, don't do anything, just skip it
158  2951 pos = pos + matcher.end() - matcher.start();
159    } else {
160    // No entity, escape the &
161  3400 buffer.replace(pos, pos + 1, "&amp;");
162  3400 pos += 5;
163    }
164  6351 pos = buffer.indexOf(AMPERSAND, pos);
165    }
166  56113 return buffer.toString();
167    }
168   
 
169  4788 toggle @Override
170    protected void printDocType(Writer out, DocType docType) throws IOException
171    {
172  4788 if (!this.omitDocType) {
173  128 super.printDocType(out, docType);
174    }
175    }
176   
 
177  25576 toggle @Override
178    protected void printElement(Writer out, Element element, int level, NamespaceStack namespaces)
179    throws IOException
180    {
181    // We override the code from the super class to not expand some empty elements.
182  25576 boolean currentFormatPolicy = currentFormat.getExpandEmptyElements();
183  25576 try {
184  25576 String elementName = element.getName();
185  25576 for (String name : OMIT_ELEMENT_EXPANDING_SET) {
186  247047 if (name.equals(elementName)) {
187    // We do not expand this empty element
188  2602 currentFormat.setExpandEmptyElements(false);
189  2602 break;
190    }
191    }
192   
193    // Call the method from the super class
194  25576 super.printElement(out, element, level, namespaces);
195   
196    } finally {
197    // Reset the format
198  25576 currentFormat.setExpandEmptyElements(currentFormatPolicy);
199    }
200    }
201    }
202   
203    /**
204    * Private constructor since this is a utility class that shouldn't be instantiated (all methods are static).
205    */
 
206  0 toggle private HTMLUtils()
207    {
208    // Nothing to do
209    }
210   
211    /**
212    * @param document the W3C Document to transform into a String
213    * @return the XML as a String
214    */
 
215  129 toggle public static String toString(Document document)
216    {
217  129 return HTMLUtils.toString(document, false, false);
218    }
219   
220    /**
221    * @param document the W3C Document to transform into a String
222    * @param omitDeclaration whether the XML declaration should be printed or not
223    * @param omitDoctype whether the document type should be printed or not
224    * @return the XML as a String
225    */
 
226  4789 toggle public static String toString(Document document, boolean omitDeclaration, boolean omitDoctype)
227    {
228    // Note: We don't use javax.xml.transform.Transformer since it prints our valid XHTML as HTML which is not
229    // XHTML compliant. For example it transforms our "<hr/>" into "<hr>.
230  4789 DOMBuilder builder = new DOMBuilder();
231  4789 org.jdom.Document jdomDoc = builder.build(document);
232   
233  4789 Format format = Format.getRawFormat();
234    // Force newlines to use \n since otherwise the default is \n\r.
235    // See http://www.jdom.org/docs/apidocs/org/jdom/output/Format.html#setLineSeparator(java.lang.String)
236  4789 format.setLineSeparator("\n");
237   
238    // Make sure all elements are expanded so that they can also be rendered fine in browsers that only support
239    // HTML.
240  4789 format.setExpandEmptyElements(true);
241   
242  4789 format.setOmitDeclaration(omitDeclaration);
243   
244  4789 XMLOutputter outputter = new XWikiXMLOutputter(format, omitDoctype);
245  4789 String result = outputter.outputString(jdomDoc);
246   
247  4789 return result;
248    }
249   
250    /**
251    * Strip the HTML envelope if it exists. Precisely this means removing the head tag and move all tags in the body
252    * tag directly under the html element. This is useful for example if you wish to insert an HTML fragment into an
253    * existing HTML page.
254    *
255    * @param document the w3c Document to strip
256    */
 
257  4660 toggle public static void stripHTMLEnvelope(Document document)
258    {
259  4660 org.w3c.dom.Element root = document.getDocumentElement();
260  4660 if (root.getNodeName().equalsIgnoreCase(HTMLConstants.TAG_HTML)) {
261    // Look for a head element below the root element and for a body element
262  4660 Node bodyNode = null;
263  4660 Node headNode = null;
264  4660 NodeList nodes = root.getChildNodes();
265  13978 for (int i = 0; i < nodes.getLength(); i++) {
266  9318 Node node = nodes.item(i);
267  9318 if (node.getNodeName().equalsIgnoreCase(HTMLConstants.TAG_HEAD)) {
268  4659 headNode = node;
269  4659 } else if (node.getNodeName().equalsIgnoreCase(HTMLConstants.TAG_BODY)) {
270  4659 bodyNode = node;
271    }
272    }
273   
274  4660 if (headNode != null) {
275  4659 root.removeChild(headNode);
276    }
277   
278  4660 if (bodyNode != null) {
279    // Move all children of body node under the root element
280  4659 NodeList bodyChildrenNodes = bodyNode.getChildNodes();
281  10124 while (bodyChildrenNodes.getLength() > 0) {
282  5465 root.insertBefore(bodyChildrenNodes.item(0), null);
283    }
284  4659 root.removeChild(bodyNode);
285    }
286    }
287    }
288   
289    /**
290    * Remove the first element inside a parent element and copy the element's children in the parent.
291    *
292    * @param document the w3c document from which to remove the top level paragraph
293    * @param parentTagName the name of the parent tag to look under
294    * @param elementTagName the name of the first element to remove
295    */
 
296  3981 toggle public static void stripFirstElementInside(Document document, String parentTagName, String elementTagName)
297    {
298  3981 NodeList parentNodes = document.getElementsByTagName(parentTagName);
299  3981 if (parentNodes.getLength() > 0) {
300  3981 Node parentNode = parentNodes.item(0);
301    // Look for a p element below the first parent element
302  3981 Node pNode = parentNode.getFirstChild();
303  3981 if (elementTagName.equalsIgnoreCase(pNode.getNodeName())) {
304    // Move all children of p node under the root element
305  3981 NodeList pChildrenNodes = pNode.getChildNodes();
306  7971 while (pChildrenNodes.getLength() > 0) {
307  3990 parentNode.insertBefore(pChildrenNodes.item(0), null);
308    }
309  3981 parentNode.removeChild(pNode);
310    }
311    }
312    }
313    }