1. Project Clover database Tue Dec 20 2016 21:24:09 CET
  2. Package org.xwiki.xml.internal.html

File DefaultHTMLCleaner.java

 

Coverage histogram

../../../../../img/srcFileCovDistChart10.png
0% of files have more coverage

Code metrics

8
60
7
1
284
153
13
0.22
8.57
7
1.86

Classes

Class Line # Actions
DefaultHTMLCleaner 60 60 0% 13 2
0.97333336 97.3%
 

Contributing tests

This file is covered by 109 tests.

Source view

1    /*
2    * See the NOTICE file distributed with this work for additional
3    * information regarding copyright ownership.
4    *
5    * This is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU Lesser General Public License as
7    * published by the Free Software Foundation; either version 2.1 of
8    * the License, or (at your option) any later version.
9    *
10    * This software is distributed in the hope that it will be useful,
11    * but WITHOUT ANY WARRANTY; without even the implied warranty of
12    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13    * Lesser General Public License for more details.
14    *
15    * You should have received a copy of the GNU Lesser General Public
16    * License along with this software; if not, write to the Free
17    * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
18    * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
19    */
20    package org.xwiki.xml.internal.html;
21   
22    import java.io.Reader;
23    import java.io.StringReader;
24    import java.util.Arrays;
25   
26    import javax.inject.Inject;
27    import javax.inject.Named;
28    import javax.inject.Singleton;
29    import javax.xml.parsers.DocumentBuilder;
30    import javax.xml.parsers.DocumentBuilderFactory;
31    import javax.xml.parsers.ParserConfigurationException;
32   
33    import org.htmlcleaner.CleanerProperties;
34    import org.htmlcleaner.CleanerTransformations;
35    import org.htmlcleaner.DoctypeToken;
36    import org.htmlcleaner.HtmlCleaner;
37    import org.htmlcleaner.TagNode;
38    import org.htmlcleaner.TagTransformation;
39    import org.htmlcleaner.XWikiDOMSerializer;
40    import org.w3c.dom.Document;
41    import org.xwiki.component.annotation.Component;
42    import org.xwiki.component.phase.Initializable;
43    import org.xwiki.component.phase.InitializationException;
44    import org.xwiki.context.Execution;
45    import org.xwiki.context.ExecutionContext;
46    import org.xwiki.xml.html.HTMLCleaner;
47    import org.xwiki.xml.html.HTMLCleanerConfiguration;
48    import org.xwiki.xml.html.HTMLConstants;
49    import org.xwiki.xml.html.filter.HTMLFilter;
50   
51    /**
52    * Default implementation for {@link org.xwiki.xml.html.HTMLCleaner} using the
53    * <a href="http://htmlcleaner.sourceforge.net/">HTML Cleaner framework</a>.
54    *
55    * @version $Id: c4fc9f2a986974f6eea411cacfae05dcf7f10f10 $
56    * @since 1.6M1
57    */
58    @Component
59    @Singleton
 
60    public class DefaultHTMLCleaner implements HTMLCleaner, Initializable
61    {
62    /**
63    * {@link HTMLFilter} for filtering html lists (injected with the "list" hint).
64    */
65    @Inject
66    @Named("list")
67    private HTMLFilter listFilter;
68   
69    /**
70    * {@link HTMLFilter} for filtering html list items (injected with the "listitem" hint).
71    */
72    @Inject
73    @Named("listitem")
74    private HTMLFilter listItemFilter;
75   
76    /**
77    * {@link HTMLFilter} for filtering HTML font elements.
78    */
79    @Inject
80    @Named("font")
81    private HTMLFilter fontFilter;
82   
83    /**
84    * {@link HTMLFilter} for wrapping invalid body elements with paragraphs.
85    */
86    @Inject
87    @Named("body")
88    private HTMLFilter bodyFilter;
89   
90    /**
91    * {@link HTMLFilter} for filtering HTML attributes that are used by many different elements and for which we cannot
92    * write simple transformations like in {@link #getDefaultCleanerTransformations(HTMLCleanerConfiguration)}.
93    */
94    @Inject
95    @Named("attribute")
96    private HTMLFilter attributeFilter;
97   
98    /**
99    * {@link HTMLFilter} for filtering HTML links.
100    */
101    @Inject
102    @Named("link")
103    private HTMLFilter linkFilter;
104   
       /**
       * Gives access to the current {@link ExecutionContext}. This class uses that context to store and look up a
       * {@link DocumentBuilder} (see {@link #getAvailableDocumentBuilder()}).
       */
105    @Inject
106    private Execution execution;
107   
 
       /**
       * Runs one cleaning pass on empty content at component initialization time, so that the first-use setup
       * described in the comment below happens before any concurrent call to {@code clean}.
       *
       * @throws InitializationException declared by the overridden signature; this body never throws it explicitly
       */
108  136 toggle @Override
109    public void initialize() throws InitializationException
110    {
111    // The clean method below is thread safe. However it seems that DOMOutputter.output() is not fully thread safe
112    // since it causes the following exception on the first time it's called from different threads:
113    // Caused by: org.jdom.JDOMException: Reflection failed while creating new JAXP document:
114    // duplicate class definition: org/apache/xerces/jaxp/DocumentBuilderFactoryImpl
115    // at org.jdom.adapters.JAXPDOMAdapter.createDocument(JAXPDOMAdapter.java:191)
116    // at org.jdom.adapters.AbstractDOMAdapter.createDocument(AbstractDOMAdapter.java:133)
117    // at org.jdom.output.DOMOutputter.createDOMDocument(DOMOutputter.java:208)
118    // at org.jdom.output.DOMOutputter.output(DOMOutputter.java:127)
119    // Since this only happens once, we call it first here at initialization time (since there's no thread
120    // contention at that time). Note: This email thread seems to say it's thread safe but that's not what we see
121    // here: http://osdir.com/ml/text.xml.xforms.chiba.devel/2006-09/msg00025.html
122  136 clean(new StringReader(""));
123    }
124   
 
       /**
       * Cleans the passed HTML content using the configuration returned by {@link #getDefaultConfiguration()}.
       *
       * @param originalHtmlContent the HTML content to clean
       * @return the cleaned content as a W3C DOM {@link Document}
       */
125  246 toggle @Override
126    public Document clean(Reader originalHtmlContent)
127    {
128  246 return clean(originalHtmlContent, getDefaultConfiguration());
129    }
130   
 
       /**
       * Returns a {@link DocumentBuilder} to use for DOM serialization. When an {@link ExecutionContext} is available,
       * the builder is looked up in it under the key {@code DocumentBuilder.class.getName()}; if none is stored yet, a
       * new one is created and stored under that key. When there is no context, a new builder is created on every
       * call and is not stored anywhere.
       *
       * @return the builder found in the current context, or a newly created one
       * @throws ParserConfigurationException propagated from {@code DocumentBuilderFactory#newDocumentBuilder()}
       */
131  4986 toggle private DocumentBuilder getAvailableDocumentBuilder() throws ParserConfigurationException
132    {
133  4986 ExecutionContext econtext = this.execution.getContext();
134   
135  4986 if (econtext != null) {
       // Reuse the builder already stored in this context, if any.
136  4882 DocumentBuilder documentBuilder = (DocumentBuilder) econtext.getProperty(DocumentBuilder.class.getName());
137   
138  4882 if (documentBuilder == null) {
       // First request in this context: create the builder and remember it for later calls.
139  550 documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
140  550 econtext.setProperty(DocumentBuilder.class.getName(), documentBuilder);
141    }
142   
143  4882 return documentBuilder;
144    }
145   
       // No execution context: nothing to store the builder in, so build a fresh one.
146  104 return DocumentBuilderFactory.newInstance().newDocumentBuilder();
147    }
148   
 
       /**
       * Cleans the passed HTML content with the passed configuration. The steps are: parse the content with a new
       * {@link HtmlCleaner} built from {@link #getDefaultCleanerProperties(HTMLCleanerConfiguration)}, set an
       * XHTML 1.0 Strict doctype on the resulting {@link TagNode}, serialize it to a W3C DOM with
       * {@link XWikiDOMSerializer}, then apply every filter from {@code configuration.getFilters()} in iteration
       * order, passing {@code configuration.getParameters()} to each.
       * <p>
       * The configuration is dereferenced without a null check, so it must not be {@code null}.
       *
       * @param originalHtmlContent the HTML content to clean
       * @param configuration the configuration providing the cleaning parameters and the filters to apply
       * @return the cleaned and filtered content as a W3C DOM {@link Document}
       * @throws RuntimeException if the cleaner throws any exception, or if a {@link ParserConfigurationException}
       *         occurs while serializing; the original exception is kept as the cause
       */
149  4986 toggle @Override
150    public Document clean(Reader originalHtmlContent, HTMLCleanerConfiguration configuration)
151    {
152  4986 Document result;
153   
154    // Note: Instantiation of an HtmlCleaner object is cheap so there's no need to cache an instance of it,
155    // especially since this makes it extra safe with regards to multithreading (even though HTML Cleaner is
156    // already supposed to be thread safe).
157  4986 CleanerProperties cleanerProperties = getDefaultCleanerProperties(configuration);
158  4986 HtmlCleaner cleaner = new HtmlCleaner(cleanerProperties);
159   
160  4986 TagNode cleanedNode;
161  4986 try {
162  4986 cleanedNode = cleaner.clean(originalHtmlContent);
163    } catch (Exception e) {
164    // This shouldn't happen since we're not doing any IO... I consider this a flaw in the design of HTML
165    // Cleaner.
166  0 throw new RuntimeException("Unhandled error when cleaning HTML", e);
167    }
168   
169  4986 try {
170    // Ideally we would use SF's HTMLCleaner DomSerializer but there are outstanding issues with it, so we're
171    // using a custom XWikiDOMSerializer (see its javadoc for more details).
172    // Replace by the following when fixed:
173    // result = new DomSerializer(cleanerProperties, false).createDOM(cleanedNode);
174   
       // Set the XHTML 1.0 Strict doctype on the cleaned tree before serializing it.
175  4986 cleanedNode.setDocType(new DoctypeToken("html", "PUBLIC", "-//W3C//DTD XHTML 1.0 Strict//EN",
176    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"));
177  4986 result =
178    new XWikiDOMSerializer(cleanerProperties, false).createDOM(getAvailableDocumentBuilder(), cleanedNode);
179    } catch (ParserConfigurationException ex) {
180  0 throw new RuntimeException("Error while serializing TagNode into w3c dom.", ex);
181    }
182   
183    // Finally apply filters.
184  4986 for (HTMLFilter filter : configuration.getFilters()) {
185  30570 filter.filter(result, configuration.getParameters());
186    }
187   
188  4986 return result;
189    }
190   
 
       /**
       * Creates a new {@link DefaultHTMLCleanerConfiguration} whose filter list is, in this order: body, list item,
       * list, font, attribute and link filters. A new configuration object is created on every call.
       *
       * @return a new configuration holding the default filters
       */
191  4985 toggle @Override
192    public HTMLCleanerConfiguration getDefaultConfiguration()
193    {
194  4985 HTMLCleanerConfiguration configuration = new DefaultHTMLCleanerConfiguration();
       // The list order below is the order in which clean() applies the filters.
195  4985 configuration.setFilters(Arrays.asList(
196    this.bodyFilter,
197    this.listItemFilter,
198    this.listFilter,
199    this.fontFilter,
200    this.attributeFilter,
201    this.linkFilter));
202  4985 return configuration;
203    }
204   
205    /**
       * Builds a new {@link CleanerProperties} instance on every call. All settings are fixed except the
       * namespaces-aware flag, which is read from the {@link HTMLCleanerConfiguration#NAMESPACES_AWARE} parameter and
       * defaults to {@code true} when that parameter is absent, and the transformations, which come from
       * {@link #getDefaultCleanerTransformations(HTMLCleanerConfiguration)}.
       *
206    * @param configuration the configuration to use for the cleaning
207    * @return the default {@link CleanerProperties} to be used for cleaning.
208    */
 
209  4986 toggle private CleanerProperties getDefaultCleanerProperties(HTMLCleanerConfiguration configuration)
210    {
211  4986 CleanerProperties defaultProperties = new CleanerProperties();
212  4986 defaultProperties.setOmitUnknownTags(true);
213   
214    // HTML Cleaner uses the compact notation by default but we don't want that since:
215    // - it's more work and not required since not compact notation is valid XHTML
216    // - expanded elements can also be rendered fine in browsers that only support HTML.
217  4986 defaultProperties.setUseEmptyElementTags(false);
218   
219    // Wrap script and style content in CDATA blocks
220  4986 defaultProperties.setUseCdataForScriptAndStyle(true);
221   
222    // We need this for example to ignore CDATA sections not inside script or style elements.
223  4986 defaultProperties.setIgnoreQuestAndExclam(true);
224   
225    // Remove CDATA outside of script and style since according to the spec it has no effect there.
226  4986 defaultProperties.setOmitCdataOutsideScriptAndStyle(true);
227   
228    // If the caller has defined NAMESPACE_AWARE configuration property then use it, otherwise use our default.
       // Note: Boolean.parseBoolean() yields false for any value other than "true" (ignoring case).
229  4986 String param = configuration.getParameters().get(HTMLCleanerConfiguration.NAMESPACES_AWARE);
230  4986 boolean namespacesAware = (param != null) ? Boolean.parseBoolean(param) : true;
231  4986 defaultProperties.setNamespacesAware(namespacesAware);
232   
233    // Set Cleaner transformations
234  4986 defaultProperties.setCleanerTransformations(getDefaultCleanerTransformations(configuration));
235   
236    // By default, we are cleaning XHTML 1.0 code, not HTML 5.
237    // Note: Tests are broken if we don't set the version 4, meaning that supporting HTML5 requires some work.
238    // TODO: handle HTML5 correctly (see: http://jira.xwiki.org/browse/XCOMMONS-901)
239  4986 defaultProperties.setHtmlVersion(4);
240   
241  4986 return defaultProperties;
242    }
243   
244    /**
       * Builds a new {@link CleanerTransformations} instance on every call. It always registers these tag
       * replacements, named by their {@link HTMLConstants} constants: TAG_B to TAG_STRONG, TAG_I to TAG_EM, TAG_U to
       * TAG_INS, TAG_S to TAG_DEL, TAG_STRIKE to TAG_DEL, and TAG_CENTER to TAG_P with a style attribute
       * transformation of {@code text-align:center}. When the {@link HTMLCleanerConfiguration#RESTRICTED} parameter
       * equals "true" (ignoring case), TAG_SCRIPT and TAG_STYLE are additionally replaced by TAG_PRE.
       *
245    * @param configuration The cleaner configuration.
246    * @return the default cleaning transformations to perform on tags, in addition to the base transformations done by
247    * HTML Cleaner
248    */
 
249  4986 toggle private CleanerTransformations getDefaultCleanerTransformations(HTMLCleanerConfiguration configuration)
250    {
251  4986 CleanerTransformations defaultTransformations = new CleanerTransformations();
252   
       // NOTE(review): the third constructor argument is false for every transformation below; presumably it means
       // "do not preserve the source tag's attributes" - confirm against the HtmlCleaner TagTransformation javadoc.
253  4986 TagTransformation tt = new TagTransformation(HTMLConstants.TAG_B, HTMLConstants.TAG_STRONG, false);
254  4986 defaultTransformations.addTransformation(tt);
255   
256  4986 tt = new TagTransformation(HTMLConstants.TAG_I, HTMLConstants.TAG_EM, false);
257  4986 defaultTransformations.addTransformation(tt);
258   
259  4986 tt = new TagTransformation(HTMLConstants.TAG_U, HTMLConstants.TAG_INS, false);
260  4986 defaultTransformations.addTransformation(tt);
261   
262  4986 tt = new TagTransformation(HTMLConstants.TAG_S, HTMLConstants.TAG_DEL, false);
263  4986 defaultTransformations.addTransformation(tt);
264   
265  4986 tt = new TagTransformation(HTMLConstants.TAG_STRIKE, HTMLConstants.TAG_DEL, false);
266  4986 defaultTransformations.addTransformation(tt);
267   
268  4986 tt = new TagTransformation(HTMLConstants.TAG_CENTER, HTMLConstants.TAG_P, false);
269  4986 tt.addAttributeTransformation(HTMLConstants.ATTRIBUTE_STYLE, "text-align:center");
270  4986 defaultTransformations.addTransformation(tt);
271   
       // A missing RESTRICTED parameter (null) is treated like any non-"true" value: no extra transformations.
272  4986 String restricted = configuration.getParameters().get(HTMLCleanerConfiguration.RESTRICTED);
273  4986 if ("true".equalsIgnoreCase(restricted)) {
274   
275  3 tt = new TagTransformation(HTMLConstants.TAG_SCRIPT, HTMLConstants.TAG_PRE, false);
276  3 defaultTransformations.addTransformation(tt);
277   
278  3 tt = new TagTransformation(HTMLConstants.TAG_STYLE, HTMLConstants.TAG_PRE, false);
279  3 defaultTransformations.addTransformation(tt);
280    }
281   
282  4986 return defaultTransformations;
283    }
284    }