Clover Coverage Report - XWiki Commons - Parent POM 4.0-SNAPSHOT (Aggregated)
Coverage timestamp: Mon Mar 12 2012 17:13:48 CET
../../../../../img/srcFileCovDistChart10.png 0% of files have more coverage
46   237   9   7.67
2   121   0.2   6
6     1.5  
1    
 
  DefaultHTMLCleaner       Line # 57 46 0% 9 3 94.4% 0.9444444
 
No Tests
 
1    /*
2    * See the NOTICE file distributed with this work for additional
3    * information regarding copyright ownership.
4    *
5    * This is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU Lesser General Public License as
7    * published by the Free Software Foundation; either version 2.1 of
8    * the License, or (at your option) any later version.
9    *
10    * This software is distributed in the hope that it will be useful,
11    * but WITHOUT ANY WARRANTY; without even the implied warranty of
12    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13    * Lesser General Public License for more details.
14    *
15    * You should have received a copy of the GNU Lesser General Public
16    * License along with this software; if not, write to the Free
17    * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
18    * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
19    */
20    package org.xwiki.xml.internal.html;
21   
22    import java.io.Reader;
23    import java.io.StringReader;
24    import java.util.Arrays;
25   
26    import javax.inject.Inject;
27    import javax.inject.Named;
28    import javax.inject.Singleton;
29    import javax.xml.parsers.DocumentBuilderFactory;
30    import javax.xml.parsers.ParserConfigurationException;
31   
32    import org.htmlcleaner.CleanerProperties;
33    import org.htmlcleaner.CleanerTransformations;
34    import org.htmlcleaner.HtmlCleaner;
35    import org.htmlcleaner.TagNode;
36    import org.htmlcleaner.TagTransformation;
37    import org.w3c.dom.DOMImplementation;
38    import org.w3c.dom.Document;
39    import org.w3c.dom.DocumentType;
40    import org.xwiki.component.annotation.Component;
41    import org.xwiki.component.phase.Initializable;
42    import org.xwiki.component.phase.InitializationException;
43    import org.xwiki.xml.html.HTMLCleaner;
44    import org.xwiki.xml.html.HTMLCleanerConfiguration;
45    import org.xwiki.xml.html.HTMLConstants;
46    import org.xwiki.xml.html.filter.HTMLFilter;
47   
48    /**
49    * Default implementation for {@link org.xwiki.xml.html.HTMLCleaner} using the <a href="HTML Cleaner
50    * framework>http://htmlcleaner.sourceforge.net/</a>.
51    *
52    * @version $Id: 5d69669db6c9557464bb601fc1e322e0e02ad1e5 $
53    * @since 1.6M1
54    */
55    @Component
56    @Singleton
 
57    public class DefaultHTMLCleaner implements HTMLCleaner, Initializable
58    {
59    /**
60    * The qualified name to be used when generating an html {@link DocumentType}.
61    */
62    private static final String QUALIFIED_NAME_HTML = "html";
63   
64    /**
65    * {@link HTMLFilter} for filtering html lists.
66    */
67    @Inject
68    @Named("list")
69    private HTMLFilter listFilter;
70   
71    /**
72    * {@link HTMLFilter} for filtering html lists.
73    */
74    @Inject
75    @Named("listitem")
76    private HTMLFilter listItemFilter;
77   
78    /**
79    * {@link HTMLFilter} for filtering HTML font elements.
80    */
81    @Inject
82    @Named("font")
83    private HTMLFilter fontFilter;
84   
85    /**
86    * {@link HTMLFilter} for wrapping invalid body elements with paragraphs.
87    */
88    @Inject
89    @Named("body")
90    private HTMLFilter bodyFilter;
91   
 
92  10 toggle @Override
93    public void initialize() throws InitializationException
94    {
95    // The clean method below is thread safe. However it seems that DOMOutputter.output() is not fully thread safe
96    // since it causes the following exception on the first time it's called from different threads:
97    // Caused by: org.jdom.JDOMException: Reflection failed while creating new JAXP document:
98    // duplicate class definition: org/apache/xerces/jaxp/DocumentBuilderFactoryImpl
99    // at org.jdom.adapters.JAXPDOMAdapter.createDocument(JAXPDOMAdapter.java:191)
100    // at org.jdom.adapters.AbstractDOMAdapter.createDocument(AbstractDOMAdapter.java:133)
101    // at org.jdom.output.DOMOutputter.createDOMDocument(DOMOutputter.java:208)
102    // at org.jdom.output.DOMOutputter.output(DOMOutputter.java:127)
103    // Since this only happens once, we call it first here at initialization time (since there's no thread
104    // contention at that time). Note: This email thread seems to say it's thread safe but that's not what we see
105    // here: http:osdir.com/ml/text.xml.xforms.chiba.devel/2006-09/msg00025.html
106  10 clean(new StringReader(""));
107    }
108   
 
109  52 toggle @Override
110    public Document clean(Reader originalHtmlContent)
111    {
112  52 return clean(originalHtmlContent, getDefaultConfiguration());
113    }
114   
 
115  53 toggle @Override
116    public Document clean(Reader originalHtmlContent, HTMLCleanerConfiguration configuration)
117    {
118  53 Document result;
119   
120    // Note: Instantiation of an HtmlCleaner object is cheap so there's no need to cache an instance of it,
121    // especially since this makes it extra safe with regards to multithreading (even though HTML Cleaner is
122    // already supposed to be thread safe).
123  53 CleanerProperties cleanerProperties = getDefaultCleanerProperties(configuration);
124  53 HtmlCleaner cleaner = new HtmlCleaner(cleanerProperties);
125   
126  53 cleaner.setTransformations(getDefaultCleanerTransformations());
127  53 TagNode cleanedNode;
128  53 try {
129  53 cleanedNode = cleaner.clean(originalHtmlContent);
130    } catch (Exception e) {
131    // This shouldn't happen since we're not doing any IO... I consider this a flaw in the design of HTML
132    // Cleaner.
133  0 throw new RuntimeException("Unhandled error when cleaning HTML", e);
134    }
135   
136    // Serialize the cleanedNode TagNode into a w3c dom. Ideally following code should be enough.
137    // But SF's HTML Cleaner seems to omit the DocType declaration while serializing.
138    // See https://sourceforge.net/tracker/index.php?func=detail&aid=2062318&group_id=183053&atid=903696
139    // cleanedNode.setDocType(new DoctypeToken("html", "PUBLIC", "-//W3C//DTD XHTML 1.0 Strict//EN",
140    // "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"));
141    // try {
142    // result = new DomSerializer(cleanerProperties, false).createDOM(cleanedNode);
143    // } catch(ParserConfigurationException ex) { }
144    // As a workaround, we must serialize the cleanedNode into a temporary w3c document, create a new w3c document
145    // with proper DocType declaration and move the root node from the temporary document to the new one.
146  53 try {
147    // Since there's a bug in SF's HTML Cleaner in that it doesn't recognize CDATA blocks we need to turn off
148    // character escaping (hence the false value passed) and do the escaping in XMLUtils.toString(). Note that
149    // this can cause problem for code not serializing the W3C DOM to a String since it won't have the
150    // characters escaped.
151    // See https://sourceforge.net/tracker/index.php?func=detail&aid=2691888&group_id=183053&atid=903696
152  53 Document tempDoc = new XWikiDOMSerializer(cleanerProperties, false).createDOM(cleanedNode);
153  53 DOMImplementation domImpl =
154    DocumentBuilderFactory.newInstance().newDocumentBuilder().getDOMImplementation();
155  53 DocumentType docType = domImpl.createDocumentType(QUALIFIED_NAME_HTML, "-//W3C//DTD XHTML 1.0 Strict//EN",
156    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
157  53 result = domImpl.createDocument(null, QUALIFIED_NAME_HTML, docType);
158  53 result.replaceChild(result.adoptNode(tempDoc.getDocumentElement()), result.getDocumentElement());
159    } catch (ParserConfigurationException ex) {
160  0 throw new RuntimeException("Error while serializing TagNode into w3c dom.", ex);
161    }
162   
163    // Finally apply filters.
164  53 for (HTMLFilter filter : configuration.getFilters()) {
165  208 filter.filter(result, configuration.getParameters());
166    }
167   
168  53 return result;
169    }
170   
 
171  53 toggle @Override
172    public HTMLCleanerConfiguration getDefaultConfiguration()
173    {
174  53 HTMLCleanerConfiguration configuration = new DefaultHTMLCleanerConfiguration();
175  53 configuration.setFilters(Arrays.asList(
176    this.bodyFilter,
177    this.listItemFilter,
178    this.listFilter,
179    this.fontFilter));
180  53 return configuration;
181    }
182   
183    /**
184    * @param configuration the configuration to use for the cleaning
185    * @return the default {@link CleanerProperties} to be used for cleaning.
186    */
 
187  53 toggle private CleanerProperties getDefaultCleanerProperties(HTMLCleanerConfiguration configuration)
188    {
189  53 CleanerProperties defaultProperties = new CleanerProperties();
190  53 defaultProperties.setOmitUnknownTags(true);
191   
192    // HTML Cleaner uses the compact notation by default but we don't want that since:
193    // - it's more work and not required since not compact notation is valid XHTML
194    // - expanded elements can also be rendered fine in browsers that only support HTML.
195  53 defaultProperties.setUseEmptyElementTags(false);
196   
197    // Wrap script and style content in CDATA blocks
198  53 defaultProperties.setUseCdataForScriptAndStyle(true);
199   
200    // If the caller has defined NAMESPACE_AWARE configuration property then use it, otherwise use our default.
201  53 String param = configuration.getParameters().get(HTMLCleanerConfiguration.NAMESPACES_AWARE);
202  53 boolean namespacesAware = (param != null) ? Boolean.parseBoolean(param) : true;
203  53 defaultProperties.setNamespacesAware(namespacesAware);
204   
205  53 return defaultProperties;
206    }
207   
208    /**
209    * @return the default cleaning transformations to perform on tags, in addition to the base transformations done by
210    * HTML Cleaner
211    */
 
212  53 toggle private CleanerTransformations getDefaultCleanerTransformations()
213    {
214  53 CleanerTransformations defaultTransformations = new CleanerTransformations();
215   
216  53 TagTransformation tt = new TagTransformation(HTMLConstants.TAG_B, HTMLConstants.TAG_STRONG, false);
217  53 defaultTransformations.addTransformation(tt);
218   
219  53 tt = new TagTransformation(HTMLConstants.TAG_I, HTMLConstants.TAG_EM, false);
220  53 defaultTransformations.addTransformation(tt);
221   
222  53 tt = new TagTransformation(HTMLConstants.TAG_U, HTMLConstants.TAG_INS, false);
223  53 defaultTransformations.addTransformation(tt);
224   
225  53 tt = new TagTransformation(HTMLConstants.TAG_S, HTMLConstants.TAG_DEL, false);
226  53 defaultTransformations.addTransformation(tt);
227   
228  53 tt = new TagTransformation(HTMLConstants.TAG_STRIKE, HTMLConstants.TAG_DEL, false);
229  53 defaultTransformations.addTransformation(tt);
230   
231  53 tt = new TagTransformation(HTMLConstants.TAG_CENTER, HTMLConstants.TAG_P, false);
232  53 tt.addAttributeTransformation(HTMLConstants.ATTRIBUTE_STYLE, "text-align:center");
233  53 defaultTransformations.addTransformation(tt);
234   
235  53 return defaultTransformations;
236    }
237    }