1. Project Clover database Tue Dec 20 2016 21:24:09 CET
  2. Package org.xwiki.officeimporter.internal.builder

File DefaultXHTMLOfficeDocumentBuilder.java

 

Coverage histogram

../../../../../img/srcFileCovDistChart9.png
38% of files have more coverage

Code metrics

6
28
2
1
146
82
7
0.25
14
2
3.5

Classes

Class Line # Actions
DefaultXHTMLOfficeDocumentBuilder 57 28 0% 7 5
0.8611111 (86.1%)
 

Contributing tests

This file is covered by 2 tests.

Source view

1    /*
2    * See the NOTICE file distributed with this work for additional
3    * information regarding copyright ownership.
4    *
5    * This is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU Lesser General Public License as
7    * published by the Free Software Foundation; either version 2.1 of
8    * the License, or (at your option) any later version.
9    *
10    * This software is distributed in the hope that it will be useful,
11    * but WITHOUT ANY WARRANTY; without even the implied warranty of
12    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13    * Lesser General Public License for more details.
14    *
15    * You should have received a copy of the GNU Lesser General Public
16    * License along with this software; if not, write to the Free
17    * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
18    * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
19    */
20    package org.xwiki.officeimporter.internal.builder;
21   
22    import java.io.ByteArrayInputStream;
23    import java.io.IOException;
24    import java.io.InputStream;
25    import java.io.InputStreamReader;
26    import java.io.Reader;
27    import java.nio.charset.Charset;
28    import java.util.HashMap;
29    import java.util.Map;
30   
31    import javax.inject.Inject;
32    import javax.inject.Named;
33    import javax.inject.Singleton;
34   
35    import org.apache.commons.lang3.StringUtils;
36    import org.apache.tika.parser.html.HtmlEncodingDetector;
37    import org.w3c.dom.Document;
38    import org.xwiki.component.annotation.Component;
39    import org.xwiki.model.reference.DocumentReference;
40    import org.xwiki.model.reference.EntityReferenceSerializer;
41    import org.xwiki.officeimporter.OfficeImporterException;
42    import org.xwiki.officeimporter.builder.XHTMLOfficeDocumentBuilder;
43    import org.xwiki.officeimporter.converter.OfficeConverterException;
44    import org.xwiki.officeimporter.document.XHTMLOfficeDocument;
45    import org.xwiki.officeimporter.server.OfficeServer;
46    import org.xwiki.xml.html.HTMLCleaner;
47    import org.xwiki.xml.html.HTMLCleanerConfiguration;
48   
49    /**
50    * Default implementation of {@link XHTMLOfficeDocumentBuilder}.
51    *
52    * @version $Id: 1bc0fa620a75b41020df2d4bb444ad95834f12b4 $
53    * @since 2.1M1
54    */
55    @Component
56    @Singleton
 
57    public class DefaultXHTMLOfficeDocumentBuilder implements XHTMLOfficeDocumentBuilder
58    {
59    /**
60    * Used to serialize the reference document name.
61    */
62    @Inject
63    private EntityReferenceSerializer<String> entityReferenceSerializer;
64   
65    /**
66    * Used to obtain document converter.
67    */
68    @Inject
69    private OfficeServer officeServer;
70   
71    /**
72    * Office HTML cleaner.
73    */
74    @Inject
75    @Named("openoffice")
76    private HTMLCleaner officeHtmlCleaner;
77   
78    /**
79    * Used to determine the encoding of the HTML byte array produced by the office server.
80    */
81    private HtmlEncodingDetector htmlEncodingDetector = new HtmlEncodingDetector();
82   
 
83  2 toggle @Override
84    public XHTMLOfficeDocument build(InputStream officeFileStream, String officeFileName, DocumentReference reference,
85    boolean filterStyles) throws OfficeImporterException
86    {
87    // Invoke the office document converter.
88  2 Map<String, InputStream> inputStreams = new HashMap<String, InputStream>();
89  2 inputStreams.put(officeFileName, officeFileStream);
90  2 Map<String, byte[]> artifacts;
91    // The office converter uses the output file name extension to determine the output format/syntax.
92  2 String outputFileName = StringUtils.substringBeforeLast(officeFileName, ".") + ".html";
93  2 try {
94  2 artifacts = this.officeServer.getConverter().convert(inputStreams, officeFileName, outputFileName);
95    } catch (OfficeConverterException ex) {
96  0 String message = "Error while converting document [%s] into html.";
97  0 throw new OfficeImporterException(String.format(message, officeFileName), ex);
98    }
99   
100    // Prepare the parameters for HTML cleaning.
101  2 Map<String, String> params = new HashMap<String, String>();
102  2 params.put("targetDocument", this.entityReferenceSerializer.serialize(reference));
103    // Extract the images that are embedded through the Data URI scheme and add them to the other artifacts so that
104    // they end up as attachments.
105  2 params.put("attachEmbeddedImages", "true");
106  2 if (filterStyles) {
107  2 params.put("filterStyles", "strict");
108    }
109   
110    // Parse and clean the HTML output.
111  2 HTMLCleanerConfiguration configuration = this.officeHtmlCleaner.getDefaultConfiguration();
112  2 configuration.setParameters(params);
113  2 Reader html = getReader(artifacts.remove(outputFileName));
114  2 Document xhtmlDoc = this.officeHtmlCleaner.clean(html, configuration);
115   
116  2 @SuppressWarnings("unchecked")
117    Map<String, byte[]> embeddedImages = (Map<String, byte[]>) xhtmlDoc.getUserData("embeddedImages");
118  2 if (embeddedImages != null) {
119  2 artifacts.putAll(embeddedImages);
120    }
121   
122    // Return a new XHTMLOfficeDocument instance.
123  2 return new XHTMLOfficeDocument(xhtmlDoc, artifacts);
124    }
125   
126    /**
127    * Detects the proper encoding of the given byte array and returns a reader.
128    *
129    * @param html HTML text as a byte array
130    * @return a reader for the given HTML byte array, that has the proper encoding
131    */
 
132  2 toggle private Reader getReader(byte[] html)
133    {
134  2 InputStream htmlInputStream = new ByteArrayInputStream(html);
135  2 Charset charset = null;
136  2 try {
137  2 charset = htmlEncodingDetector.detect(htmlInputStream, null);
138    } catch (IOException e) {
139    // Shouldn't happen.
140    }
141  2 if (charset == null) {
142  2 charset = Charset.forName("UTF-8");
143    }
144  2 return new InputStreamReader(htmlInputStream, charset);
145    }
146    }