1. Project Clover database Tue Dec 20 2016 21:24:09 CET
  2. Package org.xwiki.officeimporter.internal.filter

File ParagraphFilter.java

 

Coverage histogram

../../../../../img/srcFileCovDistChart10.png
0% of files have more coverage

Code metrics

12
32
8
1
172
88
17
0.53
4
8
2.12

Classes

Class Line # Actions
ParagraphFilter 55 32 0% 17 4
0.9230769 (92.3%)
 

Contributing tests

This file is covered by 20 tests.

Source view

1    /*
2    * See the NOTICE file distributed with this work for additional
3    * information regarding copyright ownership.
4    *
5    * This is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU Lesser General Public License as
7    * published by the Free Software Foundation; either version 2.1 of
8    * the License, or (at your option) any later version.
9    *
10    * This software is distributed in the hope that it will be useful,
11    * but WITHOUT ANY WARRANTY; without even the implied warranty of
12    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13    * Lesser General Public License for more details.
14    *
15    * You should have received a copy of the GNU Lesser General Public
16    * License along with this software; if not, write to the Free
17    * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
18    * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
19    */
20    package org.xwiki.officeimporter.internal.filter;
21   
22    import java.util.ArrayList;
23    import java.util.List;
24    import java.util.Map;
25   
26    import javax.inject.Named;
27    import javax.inject.Singleton;
28   
29    import org.w3c.dom.Document;
30    import org.w3c.dom.Element;
31    import org.w3c.dom.Node;
32    import org.w3c.dom.NodeList;
33    import org.xwiki.component.annotation.Component;
34    import org.xwiki.xml.html.filter.AbstractHTMLFilter;
35    import org.xwiki.xml.html.filter.ElementSelector;
36   
37    /**
38    * Open Office server creates a new paragraph element for every line break (enter) in the original office document. For
39    * an example: <br/>
40    * {@code<P STYLE="margin-bottom: 0in">Line - 1</P>}<br/>
41    * {@code<P STYLE="margin-bottom: 0in">Line - 2</P>}<br/>
42    * {@code<P STYLE="margin-bottom: 0in">Line - 3</P>}<br/>
43    * Is the output produced by open office for a simple document containing only three consecutive lines. Further, to
44    * represent empty lines, Open Office uses following element: <br/>
45    * {@code<P STYLE="margin-bottom: 0in"><BR></P>} <br/>
46    * These constructs when rendered on browsers doesn't resemble the original document at all, and when parsed
47    * into xwiki syntax the generated xwiki syntax is also invalid (obviously). The purpose of this filter is to clean
48    * up such html content by merging consecutive paragraph sequences and appropriately inserting {@code<br/>} elements.
49    *
50    * @version $Id: fdf826a54f0f430fa6d03dea9b35b30556715953 $
51    */
52    @Component
53    @Named("officeimporter/paragraph")
54    @Singleton
 
55    public class ParagraphFilter extends AbstractHTMLFilter
56    {
 
57  66 toggle @Override
58    public void filter(Document document, Map<String, String> cleaningParams)
59    {
60  66 for (Node p : findEmptyLineParagraphSequences(document)) {
61  2 Node next = p.getNextSibling();
62    // Remove the first empty paragraph.
63  2 p.getParentNode().removeChild(p);
64    // Replace the following ones by their children elements.
65  5 while (isEmptyLineParagraph(next)) {
66  3 Node following = next.getNextSibling();
67  3 replaceWithChildren((Element) next);
68  3 next = following;
69    }
70    }
71    }
72   
73    /**
74    * Finds all the empty paragraph sequences in the document.
75    *
76    * @param document the {@link Document}
77    * @return a list of nodes containing leading paragraph elements of each sequence found.
78    */
 
79  66 toggle private List<Node> findEmptyLineParagraphSequences(Document document)
80    {
81  66 List<Element> emptyLineParagraphs =
82    filterDescendants(document.getDocumentElement(), new String[] {TAG_P}, new ElementSelector()
83    {
 
84  47 toggle @Override
85    public boolean isSelected(Element element)
86    {
87  47 return isEmptyLineParagraph(element);
88    }
89    });
90  66 List<Node> sequences = new ArrayList<Node>();
91  66 for (Element emptyLineParagraph : emptyLineParagraphs) {
92  5 Node prev = emptyLineParagraph.getPreviousSibling();
93    // Skip garbage.
94  5 while (isEmptyTextNode(prev) || isCommentNode(prev)) {
95  0 Node oneBefore = prev.getPreviousSibling();
96  0 prev.getParentNode().removeChild(prev);
97  0 prev = oneBefore;
98    }
99  5 if (!isEmptyLineParagraph(prev)) {
100    // This is the beginning of a sequence of empty line paragraphs
101  2 sequences.add(emptyLineParagraph);
102    }
103    }
104  66 return sequences;
105    }
106   
107    /**
108    * Checks if a node represents a paragraph element.
109    *
110    * @param node the {@link Node}.
111    * @return True if the node represents a {@code <p/>} element.
112    */
 
113  57 toggle private boolean isParagraph(Node node)
114    {
115  57 return null != node && node.getNodeName().equals(TAG_P);
116    }
117   
118    /**
119    * Checks if a node represents a {@code<p><br/></p>} element used by open office to represent an empty line.
120    *
121    * @param node the {@link Node}
122    * @return true if the node represents an empty line.
123    */
 
124  57 toggle private boolean isEmptyLineParagraph(Node node)
125    {
126  57 boolean isEmptyLine = false;
127  57 if (isParagraph(node)) {
128  53 isEmptyLine = true;
129  53 NodeList children = node.getChildNodes();
130  130 for (int i = 0; i < children.getLength(); i++) {
131  77 Node child = children.item(i);
132  77 if (!(isEmptyTextNode(child) || isCommentNode(child) || isLineBreak(child))) {
133  66 isEmptyLine = false;
134    }
135    }
136    }
137  57 return isEmptyLine;
138    }
139   
140    /**
141    * Checks if a node represents empty text content (white space).
142    *
143    * @param node the {@link Node}.
144    * @return true if the node represents white space.
145    */
 
146  82 toggle private boolean isEmptyTextNode(Node node)
147    {
148  82 return null != node && node.getNodeType() == Node.TEXT_NODE && node.getTextContent().trim().equals("");
149    }
150   
151    /**
152    * Checks if a node represents an html comment.
153    *
154    * @param node the {@link Node}.
155    * @return true if the node is a comment node.
156    */
 
157  82 toggle private boolean isCommentNode(Node node)
158    {
159  82 return null != node && node.getNodeType() == Node.COMMENT_NODE;
160    }
161   
162    /**
163    * Checks if a node represents an html line break.
164    *
165    * @param node the {@link Node}
166    * @return true of the node represents a line break.
167    */
 
168  77 toggle private boolean isLineBreak(Node node)
169    {
170  77 return null != node && node.getNodeName().equals(TAG_BR);
171    }
172    }