Project Clover database, Tue Dec 20 2016 21:24:09 CET
Package com.xpn.xwiki.plugin.autotag

File AutoTagPlugin.java

 

Coverage histogram

83% of files have more coverage

Code metrics

Branches: 32
Statements: 132
Methods: 16
Classes: 1
LOC: 502
NCLOC: 277
Total complexity: 35
Complexity density: 0.27
Average statements per method: 8.25
Average methods per class: 16
Average complexity per method: 2.19

Classes

Class: AutoTagPlugin (declared at line 46), 132 statements, complexity 35, 180 elements, 0.00% covered.

Contributing tests

No tests hitting this source file were found.
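A single unit test would already give this file some coverage. Below is a minimal sketch of such a test (hypothetical test class and test data, not part of the XWiki test suite), assuming JUnit 4 is available and that a bare XWikiContext is enough to construct the plugin:

package com.xpn.xwiki.plugin.autotag;

import static org.junit.Assert.assertFalse;

import org.junit.Test;

import com.xpn.xwiki.XWikiContext;

public class AutoTagPluginTest
{
    @Test
    public void generateTagCloudProducesTags()
    {
        // The plugin manager normally instantiates plugins through reflection; here we do it directly.
        AutoTagPlugin plugin = new AutoTagPlugin("autotag", AutoTagPlugin.class.getName(), new XWikiContext());

        // None of these words is an English stop word, so at least one tag should be produced.
        TagCloud cloud = plugin.generateTagCloud("wiki wiki wiki pages pages content", AutoTagPlugin.LANG_ENGLISH);

        assertFalse(cloud.getTags().isEmpty());
    }
}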

Source view

1    /*
2    * See the NOTICE file distributed with this work for additional
3    * information regarding copyright ownership.
4    *
5    * This is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU Lesser General Public License as
7    * published by the Free Software Foundation; either version 2.1 of
8    * the License, or (at your option) any later version.
9    *
10    * This software is distributed in the hope that it will be useful,
11    * but WITHOUT ANY WARRANTY; without even the implied warranty of
12    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13    * Lesser General Public License for more details.
14    *
15    * You should have received a copy of the GNU Lesser General Public
16    * License along with this software; if not, write to the Free
17    * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
18    * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
19    */
20    package com.xpn.xwiki.plugin.autotag;
21   
22    import java.util.ArrayList;
23    import java.util.Collections;
24    import java.util.HashMap;
25    import java.util.HashSet;
26    import java.util.LinkedHashMap;
27    import java.util.List;
28    import java.util.Map;
29    import java.util.Set;
30    import java.util.SortedSet;
31    import java.util.TreeSet;
32    import java.util.Map.Entry;
33    import java.util.regex.Pattern;
34   
35    import com.xpn.xwiki.XWikiContext;
36    import com.xpn.xwiki.plugin.XWikiDefaultPlugin;
37    import com.xpn.xwiki.plugin.XWikiPluginInterface;
38   
39    /**
40    * Plugin which extracts a set of tags from a text.
41    *
42    * @version $Id: e30fac037d84bc9c172fff38493f900354bdcc6b $
43    * @deprecated the plugin technology is deprecated, consider rewriting as components
44    */
45    @Deprecated
46    public class AutoTagPlugin extends XWikiDefaultPlugin implements XWikiPluginInterface
47    {
48    /** Identifier for the French language. */
49    public static final int LANG_FRENCH = 0;
50   
51    /** Identifier for the English language. */
52    public static final int LANG_ENGLISH = 1;
53   
54    /**
55    * The name of the plugin, which is used for retrieving the plugin from the plugin manager (and from the public
56    * {@code $xwiki.get()} API).
57    */
58    private static final String PLUGIN_NAME = "autotag";
59   
60    /**
61    * Special characters: any token containing one of these characters is ignored.
62    */
63    private static final Pattern SPECIAL_CHARS = Pattern.compile("<|>|=|/|\"|\u0093");
64   
65    /** Needed to make checkstyle pass. */
66    private static final String A = "a";
67   
68    /** Needed to make checkstyle pass. */
69    private static final String ON = "on";
70   
71    /** French words that should be ignored since they are very common and add no value. */
72    private static final String[] FRENCH_STOP_WORDS = {
73    A, "afin", "ai", "ainsi", "apr\u00e8s", "attendu", "au", "aujourd", "auquel", "aussi",
74    "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
75    "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain",
76    "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
77    "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout",
78    "dedans", "dehors", "del\u00e0", "depuis", "derri\u00e8re", "des", "d\u00e9sormais",
79    "desquelles", "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers",
80    "diverse", "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "d\u00e8s",
81    "elle", "elles", "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux",
82    "except\u00e9", "hormis", "hors", "h\u00e9las", "hui", "il", "ils", "j", "je", "jusqu",
83    "jusque", "l", "la", "laquelle", "le", "lequel", "les", "lesquelles", "lesquels", "leur",
84    "leurs", "lorsque", "lui", "l\u00e0", "ma", "mais", "malgr\u00e9", "me", "merci", "mes",
85    "mien", "mienne", "miennes", "miens", "moi", "moins", "mon", "moyennant", "m\u00eame",
86    "m\u00eames", "n", "ne", "ni", "non", "nos", "notre", "nous", "n\u00e9anmoins",
87    "n\u00f4tre", "n\u00f4tres", ON, "ont", "ou", "outre", "o\u00f9", "par", "parmi",
88    "partant", "pas", "pass\u00e9", "pendant", "plein", "plus", "plusieurs", "pour",
89    "pourquoi", "proche", "pr\u00e8s", "puisque", "qu", "quand", "que", "quel", "quelle",
90    "quelles", "quels", "qui", "quoi", "quoique", "revoici", "revoil\u00e0", "s", "sa",
91    "sans", "sauf", "se", "selon", "seront", "ses", "si", "sien", "sienne", "siennes",
92    "siens", "sinon", "soi", "soit", "son", "sont", "sous", "suivant", "sur", "ta", "te",
93    "tes", "tien", "tienne", "tiennes", "tiens", "toi", "ton", "tous", "tout", "toute",
94    "toutes", "tu", "un", "une", "va", "vers", "voici", "voil\u00e0", "vos", "votre", "vous",
95    "vu", "v\u00f4tre", "v\u00f4tres", "y", "\u00e0", "\u00e7a", "\u00e8s", "\u00e9t\u00e9",
96    "\u00eatre", "\u00f4", "avez", "parce", "suis"};
97   
98    /** English words that should be ignored since they are very common and add no value. */
99    private static final String[] ENGLISH_STOP_WORDS = {
100    "the", "of", "and", A, "to", "in", "is", "you", "that", "it", "he", "was", "for", ON,
101    "are", "as", "with", "his", "they", "I", "at", "be", "this", "have", "from", "or", "one",
102    "had", "by", "but", "not", "what", "all", "were", "we", "when", "your", "can", "said",
103    "there", "use", "an", "each", "which", "she", "do", "how", "their", "if", "will", "up",
104    "other", "about", "out", "many", "then", "them", "these", "so", "some", "her", "would",
105    "make", "like", "him", "into", "time", "has", "look", "two", "more", "go", "see", "no",
106    "way", "could", "my", "than", "first", "been", "call", "who", "its", "now", "find", "long",
107    "down", "day", "did", "get", "come", "may"};
108   
109    /** The list of words (or, more generally, tokens) that should be ignored. */
110    private List<String> ignoreList = Collections.synchronizedList(new ArrayList<String>());
111   
112    /** The list of words that should never be ignored, even if they're placed in {@link #ignoreList}. */
113    private List<String> dontignoreList = Collections.synchronizedList(new ArrayList<String>());
114   
115    /** The maximum number of tags to generate in the tag cloud. */
116    private int maximumNumberOfTags = 100;
117   
118    /** The maximum size of a tag, corresponding to the most common tag in the document list. */
119    private int maxTagSize = 64;
120   
121    /** The minimum size of a tag, corresponding to the least common tag that gets included in the tag cloud. */
122    private int minTagSize = 12;
123   
124    /**
125    * The mandatory plugin constructor, this is the method called (through reflection) by the plugin manager.
126    *
127    * @param name the plugin name, usually ignored, since plugins have a fixed name
128    * @param className the name of this class, ignored
129    * @param context the current request context
130    */
131  0 public AutoTagPlugin(String name, String className, XWikiContext context)
132    {
133  0 super(name, className, context);
134  0 init(context);
135    }
136   
137  0 @Override
138    public void init(XWikiContext context)
139    {
140  0 super.init(context);
141    }
142   
143  0 @Override
144    public String getName()
145    {
146  0 return PLUGIN_NAME;
147    }
148   
149  0 @Override
150    public AutoTagPluginAPI getPluginApi(XWikiPluginInterface plugin, XWikiContext context)
151    {
152  0 return new AutoTagPluginAPI((AutoTagPlugin) plugin, context);
153    }
154   
155    /**
156    * Analyze a piece of text, and extract the most common words into a "tag cloud". In detail, this splits the text
157    * into tokens, counts how many times each token appears in the text, removes the "stop-words", joins together words
158    * from the same root (stemming), and prepares an HTML tag cloud which can be printed in the response.
159    *
160    * @param text the text to analyze
161    * @param lang the language in which the text is written, {@code 0} for French or {@code 1} for English
162    * @return the resulting TagCloud with all the analyzed data, including the HTML tag cloud
163    */
164  0 public TagCloud generateTagCloud(String text, int lang)
165    {
166  0 TagCloud tagcloud = countWords(text, lang);
167  0 calculateTags(tagcloud);
168  0 return tagcloud;
169    }
170   
171    /**
172    * Analyze a piece of text, splitting it into individual words and counting their frequencies. In detail, this splits
173    * the text into tokens, counts how many times each token appears in the text, removes the "stop-words", and joins
174    * together words from the same root (stemming). {@link #generateTagCloud(String, int)} also prepares an HTML tag
175    * cloud which can be printed in the response.
176    *
177    * @param text the text to analyze
178    * @param lang the language, {@code 0} for French or {@code 1} for English
179    * @return the resulting TagCloud with all the analyzed data, except the HTML tag cloud
180    */
181  0 public TagCloud countWords(String text, int lang)
182    {
183  0 TagCloud tagcloud = new TagCloud();
184  0 tagcloud.setText(text);
185   
186  0 splitWords(tagcloud);
187  0 countWords(tagcloud);
188  0 clearStopWords(tagcloud, lang);
189  0 stemmer(tagcloud, lang);
190  0 return tagcloud;
191    }
192   
193    // Utility methods
194   
195    /**
196    * Return a sorted copy of a set.
197    *
198    * @param <T> the type of the items in the set
199    * @param oSet the set containing the values to sort; it is not affected in any way by this method
200    * @return a new sorted set containing all the values in the input set
201    */
202  0 public static <T extends Comparable<T>> SortedSet<T> sortSet(Set<T> oSet)
203    {
204  0 return new TreeSet<T>(oSet);
205    }
206   
207    /**
208    * Return a copy of a map, sorted in ascending order of their values.
209    *
210    * @param <K> the type of the map keys
211    * @param <V> the type of the map values
212    * @param hmap the map containing the entries to sort; it is not affected in any way by this method
213    * @return a new sorted map containing all the entries in the input map
214    */
215  0 public static <K, V> Map<K, V> sortMap(Map<K, V> hmap)
216    {
217  0 Map<K, V> map = new LinkedHashMap<K, V>();
218   
219  0 List<K> mapKeys = new ArrayList<K>(hmap.keySet());
220  0 List<V> mapValues = new ArrayList<V>(hmap.values());
221   
222  0 Set<V> sortedSet = new TreeSet<V>(mapValues);
223  0 @SuppressWarnings("unchecked")
224    V[] sortedArray = (V[]) sortedSet.toArray();
225   
226  0 for (int i = 0; i < sortedArray.length; i++) {
227  0 for (int j = 0; j < mapValues.size(); j++) {
228  0 if (mapValues.get(j).equals(sortedArray[i])) {
229  0 map.put(mapKeys.get(j), sortedArray[i]);
230    }
231    }
232    }
233  0 return map;
234    }
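// Illustrative example (hypothetical values): sortMap({"a"=3, "b"=1, "c"=2}) returns a LinkedHashMap
// ordered by value: {"b"=1, "c"=2, "a"=3}. Keys that share a value are emitted together, once per
// matching key, so no entry is lost even though the TreeSet de-duplicates the values themselves.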
235   
236    /**
237    * Get the identifier corresponding to the given two-letter language code. Currently the only supported values are
238    * "en" and "fr".
239    *
240    * @param lang a two-letter ISO 639-1 language code
241    * @return {@code 0} for French ("fr"), or {@code 1} for English ("en") and any other value
242    */
243  0 public int getLanguageConstant(String lang)
244    {
245  0 if (lang.trim().equalsIgnoreCase("fr")) {
246  0 return AutoTagPlugin.LANG_FRENCH;
247    }
248    // default English
249  0 return AutoTagPlugin.LANG_ENGLISH;
250    }
251   
252    /**
253    * Split the text into tokens. Newlines, spaces, tabs, comma, dot, semi-colon, colon, exclamation, question mark,
254    * and apostrophe are considered separators.
255    *
256    * @param tagCloud the instance to process
257    * @return the resulting list of tokens, which is also stored in the instance {@link TagCloud#getWordList()
258    * TagCloud}
259    */
260  0 private String[] splitWords(TagCloud tagCloud)
261    {
262  0 String text = tagCloud.getText();
263  0 text = text.replaceAll("\n", " ");
264  0 text = text.replaceAll("\r", " ");
265  0 text = text.replaceAll("'", " ");
266  0 text = text.replaceAll("\u0092", " ");
267  0 text = text.toLowerCase();
268  0 String[] words = text.split("[\\s,.;:!\\?]+");
269  0 tagCloud.setWordList(words);
270  0 return words;
271    }
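// Illustrative example (hypothetical input): "Don't panic!" becomes "don t panic!" after the
// replacements and lower-casing, and is then split into the tokens ["don", "t", "panic"].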
272   
273    /**
274    * Count all the appearances of each token extracted from the text. This method must be called after
275    * {@link #splitWords(TagCloud)}.
276    *
277    * @param tagCloud the instance to process
278    * @return the resulting map of {@code token->number of appearances} count for each token present in the text, which
279    * is also stored in the instance {@link TagCloud#getCountedWordMap() TagCloud}
280    */
281  0 private Map<String, Integer> countWords(TagCloud tagCloud)
282    {
283  0 String[] words = tagCloud.getWordList();
284  0 Map<String, Integer> wordsCount = new HashMap<String, Integer>();
285   
286  0 for (int i = 0; i < words.length; i++) {
287  0 String word = words[i];
288  0 if (!wordsCount.containsKey(word)) {
289  0 wordsCount.put(word, Integer.valueOf(0));
290    }
291  0 wordsCount.put(word, Integer.valueOf((wordsCount.get(word)).intValue() + 1));
292    }
293  0 tagCloud.setCountedWordMap(wordsCount);
294  0 return wordsCount;
295    }
296   
297    /**
298    * Remove "stop words": very common words that should be ignored since they add no value. This
299    * method must be called after {@link #countWords(TagCloud)}.
300    *
301    * @param tagCloud the instance to process
302    * @param lang the language in which the text is written, {@code 0} for French or {@code 1} for English
303    * @return the resulting map of {@code token->number of appearances} count for each token present in the text, which
304    * is also stored in the instance {@link TagCloud#getCountedWordMap() TagCloud}
305    */
306  0 private Map<String, Integer> clearStopWords(TagCloud tagCloud, int lang)
307    {
308  0 Map<String, Integer> words = tagCloud.getCountedWordMap();
309  0 String[] stopWordsArray = new String[0];
310  0 switch (lang) {
311  0 case LANG_ENGLISH:
312  0 stopWordsArray = ENGLISH_STOP_WORDS;
313  0 break;
314  0 case LANG_FRENCH:
315  0 stopWordsArray = FRENCH_STOP_WORDS;
316  0 break;
317  0 default:
318    // nothing
319  0 break;
320    }
321  0 for (String word : stopWordsArray) {
322  0 words.remove(word);
323    }
324   
325  0 Set<String> ignored = new HashSet<String>();
326  0 for (String word : words.keySet()) {
327  0 if (SPECIAL_CHARS.matcher(word).find()) {
328  0 ignored.add(word);
329    }
330    }
331   
332  0 for (String word : this.ignoreList) {
333  0 if (!this.dontignoreList.contains(word)) {
334  0 words.remove(word);
335    }
336    }
337  0 for (String word : ignored) {
338  0 if (!this.dontignoreList.contains(word)) {
339  0 words.remove(word);
340    }
341    }
342   
343  0 tagCloud.setCountedWordMap(words);
344  0 return words;
345    }
346   
347    /**
348    * Group tokens based on their common stem. For example, "hand" and "hands" both refer to the same term, "hand",
349    * thus they will be grouped together under the "hand" stem. This method must be called after
350    * {@link #countWords(TagCloud)} or {@link #clearStopWords(TagCloud, int)}.
351    *
352    * @param tagCloud the instance to process
353    * @param lang the language in which the text is written, {@code 0} for French or {@code 1} for English
354    * @return the resulting list of token groups, which is also stored in the instance
355    * {@link TagCloud#getStemmedWordMap() TagCloud}
356    */
357  0 private Map<String, Map<String, Integer>> stemmer(TagCloud tagCloud, int lang)
358    {
359  0 Map<String, Integer> words = tagCloud.getCountedWordMap();
360    // SnowballProgram stemmer;
361    // if (lang == LANG_FRENCH) {
362    // stemmer = new frenchStemmer();
363    // } else {
364    // stemmer = new englishStemmer();
365    // }
366  0 FrenchStemmer stemmer;
367  0 Map<String, Map<String, Integer>> stemmedWordMap = new HashMap<String, Map<String, Integer>>();
368   
369  0 stemmer = new FrenchStemmer();
370   
371  0 for (String word : words.keySet()) {
372  0 if (word.length() <= 2) {
373  0 continue;
374    }
375  0 String stemmedWord = stemmer.stem(word);
376  0 if (!stemmedWordMap.containsKey(stemmedWord)) {
377  0 stemmedWordMap.put(stemmedWord, new HashMap<String, Integer>());
378    }
379  0 stemmedWordMap.get(stemmedWord).put(word, words.get(word));
380    }
381  0 tagCloud.setStemmedWordMap(stemmedWordMap);
382  0 return stemmedWordMap;
383    }
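// Illustrative example (hypothetical counts): if "tag" and "tags" are reduced to the same stem and
// appear 3 and 2 times respectively, stemmedWordMap gains a single entry mapping that stem to
// {"tag"=3, "tags"=2}. Note that a FrenchStemmer is used whatever the lang parameter is; the
// commented-out block above shows the originally intended per-language stemmer selection.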
384   
385    /**
386    * Determine which are the most frequent {@link #maximumNumberOfTags} tokens and build a tag cloud using their
387    * relative frequencies. This method must be called after {@link #stemmer(TagCloud, int)}.
388    *
389    * @param tagCloud the instance to process
390    * @return the resulting set of tags, which is also stored in the instance {@link TagCloud#getTags() TagCloud}
391    */
392  0 private Set<Tag> calculateTags(TagCloud tagCloud)
393    {
394  0 Map<String, Map<String, Integer>> stemmedWords = tagCloud.getStemmedWordMap();
395  0 Map<String, Integer> stemmedWordFreqMap = new HashMap<String, Integer>();
396   
397    // Determine the "lead" word for each stem as the most common token by comparing the frequency of each sub-token
398    // Calculate the total frequency of each lead word as the sum of the frequencies of all tokens having that stem
399  0 for (Map.Entry<String, Map<String, Integer>> stemmedWord : stemmedWords.entrySet()) {
400  0 Integer totalFreqency = Integer.valueOf(0);
401  0 String leadWord = "";
402  0 Integer leadFrequency = Integer.valueOf(0);
403   
404  0 Map<String, Integer> wordMap = stemmedWord.getValue();
405   
406  0 for (Map.Entry<String, Integer> word : wordMap.entrySet()) {
407  0 Integer frequency = word.getValue();
408   
409  0 totalFreqency = Integer.valueOf(frequency.intValue() + totalFreqency.intValue());
410   
411  0 if (frequency.intValue() > leadFrequency.intValue()) {
412  0 leadFrequency = word.getValue();
413  0 leadWord = word.getKey();
414    }
415    }
416  0 stemmedWordFreqMap.put(leadWord, totalFreqency);
417    }
418  0 tagCloud.setStemmedWordFreqMap(stemmedWordFreqMap);
419  0 return calculateTagSizes(tagCloud);
420    }
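// Worked example (hypothetical counts): for a stem whose tokens are {"hand"=3, "hands"=2}, the lead
// word is "hand" (the most frequent token) and stemmedWordFreqMap receives the entry "hand" -> 5.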
421   
422    /**
423    * Build a tag cloud using the relative frequencies of the selected tags. This method must be called by
424    * {@link #calculateTags(TagCloud)}.
425    *
426    * @param tagCloud the instance to process
427    * @return the resulting set of tags, which is also stored in the instance {@link TagCloud#getTags() TagCloud}
428    */
429  0 private Set<Tag> calculateTagSizes(TagCloud tagCloud)
430    {
431  0 Map<String, Integer> stemmedWordFreqMap = tagCloud.getStemmedWordFreqMap();
432    // If there's no text, just use an empty set of tags and return
433  0 if (stemmedWordFreqMap == null || stemmedWordFreqMap.size() == 0) {
434  0 tagCloud.setTags(new TreeSet<Tag>());
435  0 return tagCloud.getTags();
436    }
437    // We order the list by the value to select the most frequent tags
438  0 Map<String, Integer> orderedMap = sortMap(stemmedWordFreqMap);
439   
440  0 Map<String, Integer> tagMap = new LinkedHashMap<String, Integer>();
441   
442  0 int i = 0;
443  0 for (Entry<String, Integer> word : orderedMap.entrySet()) {
444  0 tagMap.put(word.getKey(), word.getValue());
445  0 if (++i > this.maximumNumberOfTags) {
446  0 break;
447    }
448    }
449  0 Integer[] freqs = tagMap.values().toArray(new Integer[0]);
450   
451  0 Integer minFreq = freqs[0];
452  0 Integer maxFreq = freqs[freqs.length - 1];
453   
454  0 int ftot = 0;
455   
456  0 for (Integer f : freqs) {
457  0 ftot += f.intValue();
458    }
459   
460  0 SortedSet<Tag> tagSet = new TreeSet<Tag>();
461   
462  0 for (String tagName : sortSet(tagMap.keySet())) {
463  0 long size = getTagSize(tagMap.get(tagName), maxFreq, minFreq, ftot);
464  0 Tag tag = new Tag(tagName, size);
465  0 tagSet.add(tag);
466    }
467  0 tagCloud.setTags(tagSet);
468  0 return tagSet;
469    }
470   
471    /**
472    * Get the size that corresponds to a given tag popularity, relative to all the other tag frequencies.
473    *
474    * @param tagOccurrences the number of occurrences of the tag
475    * @param maxOccurrences the maximum number of occurrences among all tags
476    * @param minOccurrences the minimum number of occurrences among all tags
477    * @param totalOccurrences the total number of occurrences of all the tags
478    * @return a number between {@link #minTagSize} and {@link #maxTagSize} corresponding to the relative popularity of
479    * this tag compared to all the other tags
480    */
481  0 private long getTagSize(double tagOccurrences, double maxOccurrences, double minOccurrences,
482    double totalOccurrences)
483    {
484    // The number of available tag sizes
485  0 int fontRange = this.maxTagSize - this.minTagSize;
486   
487    // tweak this if all the words seem too similar in size or extremely different
488    // rely on the cumulative by x% (0 = 0%, 1 = 100%)
489  0 double cumulativeImportance = 0.7;
490   
491    // sizes based on word's frequency vs total/cumulative frequency
492  0 double sumpx = ((fontRange * cumulativeImportance) + 1) * (fontRange * cumulativeImportance) / 2;
493  0 double px = tagOccurrences / totalOccurrences * sumpx;
494   
495    // sizes based on word's frequency deviation from max/min frequencies
496  0 px += Math.pow((tagOccurrences - minOccurrences)
497  0 / (1 > maxOccurrences - minOccurrences ? 1 : maxOccurrences - minOccurrences), 0.8)
498    * (fontRange * (1 - cumulativeImportance));
499  0 double result = this.maxTagSize < px + this.minTagSize ? this.maxTagSize : px + this.minTagSize;
500  0 return Math.round(result);
501    }
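// Worked example (hypothetical frequencies, default sizes 12..64): fontRange = 52 and
// sumpx = (52 * 0.7 + 1) * (52 * 0.7) / 2 = 680.68. With totalOccurrences = 50, minOccurrences = 1
// and maxOccurrences = 10, a tag seen 10 times gets px = 0.2 * 680.68 + 1 * 15.6 ≈ 151.7 and is
// clamped to maxTagSize = 64, while a tag seen once gets px = 0.02 * 680.68 ≈ 13.61 and a final
// size of round(13.61 + 12) = 26.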
502    }
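For reference, a minimal sketch of driving the plugin directly from Java (hypothetical helper class; it assumes an XWikiContext is already available, for instance inside another plugin):

import java.util.Map;

import com.xpn.xwiki.XWikiContext;
import com.xpn.xwiki.plugin.autotag.AutoTagPlugin;
import com.xpn.xwiki.plugin.autotag.TagCloud;

public class TagCloudExample
{
    /** Prints each significant word of the text together with its frequency. */
    public static void printWordCounts(String text, XWikiContext context)
    {
        AutoTagPlugin plugin = new AutoTagPlugin("autotag", AutoTagPlugin.class.getName(), context);

        // countWords() runs the pipeline up to stemming; generateTagCloud() would additionally
        // compute the tag sizes used for the HTML cloud.
        TagCloud cloud = plugin.countWords(text, AutoTagPlugin.LANG_ENGLISH);

        for (Map.Entry<String, Integer> entry : cloud.getCountedWordMap().entrySet()) {
            System.out.println(entry.getKey() + ": " + entry.getValue());
        }
    }
}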