1 |
|
|
2 |
|
|
3 |
|
|
4 |
|
|
5 |
|
|
6 |
|
|
7 |
|
|
8 |
|
|
9 |
|
|
10 |
|
|
11 |
|
|
12 |
|
|
13 |
|
|
14 |
|
|
15 |
|
|
16 |
|
|
17 |
|
|
18 |
|
|
19 |
|
|
20 |
|
package com.xpn.xwiki.plugin.autotag; |
21 |
|
|
22 |
|
import java.util.ArrayList; |
23 |
|
import java.util.Collections; |
24 |
|
import java.util.HashMap; |
25 |
|
import java.util.HashSet; |
26 |
|
import java.util.LinkedHashMap; |
27 |
|
import java.util.List; |
28 |
|
import java.util.Map; |
29 |
|
import java.util.Set; |
30 |
|
import java.util.SortedSet; |
31 |
|
import java.util.TreeSet; |
32 |
|
import java.util.Map.Entry; |
33 |
|
import java.util.regex.Pattern; |
34 |
|
|
35 |
|
import com.xpn.xwiki.XWikiContext; |
36 |
|
import com.xpn.xwiki.plugin.XWikiDefaultPlugin; |
37 |
|
import com.xpn.xwiki.plugin.XWikiPluginInterface; |
38 |
|
|
39 |
|
|
40 |
|
|
41 |
|
|
42 |
|
@version |
43 |
|
@deprecated |
44 |
|
|
45 |
|
@Deprecated |
|
|
| 0% |
Uncovered Elements: 180 (180) |
Complexity: 35 |
Complexity Density: 0.27 |
|
46 |
|
public class AutoTagPlugin extends XWikiDefaultPlugin implements XWikiPluginInterface |
47 |
|
{ |
48 |
|
|
49 |
|
public static final int LANG_FRENCH = 0; |
50 |
|
|
51 |
|
|
52 |
|
public static final int LANG_ENGLISH = 1; |
53 |
|
|
54 |
|
|
55 |
|
|
56 |
|
|
57 |
|
|
58 |
|
private static final String PLUGIN_NAME = "autotag"; |
59 |
|
|
60 |
|
|
61 |
|
|
62 |
|
|
63 |
|
private static final Pattern SPECIAL_CHARS = Pattern.compile("<|>|=|/|\"|\u0093"); |
64 |
|
|
65 |
|
|
66 |
|
private static final String A = "a"; |
67 |
|
|
68 |
|
|
69 |
|
private static final String ON = "on"; |
70 |
|
|
71 |
|
|
72 |
|
private static final String[] FRENCH_STOP_WORDS = { |
73 |
|
A, "afin", "ai", "ainsi", "apr\u00e8s", "attendu", "au", "aujourd", "auquel", "aussi", |
74 |
|
"autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir", |
75 |
|
"c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain", |
76 |
|
"certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci", |
77 |
|
"combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout", |
78 |
|
"dedans", "dehors", "del\u00e0", "depuis", "derri\u00e8re", "des", "d\u00e9sormais", |
79 |
|
"desquelles", "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", |
80 |
|
"diverse", "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "d\u00e8s", |
81 |
|
"elle", "elles", "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", |
82 |
|
"except\u00e9", "hormis", "hors", "h\u00e9las", "hui", "il", "ils", "j", "je", "jusqu", |
83 |
|
"jusque", "l", "la", "laquelle", "le", "lequel", "les", "lesquelles", "lesquels", "leur", |
84 |
|
"leurs", "lorsque", "lui", "l\u00e0", "ma", "mais", "malgr\u00e9", "me", "merci", "mes", |
85 |
|
"mien", "mienne", "miennes", "miens", "moi", "moins", "mon", "moyennant", "m\u00eame", |
86 |
|
"m\u00eames", "n", "ne", "ni", "non", "nos", "notre", "nous", "n\u00e9anmoins", |
87 |
|
"n\u00f4tre", "n\u00f4tres", ON, "ont", "ou", "outre", "o\u00f9", "par", "parmi", |
88 |
|
"partant", "pas", "pass\u00e9", "pendant", "plein", "plus", "plusieurs", "pour", |
89 |
|
"pourquoi", "proche", "pr\u00e8s", "puisque", "qu", "quand", "que", "quel", "quelle", |
90 |
|
"quelles", "quels", "qui", "quoi", "quoique", "revoici", "revoil\u00e0", "s", "sa", |
91 |
|
"sans", "sauf", "se", "selon", "seront", "ses", "si", "sien", "sienne", "siennes", |
92 |
|
"siens", "sinon", "soi", "soit", "son", "sont", "sous", "suivant", "sur", "ta", "te", |
93 |
|
"tes", "tien", "tienne", "tiennes", "tiens", "toi", "ton", "tous", "tout", "toute", |
94 |
|
"toutes", "tu", "un", "une", "va", "vers", "voici", "voil\u00e0", "vos", "votre", "vous", |
95 |
|
"vu", "v\u00f4tre", "v\u00f4tres", "y", "\u00e0", "\u00e7a", "\u00e8s", "\u00e9t\u00e9", |
96 |
|
"\u00eatre", "\u00f4", "avez", "parce", "suis"}; |
97 |
|
|
98 |
|
|
99 |
|
private static final String[] ENGLISH_STOP_WORDS = { |
100 |
|
"the", "of", "and", A, "to", "in", "is", "you", "that", "it", "he", "was", "for", ON, |
101 |
|
"are", "as", "with", "his", "they", "I", "at", "be", "this", "have", "from", "or", "one", |
102 |
|
"had", "by", "but", "not", "what", "all", "were", "we", "when", "your", "can", "said", |
103 |
|
"there", "use", "an", "each", "which", "she", "do", "how", "their", "if", "will", "up", |
104 |
|
"other", "about", "out", "many", "then", "them", "these", "so", "some", "her", "would", |
105 |
|
"make", "like", "him", "into", "time", "has", "look", "two", "more", "go", "see", "no", |
106 |
|
"way", "could", "my", "than", "first", "been", "call", "who", "its", "now", "find", "long", |
107 |
|
"down", "day", "did", "get", "come", "may"}; |
108 |
|
|
109 |
|
|
110 |
|
private List<String> ignoreList = Collections.synchronizedList(new ArrayList<String>()); |
111 |
|
|
112 |
|
@link |
113 |
|
private List<String> dontignoreList = Collections.synchronizedList(new ArrayList<String>()); |
114 |
|
|
115 |
|
|
116 |
|
private int maximumNumberOfTags = 100; |
117 |
|
|
118 |
|
|
119 |
|
private int maxTagSize = 64; |
120 |
|
|
121 |
|
|
122 |
|
private int minTagSize = 12; |
123 |
|
|
124 |
|
|
125 |
|
|
126 |
|
|
127 |
|
@param |
128 |
|
@param |
129 |
|
@param |
130 |
|
|
|
|
| 0% |
Uncovered Elements: 2 (2) |
Complexity: 1 |
Complexity Density: 0.5 |
|
131 |
0 |
/**
 * Creates the plugin by delegating to the {@link XWikiDefaultPlugin} constructor and then running
 * {@link #init(XWikiContext)}.
 *
 * @param name the plugin name, forwarded to the superclass
 * @param className the plugin class name, forwarded to the superclass
 * @param context the XWiki context, forwarded to the superclass and to {@link #init(XWikiContext)}
 */
public AutoTagPlugin(String name, String className, XWikiContext context)
{
    super(name, className, context);
    this.init(context);
}
136 |
|
|
|
|
| 0% |
Uncovered Elements: 1 (1) |
Complexity: 1 |
Complexity Density: 1 |
|
137 |
0 |
@Override... |
138 |
|
public void init(XWikiContext context) |
139 |
|
{ |
140 |
0 |
super.init(context); |
141 |
|
} |
142 |
|
|
|
|
| 0% |
Uncovered Elements: 1 (1) |
Complexity: 1 |
Complexity Density: 1 |
|
143 |
0 |
@Override... |
144 |
|
public String getName() |
145 |
|
{ |
146 |
0 |
return PLUGIN_NAME; |
147 |
|
} |
148 |
|
|
|
|
| 0% |
Uncovered Elements: 1 (1) |
Complexity: 1 |
Complexity Density: 1 |
|
149 |
0 |
@Override... |
150 |
|
public AutoTagPluginAPI getPluginApi(XWikiPluginInterface plugin, XWikiContext context) |
151 |
|
{ |
152 |
0 |
return new AutoTagPluginAPI((AutoTagPlugin) plugin, context); |
153 |
|
} |
154 |
|
|
155 |
|
|
156 |
|
|
157 |
|
|
158 |
|
|
159 |
|
|
160 |
|
@param |
161 |
|
@param |
162 |
|
@return |
163 |
|
|
|
|
| 0% |
Uncovered Elements: 3 (3) |
Complexity: 1 |
Complexity Density: 0.33 |
|
164 |
0 |
public TagCloud generateTagCloud(String text, int lang)... |
165 |
|
{ |
166 |
0 |
TagCloud tagcloud = countWords(text, lang); |
167 |
0 |
calculateTags(tagcloud); |
168 |
0 |
return tagcloud; |
169 |
|
} |
170 |
|
|
171 |
|
|
172 |
|
|
173 |
|
|
174 |
|
@link |
175 |
|
|
176 |
|
|
177 |
|
@param |
178 |
|
@param |
179 |
|
@return |
180 |
|
|
|
|
| 0% |
Uncovered Elements: 7 (7) |
Complexity: 1 |
Complexity Density: 0.14 |
|
181 |
0 |
public TagCloud countWords(String text, int lang)... |
182 |
|
{ |
183 |
0 |
TagCloud tagcloud = new TagCloud(); |
184 |
0 |
tagcloud.setText(text); |
185 |
|
|
186 |
0 |
splitWords(tagcloud); |
187 |
0 |
countWords(tagcloud); |
188 |
0 |
clearStopWords(tagcloud, lang); |
189 |
0 |
stemmer(tagcloud, lang); |
190 |
0 |
return tagcloud; |
191 |
|
} |
192 |
|
|
193 |
|
|
194 |
|
|
195 |
|
|
196 |
|
|
197 |
|
|
198 |
|
@param |
199 |
|
@param |
200 |
|
@return |
201 |
|
|
|
|
| 0% |
Uncovered Elements: 1 (1) |
Complexity: 1 |
Complexity Density: 1 |
|
202 |
0 |
public static <T extends Comparable<T>> SortedSet<T> sortSet(Set<T> oSet)... |
203 |
|
{ |
204 |
0 |
return new TreeSet<T>(oSet); |
205 |
|
} |
206 |
|
|
207 |
|
|
208 |
|
|
209 |
|
|
210 |
|
@param |
211 |
|
@param |
212 |
|
@param |
213 |
|
@return |
214 |
|
|
|
|
| 0% |
Uncovered Elements: 16 (16) |
Complexity: 4 |
Complexity Density: 0.4 |
|
215 |
0 |
public static <K, V> Map<K, V> sortMap(Map<K, V> hmap)... |
216 |
|
{ |
217 |
0 |
Map<K, V> map = new LinkedHashMap<K, V>(); |
218 |
|
|
219 |
0 |
List<K> mapKeys = new ArrayList<K>(hmap.keySet()); |
220 |
0 |
List<V> mapValues = new ArrayList<V>(hmap.values()); |
221 |
|
|
222 |
0 |
Set<V> sortedSet = new TreeSet<V>(mapValues); |
223 |
0 |
@SuppressWarnings("unchecked") |
224 |
|
V[] sortedArray = (V[]) sortedSet.toArray(); |
225 |
|
|
226 |
0 |
for (int i = 0; i < sortedArray.length; i++) { |
227 |
0 |
for (int j = 0; j < mapValues.size(); j++) { |
228 |
0 |
if (mapValues.get(j).equals(sortedArray[i])) { |
229 |
0 |
map.put(mapKeys.get(j), sortedArray[i]); |
230 |
|
} |
231 |
|
} |
232 |
|
} |
233 |
0 |
return map; |
234 |
|
} |
235 |
|
|
236 |
|
|
237 |
|
|
238 |
|
|
239 |
|
|
240 |
|
@param |
241 |
|
@return |
242 |
|
|
|
|
| 0% |
Uncovered Elements: 5 (5) |
Complexity: 2 |
Complexity Density: 0.67 |
|
243 |
0 |
public int getLanguageConstant(String lang)... |
244 |
|
{ |
245 |
0 |
if (lang.trim().equalsIgnoreCase("fr")) { |
246 |
0 |
return AutoTagPlugin.LANG_FRENCH; |
247 |
|
} |
248 |
|
|
249 |
0 |
return AutoTagPlugin.LANG_ENGLISH; |
250 |
|
} |
251 |
|
|
252 |
|
|
253 |
|
|
254 |
|
|
255 |
|
|
256 |
|
@param |
257 |
|
@return@link |
258 |
|
|
259 |
|
|
|
|
| 0% |
Uncovered Elements: 9 (9) |
Complexity: 1 |
Complexity Density: 0.11 |
|
260 |
0 |
private String[] splitWords(TagCloud tagCloud)... |
261 |
|
{ |
262 |
0 |
String text = tagCloud.getText(); |
263 |
0 |
text = text.replaceAll("\n", " "); |
264 |
0 |
text = text.replaceAll("\r", " "); |
265 |
0 |
text = text.replaceAll("'", " "); |
266 |
0 |
text = text.replaceAll("\u0092", " "); |
267 |
0 |
text = text.toLowerCase(); |
268 |
0 |
String[] words = text.split("[\\s,.;:!\\?]+"); |
269 |
0 |
tagCloud.setWordList(words); |
270 |
0 |
return words; |
271 |
|
} |
272 |
|
|
273 |
|
|
274 |
|
|
275 |
|
@link |
276 |
|
|
277 |
|
@param |
278 |
|
@return |
279 |
|
@link |
280 |
|
|
|
|
| 0% |
Uncovered Elements: 13 (13) |
Complexity: 3 |
Complexity Density: 0.33 |
|
281 |
0 |
private Map<String, Integer> countWords(TagCloud tagCloud)... |
282 |
|
{ |
283 |
0 |
String[] words = tagCloud.getWordList(); |
284 |
0 |
Map<String, Integer> wordsCount = new HashMap<String, Integer>(); |
285 |
|
|
286 |
0 |
for (int i = 0; i < words.length; i++) { |
287 |
0 |
String word = words[i]; |
288 |
0 |
if (!wordsCount.containsKey(word)) { |
289 |
0 |
wordsCount.put(word, Integer.valueOf(0)); |
290 |
|
} |
291 |
0 |
wordsCount.put(word, Integer.valueOf((wordsCount.get(word)).intValue() + 1)); |
292 |
|
} |
293 |
0 |
tagCloud.setCountedWordMap(wordsCount); |
294 |
0 |
return wordsCount; |
295 |
|
} |
296 |
|
|
297 |
|
|
298 |
|
|
299 |
|
@link |
300 |
|
|
301 |
|
@param |
302 |
|
@param |
303 |
|
@return |
304 |
|
@link |
305 |
|
|
|
|
| 0% |
Uncovered Elements: 31 (31) |
Complexity: 6 |
Complexity Density: 0.24 |
|
306 |
0 |
private Map<String, Integer> clearStopWords(TagCloud tagCloud, int lang)... |
307 |
|
{ |
308 |
0 |
Map<String, Integer> words = tagCloud.getCountedWordMap(); |
309 |
0 |
String[] stopWordsArray = new String[0]; |
310 |
0 |
switch (lang) { |
311 |
0 |
case LANG_ENGLISH: |
312 |
0 |
stopWordsArray = ENGLISH_STOP_WORDS; |
313 |
0 |
break; |
314 |
0 |
case LANG_FRENCH: |
315 |
0 |
stopWordsArray = FRENCH_STOP_WORDS; |
316 |
0 |
break; |
317 |
0 |
default: |
318 |
|
|
319 |
0 |
break; |
320 |
|
} |
321 |
0 |
for (String word : stopWordsArray) { |
322 |
0 |
words.remove(word); |
323 |
|
} |
324 |
|
|
325 |
0 |
Set<String> ignored = new HashSet<String>(); |
326 |
0 |
for (String word : words.keySet()) { |
327 |
0 |
if (SPECIAL_CHARS.matcher(word).find()) { |
328 |
0 |
ignored.add(word); |
329 |
|
} |
330 |
|
} |
331 |
|
|
332 |
0 |
for (String word : this.ignoreList) { |
333 |
0 |
if (!this.dontignoreList.contains(word)) { |
334 |
0 |
words.remove(word); |
335 |
|
} |
336 |
|
} |
337 |
0 |
for (String word : ignored) { |
338 |
0 |
if (!this.dontignoreList.contains(word)) { |
339 |
0 |
words.remove(word); |
340 |
|
} |
341 |
|
} |
342 |
|
|
343 |
0 |
tagCloud.setCountedWordMap(words); |
344 |
0 |
return words; |
345 |
|
} |
346 |
|
|
347 |
|
|
348 |
|
|
349 |
|
|
350 |
|
@link@link |
351 |
|
|
352 |
|
@param |
353 |
|
@param |
354 |
|
@return |
355 |
|
@link |
356 |
|
|
|
|
| 0% |
Uncovered Elements: 17 (17) |
Complexity: 3 |
Complexity Density: 0.23 |
|
357 |
0 |
private Map<String, Map<String, Integer>> stemmer(TagCloud tagCloud, int lang)... |
358 |
|
{ |
359 |
0 |
Map<String, Integer> words = tagCloud.getCountedWordMap(); |
360 |
|
|
361 |
|
|
362 |
|
|
363 |
|
|
364 |
|
|
365 |
|
|
366 |
0 |
FrenchStemmer stemmer; |
367 |
0 |
Map<String, Map<String, Integer>> stemmedWordMap = new HashMap<String, Map<String, Integer>>(); |
368 |
|
|
369 |
0 |
stemmer = new FrenchStemmer(); |
370 |
|
|
371 |
0 |
for (String word : words.keySet()) { |
372 |
0 |
if (word.length() <= 2) { |
373 |
0 |
continue; |
374 |
|
} |
375 |
0 |
String stemmedWord = stemmer.stem(word); |
376 |
0 |
if (!stemmedWordMap.containsKey(stemmedWord)) { |
377 |
0 |
stemmedWordMap.put(stemmedWord, new HashMap<String, Integer>()); |
378 |
|
} |
379 |
0 |
stemmedWordMap.get(stemmedWord).put(word, words.get(word)); |
380 |
|
} |
381 |
0 |
tagCloud.setStemmedWordMap(stemmedWordMap); |
382 |
0 |
return stemmedWordMap; |
383 |
|
} |
384 |
|
|
385 |
|
|
386 |
|
@link |
387 |
|
@link |
388 |
|
|
389 |
|
@param |
390 |
|
@return@link |
391 |
|
|
|
|
| 0% |
Uncovered Elements: 18 (18) |
Complexity: 2 |
Complexity Density: 0.12 |
|
392 |
0 |
private Set<Tag> calculateTags(TagCloud tagCloud)... |
393 |
|
{ |
394 |
0 |
Map<String, Map<String, Integer>> stemmedWords = tagCloud.getStemmedWordMap(); |
395 |
0 |
Map<String, Integer> stemmedWordFreqMap = new HashMap<String, Integer>(); |
396 |
|
|
397 |
|
|
398 |
|
|
399 |
0 |
for (Map.Entry<String, Map<String, Integer>> stemmedWord : stemmedWords.entrySet()) { |
400 |
0 |
Integer totalFreqency = Integer.valueOf(0); |
401 |
0 |
String leadWord = ""; |
402 |
0 |
Integer leadFrequency = Integer.valueOf(0); |
403 |
|
|
404 |
0 |
Map<String, Integer> wordMap = stemmedWord.getValue(); |
405 |
|
|
406 |
0 |
for (Map.Entry<String, Integer> word : wordMap.entrySet()) { |
407 |
0 |
Integer frequency = word.getValue(); |
408 |
|
|
409 |
0 |
totalFreqency = Integer.valueOf(frequency.intValue() + totalFreqency.intValue()); |
410 |
|
|
411 |
0 |
if (frequency.intValue() > leadFrequency.intValue()) { |
412 |
0 |
leadFrequency = word.getValue(); |
413 |
0 |
leadWord = word.getKey(); |
414 |
|
} |
415 |
|
} |
416 |
0 |
stemmedWordFreqMap.put(leadWord, totalFreqency); |
417 |
|
} |
418 |
0 |
tagCloud.setStemmedWordFreqMap(stemmedWordFreqMap); |
419 |
0 |
return calculateTagSizes(tagCloud); |
420 |
|
} |
421 |
|
|
422 |
|
|
423 |
|
|
424 |
|
@link |
425 |
|
|
426 |
|
@param |
427 |
|
@return@link |
428 |
|
|
|
|
| 0% |
Uncovered Elements: 28 (28) |
Complexity: 4 |
Complexity Density: 0.17 |
|
429 |
0 |
private Set<Tag> calculateTagSizes(TagCloud tagCloud)... |
430 |
|
{ |
431 |
0 |
Map<String, Integer> stemmedWordFreqMap = tagCloud.getStemmedWordFreqMap(); |
432 |
|
|
433 |
0 |
if (stemmedWordFreqMap == null || stemmedWordFreqMap.size() == 0) { |
434 |
0 |
tagCloud.setTags(new TreeSet<Tag>()); |
435 |
0 |
return tagCloud.getTags(); |
436 |
|
} |
437 |
|
|
438 |
0 |
Map<String, Integer> orderedMap = sortMap(stemmedWordFreqMap); |
439 |
|
|
440 |
0 |
Map<String, Integer> tagMap = new LinkedHashMap<String, Integer>(); |
441 |
|
|
442 |
0 |
int i = 0; |
443 |
0 |
for (Entry<String, Integer> word : orderedMap.entrySet()) { |
444 |
0 |
tagMap.put(word.getKey(), word.getValue()); |
445 |
0 |
if (++i > this.maximumNumberOfTags) { |
446 |
0 |
break; |
447 |
|
} |
448 |
|
} |
449 |
0 |
Integer[] freqs = tagMap.values().toArray(new Integer[0]); |
450 |
|
|
451 |
0 |
Integer minFreq = freqs[0]; |
452 |
0 |
Integer maxFreq = freqs[freqs.length - 1]; |
453 |
|
|
454 |
0 |
int ftot = 0; |
455 |
|
|
456 |
0 |
for (Integer f : freqs) { |
457 |
0 |
ftot += f.intValue(); |
458 |
|
} |
459 |
|
|
460 |
0 |
SortedSet<Tag> tagSet = new TreeSet<Tag>(); |
461 |
|
|
462 |
0 |
for (String tagName : sortSet(tagMap.keySet())) { |
463 |
0 |
long size = getTagSize(tagMap.get(tagName), maxFreq, minFreq, ftot); |
464 |
0 |
Tag tag = new Tag(tagName, size); |
465 |
0 |
tagSet.add(tag); |
466 |
|
} |
467 |
0 |
tagCloud.setTags(tagSet); |
468 |
0 |
return tagSet; |
469 |
|
} |
470 |
|
|
471 |
|
|
472 |
|
|
473 |
|
|
474 |
|
@param |
475 |
|
@param |
476 |
|
@param |
477 |
|
@param |
478 |
|
@return@link@link |
479 |
|
|
480 |
|
|
|
|
| 0% |
Uncovered Elements: 11 (11) |
Complexity: 3 |
Complexity Density: 0.43 |
|
481 |
0 |
private long getTagSize(double tagOccurrences, double maxOccurrences, double minOccurrences,... |
482 |
|
double totalOccurrences) |
483 |
|
{ |
484 |
|
|
485 |
0 |
int fontRange = this.maxTagSize - this.minTagSize; |
486 |
|
|
487 |
|
|
488 |
|
|
489 |
0 |
double cumulativeImportance = 0.7; |
490 |
|
|
491 |
|
|
492 |
0 |
double sumpx = ((fontRange * cumulativeImportance) + 1) * (fontRange * cumulativeImportance) / 2; |
493 |
0 |
double px = tagOccurrences / totalOccurrences * sumpx; |
494 |
|
|
495 |
|
|
496 |
0 |
px += Math.pow((tagOccurrences - minOccurrences) |
497 |
0 |
/ (1 > maxOccurrences - minOccurrences ? 1 : maxOccurrences - minOccurrences), 0.8) |
498 |
|
* (fontRange * (1 - cumulativeImportance)); |
499 |
0 |
double result = this.maxTagSize < px + this.minTagSize ? this.maxTagSize : px + this.minTagSize; |
500 |
0 |
return Math.round(result); |
501 |
|
} |
502 |
|
} |