View Javadoc

1   /***
2    * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
3    */
4   package net.sourceforge.pmd.cpd;
5   
6   import java.util.ArrayList;
7   import java.util.Collections;
8   import java.util.HashMap;
9   import java.util.HashSet;
10  import java.util.Iterator;
11  import java.util.List;
12  import java.util.Map;
13  import java.util.Set;
14  
15  public class MatchCollector {
16  
17      private MatchAlgorithm ma;
18      private Map startMap = new HashMap();
19      private Map fileMap = new HashMap();
20  
21      public MatchCollector(MatchAlgorithm ma) {
22          this.ma = ma;
23      }
24  
25      public void collect(int minimumLength, List marks) {
26          //first get a pairwise collection of all maximal matches
27          for (int i = 0; i < marks.size() - 1; i++) {
28              TokenEntry mark1 = (TokenEntry) marks.get(i);
29              for (int j = i + 1; j < marks.size(); j++) {
30  				TokenEntry mark2 = (TokenEntry) marks.get(j);
31  				int diff = mark1.getIndex() - mark2.getIndex();
32                  if (-diff < minimumLength) {
33                      continue;
34                  }
35                  if (hasPreviousDupe(mark1, mark2)) {
36                      continue;
37                  }
38                  int dupes = countDuplicateTokens(mark1, mark2);
39                  //false positive check
40                  if (dupes < minimumLength) {
41                      continue;
42                  }
43                  //is it still too close together
44                  if (diff + dupes >= 1) {
45                      continue;
46                  }
47                  determineMatch(mark1, mark2, dupes);
48              }
49          }
50      }
51      
52      public List getMatches() {
53  		ArrayList matchList = new ArrayList(startMap.values());
54  		groupMatches(matchList);
55  		return matchList;
56      }
57  
58      /***
59       * A greedy algorithm for determining non-overlapping matches
60       */
61      private void determineMatch(TokenEntry mark1, TokenEntry mark2, int dupes) {
62          Match match = new Match(dupes, mark1, mark2);
63          String fileKey = mark1.getTokenSrcID() + mark2.getTokenSrcID();
64          ArrayList pairMatches = (ArrayList) fileMap.get(fileKey);
65          if (pairMatches == null) {
66              pairMatches = new ArrayList();
67              fileMap.put(fileKey, pairMatches);
68          }
69          boolean add = true;
70          for (int k = 0; k < pairMatches.size(); k++) {
71              Match other = (Match) pairMatches.get(k);
72              if (other.getFirstMark().getIndex() + other.getTokenCount() - mark1.getIndex()
73                  > 0) {
74                  boolean ordered = other.getSecondMark().getIndex() - mark2.getIndex() < 0;
75                  if ((ordered && (other.getEndIndex() - mark2.getIndex() > 0))
76                      || (!ordered && (match.getEndIndex() - other.getSecondMark().getIndex()) > 0)) {
77                      if (other.getTokenCount() >= match.getTokenCount()) {
78                          add = false;
79                          break;
80                      } else {
81                          pairMatches.remove(k);
82                          startMap.remove(other.getMatchCode());
83                      }
84                  }
85              }
86          }
87          if (add) {
88              pairMatches.add(match);
89              startMap.put(match.getMatchCode(), match);
90          }
91      }
92  
93      private void groupMatches(ArrayList matchList) {
94          Collections.sort(matchList);
95          HashSet matchSet = new HashSet();
96          Match.MatchCode matchCode = new Match.MatchCode();
97          for (int i = matchList.size(); i > 1; i--) {
98              Match match1 = (Match) matchList.get(i - 1);
99              TokenEntry mark1 = (TokenEntry) match1.getMarkSet().iterator().next();
100             matchSet.clear();
101             matchSet.add(match1.getMatchCode());
102             for (int j = i - 1; j > 0; j--) {
103                 Match match2 = (Match) matchList.get(j - 1);
104                 if (match1.getTokenCount() != match2.getTokenCount()) {
105                     break;
106                 }
107                 TokenEntry mark2 = null;
108                 for (Iterator iter = match2.getMarkSet().iterator(); iter.hasNext();) {
109                     mark2 = (TokenEntry) iter.next();
110                     if (mark2 != mark1) {
111                         break;
112                     }
113                 }
114                 int dupes = countDuplicateTokens(mark1, mark2);
115                 if (dupes < match1.getTokenCount()) {
116                     break;
117                 }
118                 matchSet.add(match2.getMatchCode());
119                 match1.getMarkSet().addAll(match2.getMarkSet());
120                 matchList.remove(i - 2);
121                 i--;
122             }
123             if (matchSet.size() == 1) {
124                 continue;
125             }
126             //prune the mark set
127             Set pruned = match1.getMarkSet();
128             boolean done = false;
129             ArrayList a1 = new ArrayList(match1.getMarkSet());
130             Collections.sort(a1);
131             for (int outer = 0; outer < a1.size() - 1 && !done; outer++) {
132                 TokenEntry cmark1 = (TokenEntry) a1.get(outer);
133                 for (int inner = outer + 1; inner < a1.size() && !done; inner++) {
134                     TokenEntry cmark2 = (TokenEntry) a1.get(inner);
135                     matchCode.setFirst(cmark1.getIndex());
136                     matchCode.setSecond(cmark2.getIndex());
137                     if (!matchSet.contains(matchCode)) {
138                         if (pruned.size() > 2) {
139                             pruned.remove(cmark2);
140                         }
141                         if (pruned.size() == 2) {
142                             done = true;
143                         }
144                     }
145                 }
146             }
147         }
148     }
149 
150     private boolean hasPreviousDupe(TokenEntry mark1, TokenEntry mark2) {
151         if (mark1.getIndex() == 0) {
152             return false;
153         }
154         return !matchEnded(ma.tokenAt(-1, mark1), ma.tokenAt(-1, mark2));
155     }
156 
157     private int countDuplicateTokens(TokenEntry mark1, TokenEntry mark2) {
158         int index = 0;
159         while (!matchEnded(ma.tokenAt(index, mark1), ma.tokenAt(index, mark2))) {
160             index++;
161         } 
162         return index;
163     }
164 
165     private boolean matchEnded(TokenEntry token1, TokenEntry token2) {
166         return token1.getIdentifier() != token2.getIdentifier() || token1 == TokenEntry.EOF || token2 == TokenEntry.EOF;
167     }
168 }