Source code for plag

from __future__ import print_function
from errors import NoValidArgumentError, OutOfRangeError
from PlagResult import PlagResult



#Module-level state shared by run(), RKR_GST(), scanpattern() and markstrings():
#matchList -- list of queues containing matches
#tiles     -- list of tiles created from those matches
#Both are (re)initialised in run() via the 'global' statement.

def run(s1, s2, mML=3, treshold=0.5):
    '''Runs a comparison on the two given strings s1 and s2 and returns a
    PlagResult object containing the similarity value, the similarities as a
    list of tiles, and a boolean value indicating suspected plagiarism.

    :Argument1: s1 {string} -- string 1
    :Argument2: s2 {string} -- string 2
    :Argument3: mML {number} -- minimumMatchingLength (default: {3})
    :Argument4: treshold {number} -- a value between 0 and 1 that determines
        whether a comparison between strings should be marked as plagiarised
        (default: {0.5})
    :Returns: object -- PlagResult
    :Raises: OutOfRangeError, NoValidArgumentError
    '''
    #check that the preconditions are fulfilled
    if mML < 1:
        raise OutOfRangeError
    if not (0 <= treshold <= 1):
        raise OutOfRangeError
    if s1 is None or s2 is None:
        raise NoValidArgumentError
    if s1 == '' or s2 == '':
        return PlagResult(hash(s1), hash(s2))

    #compute tiles
    global tiles, matchList
    tiles = []       #TODO: handle this differently? tiles
    matchList = []   #TODO: handle this differently? matchList
    tiles = RKR_GST(s1, s2, mML)

    #compute similarity
    simResult = calcSimilarity(s1.split(), s2.split(), tiles, treshold)
    similarity = simResult[0]
    if similarity > 1:
        similarity = 1

    #create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(hash(s1), hash(s2))
    result.setTiles(tiles)
    result.setSimilarity(similarity)
    result.setSuspectedPlagiarism(simResult[1])

    #return the result of the similarity check as a PlagResult object
    return result
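A minimal usage sketch for run(); the two example strings below are made up, and only names defined or used in this module (run, getTiles, similarity) appear:

#hypothetical inputs, for illustration only
a = "the quick brown fox jumps over the lazy dog"
b = "the quick brown fox sleeps under the lazy dog"
res = run(a, b, mML=3, treshold=0.5)
print(res.getTiles())    #list of (pattern position, text position, length) tuples
print(res.similarity)    #similarity value between 0 and 1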
#=============
#===RKR-GST===
#=============
def RKR_GST(P, T, minimalMatchingLength=3, initsearchSize=20):
    '''Computes Running-Karp-Rabin Greedy String Tiling.

    :Argument1: P {string} -- pattern
    :Argument2: T {string} -- text
    :Argument3: minimalMatchingLength {number} -- minimal matching length to be
        considered (default: {3})
    :Argument4: initsearchSize {number} -- initial search size (default: {20})
    :Returns: list -- tiles
    '''
    PList = P.split()
    TList = T.split()

    s = initsearchSize
    stop = False
    while not stop:
        #Lmax is the size of the largest maximal match found in this scan
        Lmax = scanpattern(s, PList, TList)
        #if a very long match was found, no tiles are marked; iterate with a larger s
        if Lmax > 2 * s:
            s = Lmax
        else:
            #create tiles from the matches taken from the list of queues
            markstrings(s, PList, TList)
            if s > 2 * minimalMatchingLength:
                s = s // 2
            elif s > minimalMatchingLength:
                s = minimalMatchingLength
            else:
                stop = True
    return list(tiles)
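Ignoring the early restart triggered when Lmax > 2*s, the search size follows a simple halving schedule; a standalone sketch of that schedule, using the default parameter values from the signature above:

#reproduces the search-size updates from the loop above
s, mML = 20, 3           #initsearchSize and minimalMatchingLength defaults
schedule = [s]
while s > mML:
    s = s // 2 if s > 2 * mML else mML
    schedule.append(s)
print(schedule)          #[20, 10, 5, 3]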
def scanpattern(s, P, T):
    """Scans the pattern and text token lists for matches.

    If a match is found that is more than twice as big as the search length s,
    that size is returned so that scanpattern can be restarted with it. All
    matches found are stored in a list of queues (matchList).
    """
    longestMaxMatch = 0
    queue = []
    hashtable = GSTHashtable()

    #Starting at the first unmarked token in T, for each unmarked Tt do:
    #  if the distance to the next tile is <= s then
    #    advance t to the first unmarked token after the next tile
    #  else
    #    create the KR hash value for the substring Tt .. Tt+s-1 and add it to the hashtable
    t = 0
    noNextTile = False
    while t < len(T):
        if isMarked(T[t]):
            t = t + 1
            continue
        dist = distToNextTile(t, T)
        if dist is None:
            #no next tile was found
            dist = len(T) - t
            noNextTile = True
        if dist < s:
            if noNextTile:
                t = len(T)
            else:
                t = jumpToNextUnmarkedTokenAfterTile(t, T)
                if t is None:
                    t = len(T)
        else:
            substring = "".join(T[t:t + s])
            h = createKRHashValue(substring)
            hashtable.add(h, t)
            t = t + 1

    #Starting at the first unmarked token of P, for each unmarked Pp do:
    #  if the distance to the next tile is <= s then
    #    advance p to the first unmarked token after the next tile
    #  else
    #    create the KR hash value for the substring Pp .. Pp+s-1
    #    check the hashtable for entries with an equal KR hash value
    #    for each such entry do
    #      if for all j from 0 to s-1, Pp+j = Tt+j then  (i.e. the match is not a hash artifact)
    #        k := s
    #        while Pp+k = Tt+k and unmarked(Pp+k) and unmarked(Tt+k) do
    #          k := k + 1
    #        if k > 2*s then return k  (and restart scanpattern with s = k)
    #        else record the new maximal match
    noNextTile = False
    p = 0
    while p < len(P):
        if isMarked(P[p]):
            p = p + 1
            continue
        dist = distToNextTile(p, P)
        if dist is None:
            #no next tile was found
            dist = len(P) - p
            noNextTile = True
        if dist < s:
            if noNextTile:
                p = len(P)
            else:
                p = jumpToNextUnmarkedTokenAfterTile(p, P)
                if p is None:
                    #no next unmarked token after the tile was found
                    p = len(P)
        else:
            substring = "".join(P[p:p + s])
            h = createKRHashValue(substring)
            values = hashtable.get(h)
            if values is not None:
                for val in values:
                    if "".join(T[val:val + s]) == substring:
                        t = val
                        k = s
                        while (p + k < len(P) and t + k < len(T)
                               and P[p + k] == T[t + k]
                               and isUnmarked(P[p + k]) and isUnmarked(T[t + k])):
                            k = k + 1
                        if k > 2 * s:
                            return k
                        else:
                            if longestMaxMatch < s:
                                longestMaxMatch = s
                            queue.append((p, t, k))
            p = p + 1

    #add the queue to matchList if it is not empty
    if queue != []:
        matchList.append(queue)

    #return the length of the longest maximal match
    return longestMaxMatch
def markstrings(s, P, T):
    """Creates tiles from the recorded maximal matches by marking the matched
    tokens in P and T; occluded matches are skipped.
    """
    lengthOfTokenTiled = 0
    #for each queue of matches
    while matchList != []:
        queue = matchList.pop(0)
        #while the queue is not empty
        while queue != []:
            #match = (p-position, t-position, length)
            match = queue.pop(0)
            if not isOccluded(match, tiles):
                for j in range(0, match[2]):
                    P[match[0] + j] = markToken(P[match[0] + j])
                    T[match[1] + j] = markToken(T[match[1] + j])
                lengthOfTokenTiled = lengthOfTokenTiled + match[2]
                tiles.append(match)
def createKRHashValue(substring):
    """Creates a Karp-Rabin hash value for the given substring and returns it.

    Based on: http://www-igm.univ-mlv.fr/~lecroq/string/node5.html
    """
    hashValue = 0
    for c in substring:
        hashValue = (hashValue << 1) + ord(c)
    return hashValue
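A quick sanity check of the shift-and-add construction above, worked out by hand for a two-character string:

#(0 << 1) + ord('a') = 97, then (97 << 1) + ord('b') = 292
print(createKRHashValue("ab"))    #292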
def isUnmarked(s):
    """Returns True if string s is unmarked, otherwise False.
    Empty strings are treated as marked.
    """
    if len(s) > 0:
        return s[0] != '*'
    else:
        return False
def isMarked(s):
    return not isUnmarked(s)
def markToken(s):
    """Marks string s."""
    return '*' + s
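The marking scheme is simply a '*' prefix on the token string; a small illustration of the three helpers:

token = markToken("fox")    #'*fox'
print(isMarked(token))      #True
print(isUnmarked("fox"))    #True
print(isUnmarked(""))       #False -- empty strings count as marked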
def isOccluded(match, tiles):
    """Returns True if the match is already occluded by another match in the
    tiles list.

    "Note that 'not occluded' is taken to mean that none of the tokens Pp to
    Pp+maxmatch-1 and Tt to Tt+maxmatch-1 has been marked during the creation
    of an earlier tile. However, given that smaller tiles cannot be created
    before larger ones, it suffices that only the ends of each new putative
    tile be tested for occlusion, rather than the whole maximal match."
    ["String Similarity via Greedy String Tiling and Running Karp-Rabin Matching",
    http://www.pam1.bcs.uwa.edu.au/~michaelw/ftp/doc/RKR_GST.ps]
    """
    for m in tiles:
        if (m[0] + m[2] == match[0] + match[2]
                and m[1] + m[2] == match[1] + match[2]):
            return True
    return False
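Following the end-point test quoted above, a new match counts as occluded only if an existing tile ends at the same positions in both P and T; a small made-up example:

existing = [(0, 0, 3)]                    #one tile ending at P[2] / T[2]
print(isOccluded((1, 1, 2), existing))    #True  -- ends at the same positions
print(isOccluded((4, 5, 3), existing))    #False -- ends elsewhere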
def distToNextTile(pos, stringList):
    """Returns the distance to the next tile, i.e. to the next marked token.
    If no tile was found, None is returned.

    case 1: there is a next tile -> pos + dist = first marked token -> return dist
    case 2: there is no next tile -> pos + dist = len(stringList) -> return None

    dist is also the number of unmarked tokens until the next tile.
    """
    #case 2: pos is already at the end of the list
    if pos == len(stringList):
        return None
    dist = 0
    while pos + dist + 1 < len(stringList) and isUnmarked(stringList[pos + dist + 1]):
        dist = dist + 1
    #case 2: no tile was found
    if pos + dist + 1 == len(stringList):
        return None
    #case 1:
    return dist + 1
def jumpToNextUnmarkedTokenAfterTile(pos, stringList):
    """Returns the first position of an unmarked token after the next tile.

    case 1: normal case -> a tile exists and there is an unmarked token after it
    case 2: a tile exists, but there is NO unmarked token after it
    case 3: NO tile exists
    """
    dist = distToNextTile(pos, stringList)
    #case 3: no tile was found
    if dist is None:
        return None
    #move pos onto the first marked token
    pos = pos + dist
    while pos + 1 < len(stringList) and not isUnmarked(stringList[pos + 1]):
        pos = pos + 1
    #case 2: no unmarked token after the tile was found
    if pos + 1 > len(stringList) - 1:
        return None
    #case 1:
    return pos + 1
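Both helpers work on token lists in which marked tokens carry the '*' prefix; a small made-up token list illustrates the return values:

tokens = ['a', 'b', '*c', 'd']                       #'*c' belongs to a tile
print(distToNextTile(0, tokens))                     #2 -- the tile is two tokens ahead
print(jumpToNextUnmarkedTokenAfterTile(0, tokens))   #3 -- index of 'd', the first unmarked token after the tile
print(distToNextTile(3, tokens))                     #None -- no tile ahead of 'd'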
#===============================================================================
# Computation of the Similarity
#===============================================================================
def calcSimilarity(s1List, s2List, tiles, treshold):
    """Calculates the similarity and returns the list
    [similarity: float, suspectedPlagiarism: bool].
    """
    #compute similarity
    similarity = sim(s1List, s2List, tiles)
    #check if it is suspected plagiarism
    suspPlag = similarity >= treshold
    return [similarity, suspPlag]
def sim(A, B, tiles):
    """Returns the similarity value for the token lists A and B, given the
    tiles that cover their matching parts.
    """
    return float(2 * coverage(tiles)) / float(len(A) + len(B))
def coverage(tiles):
    """Sum of the lengths of all tiles."""
    accu = 0
    for tile in tiles:
        accu = accu + tile[2]
    return accu
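A worked example of sim = 2 * coverage / (len(A) + len(B)), using made-up tiles and token lists:

tiles = [(0, 0, 3), (5, 2, 4)]    #two tiles covering 3 + 4 = 7 tokens
A = ['tok'] * 10                  #10 tokens
B = ['tok'] * 8                   #8 tokens
print(coverage(tiles))            #7
print(sim(A, B, tiles))           #2 * 7 / 18 = 0.777...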
#===============================================================================
# Extra Class: GSTHashtable
#===============================================================================
class GSTHashtable:

    def __init__(self):
        self.dict = {}
    def add(self, key, ob):
        """Stores object 'ob' for key 'key' in a list.
        If there are already objects stored in the list for the key, 'ob' is appended.
        """
        if key in self.dict:
            self.dict[key].append(ob)
        else:
            self.dict[key] = [ob]
    def get(self, key):
        """Returns a list with all objects for key 'key'.
        If the key does not exist, None is returned.
        """
        if key in self.dict:
            return self.dict.get(key)
        else:
            return None
    def clear(self):
        """Clears the GSTHashtable, i.e. all entries are removed."""
        self.dict = {}
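The hashtable simply buckets token positions by their KR hash value; a short usage sketch with made-up values:

ht = GSTHashtable()
ht.add(292, 0)
ht.add(292, 7)        #same hash value, so the position is appended
print(ht.get(292))    #[0, 7]
print(ht.get(999))    #None -- unknown key
ht.clear()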
def main_func(text1, text2, index_list):
    """Takes the preprocessed text of the original files and of the input file,
    compares them, and prints the matching content together with the file it
    comes from.

    :Argument1: text1 {string} -- combined text of the original files
    :Argument2: text2 {string} -- text of the input file
    :Argument3: index_list {list} -- list of (end index, file name) pairs marking
        where each original file ends
    """
    resultvalue = run(text1, text2)
    print('tile content is ', resultvalue.getTiles())
    print(resultvalue.similarity)
    print('tile similarity content copy authentication')
    tmp1 = text1.split()
    for j in resultvalue.getTiles():
        print('THIS IS FOR ', j)
        lenh = j[2]
        for i in range(len(index_list)):
            if j[1] < index_list[i][0]:
                print('FROM FILE ', index_list[i][1])
                break
        for k in range(j[0], j[0] + j[2]):
            print(tmp1[k], end=' ')
        print('\n')
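A hedged call sketch for main_func; the layout of index_list as (end index, file name) pairs is inferred from the lookups index_list[i][0] and index_list[i][1] above, and the texts and file names are made up:

#made-up combined original text and made-up submitted text
originals = "text of first file text of second file"
submitted = "text of second file with small changes"
#assumed layout: (token index where a source file ends, its name)
index_list = [(4, 'file1.txt'), (8, 'file2.txt')]
main_func(originals, submitted, index_list)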