from __future__ import print_function
from errors import NoValidArgumentError, OutOfRangeError
from PlagResult import PlagResult
#List of queues containing matches (used by RKR_GST)
#matchList = []
#tiles = []
[docs]def run(s1, s2, mML=3, treshold=0.5):
'''
This method runs a comparison on the two given strings s1 and s2 returning
a PlagResult object containing the similarity value, the similarities as
list of tiles and a boolean value indicating suspected plagiarism.
:Argument1: s1 {string} -- string 1
:Argument2: s2 {string} -- string 2
:Argument3: mML {number} -- minimumMatchingLength (default: {3})
:Argument4: treshold {number} -- a single value between 0 and 1 that determines whether a comparsion between
string should be marked as plagiarised (default: {0.5})
:Returns: object -- PlagResult
:Raises: OutOfRangeError, OutOfRangeError, NoValidArgumentError
'''
#check if the preconditions are fullfilled
if mML<1:
raise OutOfRangeError
if not (0 <= treshold <= 1):
raise OutOfRangeError
if s1 == None or s2 == None:
raise NoValidArgumentError
#if type(s1) != type('') or type(s2) != type(''):
# raise NoValidArgumentError
if s1 == '' or s2 == '':
return PlagResult(hash(s1), hash(s2))
#compute tiles
global tiles, matchList
tiles = [] #TODO: anders regeln ? tiles
matchList = [] #TODO: anders regeln`? matchList
tiles = RKR_GST(s1, s2, mML)
#print(tiles)
#compute similarity
simResult = calcSimilarity(s1.split(), s2.split(), tiles, treshold)
similarity = simResult[0]
#print similarity
if similarity>1: similarity = 1
#create PlagResult and set attributes
result = PlagResult()
result.setIdentifier(hash(s1), hash(s2))
result.setTiles(tiles)
result.setSimilarity(similarity)
result.setSuspectedPlagiarism(simResult[1])
#print simResult[1]
#return result of similarity check as PlagResult object
return result
#=============
#===RKR-GST===
#=============
[docs]def RKR_GST(P, T, minimalMatchingLength = 3, initsearchSize = 20):
'''Computes Running-Karp-Rabin-Greedy-String-Tiling
:Argument1: P {string} -- pattern
:Argument2: T {string} -- text
:Argument3: minimalMatchingLength {number} -- minimal matching length to be considered (default: {3})
:Argument4: initsearchSize {number} -- initial search size (default: {20})
:Returns: list -- tiles
'''
PList = P.split()
TList = T.split()
s = initsearchSize
#tiles = []
stop = False
while not stop:
#Lmax is size of largest maximal-matches from this scan
Lmax = scanpattern(s, PList, TList)
#if very long string no tiles marked. Iterate with larger s
if Lmax > 2*s:
s = Lmax
else:
#Create tiles from matches taken from list of queues
markstrings(s, PList, TList)
if s > 2*minimalMatchingLength:
s = s / 2
elif s > minimalMatchingLength:
s = minimalMatchingLength
else:
stop = True
return list(tiles)
[docs]def scanpattern(s, P, T):
"""Scans the pattern and text string lists for matches.
If a match is found that is twice as big as the search length s
that size is returned, to be used to restart the scanpattern with it.
All matches found are stored in a list of matches in queues.
"""
longestMaxMatch = 0
queue = []
hashtable = GSTHashtable()
#Starting at the first unmarked token in T
#for each unmarked Tt do
#if distance to next tile <= s then
#advance t to first unmarked token after next tile
#else create the KR-hash value for substring Tt to Tt+s-1 and add to hashtable
t = 0
noNextTile = False
while t<len(T):
if isMarked(T[t]):
t = t+1
continue
dist = distToNextTile(t, T)
if dist == None: #no next Tile was found
dist = len(T)-t
noNextTile = True
if dist < s:
if noNextTile: t = len(T)
else:
t = jumpToNextUnmarkedTokenAfterTile(t, T)
if t == None: t = len(T)
else:
substring = "".join(T[t:t+s])
h = createKRHashValue(substring)
hashtable.add(h, t)
t = t+1
#Starting at the first unmarked token of P
#for each unmarked Pp do
#if distance to next tile <= s then
#advance p to first unmarked token after next tile
#else
#create the KR hash-value for substring Pp to Pp+s-1
#check hashtable for hash of KR hash-value
#for each hash-table entry with equal hashed KR hash-value do
#if for all j from 0 to s-1, Pp+ j = Tt+ j then /* IE match is not hash artifact */
#k: = s
#while Pp+k = Tt+k AND unmarked(Pp+k) AND unmarked(Tt+k) do
#k := k + 1
#if k > 2 *s then return(k) /* and restart scanpattern with s = k */
#else record new maximal-match
noNextTile = False
p = 0
while p<len(P): #for p in range(0, len(P)):
if isMarked(P[p]):
p = p+1
continue
dist = distToNextTile(p, P)
if dist == None: #no next Tile was found
dist = len(P)-p
noNextTile = True
if dist < s: #if dist <= s:
if noNextTile: p = len(P)
else:
p = jumpToNextUnmarkedTokenAfterTile(p, P) #TODO:
if p == None: p = len(P) #no next unmarked token after Tile was found
else:
substring = "".join(P[p:p+s]) #substring = "".join(P[p:p+dist])
h = createKRHashValue(substring)
values = hashtable.get(h)
if values != None:
for val in values:
if "".join(T[val:val+s]) == substring:
t = val
k = s
while (p+k<len(P) and t+k<len(T) and
P[p+k] == T[t+k]
and isUnmarked(P[p+k]) and isUnmarked(T[t+k])):
k = k+1
if k > 2*s: return k
else:
if longestMaxMatch < s: longestMaxMatch = s
queue.append((p, t, k))
#recordMaxMatch() #TODO
p = p+1
#add queue to matchList if its not empty
if queue != []:
matchList.append(queue)
#Return(length of longest maximal-match)
return longestMaxMatch
[docs]def markstrings(s, P, T):
lengthOfTokenTiled = 0
#for each queue
while matchList != []:
queue = matchList.pop(0)
while queue != []: #while queue is not empty
match = queue.pop(0) #match = (p-position, t-position, length)
if not isOccluded(match, tiles):
for j in range(0, match[2]):
P[match[0]+j] = markToken(P[match[0]+j])
T[match[1]+j] = markToken(T[match[1]+j])
lengthOfTokenTiled = lengthOfTokenTiled+match[2]
tiles.append(match)
[docs]def createKRHashValue(substring):
"""Creates a Karp-Rabin Hash Value for the given substring
and returns it.
Based on:
http://www-igm.univ-mlv.fr/~lecroq/string/node5.html
"""
#===============================================================================
# return hash(substring)
#===============================================================================
hashValue = 0
for c in substring:#for i in range(0, len(substring)):
hashValue = ((hashValue<<1) + ord(c)) #hashValue = ((hashValue<<1) + substring[i])
return hashValue
[docs]def isUnmarked(s):
"""If string s is unmarked returns True otherwise False.
"""
if len(s) > 0: return s[0] != '*'
else: return False #True or False?
[docs]def isMarked(s):
return not isUnmarked(s)
[docs]def markToken(s):
"""Mark string s.
"""
return '*'+s
[docs]def isOccluded(match, tiles):
"""Returns true if the match is already occluded by another match
in the tiles list.
"Note that "not occluded" is taken to mean that none of the tokens
Pp to Pp+maxmatch-1 and Tt to Tt+maxmatch-1 has been marked during
the creation of an earlier tile. However, given that smaller tiles
cannot be created before larger ones, it suffices that only the ends
of each new putative tile be testet for occlusion, rather than the whole
maxmimal match."
["String Similarity via Greedy String Tiling and Running Karp-Rabin Matching"
http://www.pam1.bcs.uwa.edu.au/~michaelw/ftp/doc/RKR_GST.ps]
"""
for m in tiles:
if (m[0]+m[2] == match[0]+match[2]
and m[1]+m[2] == match[1]+match[2]):
return True
return False
[docs]def distToNextTile(pos, stringList):
"""Returns distance to next tile, i.e. to next marked token.
If not tile was found, it returns None.
case 1: there is a next tile
-> pos + dist = first marked token
-> return dist
case 2: there is no next tile
-> pos + dist = len(stringList)
-> return None
dist is also number of unmarked token 'til next tile
"""
#case 2: is last token in list
if pos == len(stringList): return None
dist = 0
while pos+dist+1<len(stringList) and isUnmarked(stringList[pos+dist+1]):
dist = dist+1
#case 2: no tile was found
if pos+dist+1 == len(stringList): return None
#case 1:
return dist+1
[docs]def jumpToNextUnmarkedTokenAfterTile(pos, stringList):
"""Returns the first postion of an unmarked token after the next tile.
case 1: -> normal case
-> tile exists
-> there is an unmarked token after the tile
case 2:
-> tile exists
-> but NO unmarked token after the tile
case 3:
-> NO tile exists
"""
dist = distToNextTile(pos, stringList)
#case3: No Tile was found
if dist == None: return None
pos = pos+dist #pos on first marked token
while pos+1<len(stringList) and not isUnmarked(stringList[pos+1]):
pos = pos+1
#case 2: No unmarked Token after Tile was found
if pos+1> len(stringList)-1: return None
#case 1:
return pos+1
#===============================================================================
# Computation of the Similarity
#===============================================================================
[docs]def calcSimilarity(s1List, s2List, tiles, treshold):
"""Calculates Similarity and returns list [similarity:float, suspectedPlagiarism:bool]"""
#compute similarity
similarity = sim(s1List, s2List, tiles)
#check if it is suspected plagiarism
suspPlag = similarity >= treshold
return [similarity, suspPlag]
[docs]def sim(A, B, tiles):
"""Returns similarity value for token of text A and B and the similary tiles covered.
"""
return float(2 * coverage(tiles)) / float(len(A) + len(B))
[docs]def coverage(tiles):
"""Sum of length of all tiles.
"""
accu = 0
for tile in tiles:
accu = accu + tile[2]
return accu
#===============================================================================
# Extra Class: GSTHashtable
#===============================================================================
[docs]class GSTHashtable:
def __init__(self):
self.dict = {}
[docs] def add(self, key, ob):
"""Stores object 'ob' for key 'key' in a list. If there are already
objects stored in the list for the key -> 'ob' is appended.
"""
if self.dict.has_key(key):
values = self.dict.get(key)
values.append(ob)
self.dict.setdefault(key, values)
else:
self.dict.setdefault(key, [ob])
[docs] def get(self, key):
"""Returns a list with all objects for key 'key'. If the key does not exist
'None' is returned.
"""
if self.dict.has_key(key):
return self.dict.get(key)
else:
return None
[docs] def clear(self):
"""Clears the GSTHashtable, i.e. all entries are removed.
"""
self.dict = {}
[docs]def main_func(text1,text2,index_list):
"""
This function takes original text files processed text and input file processed text ,compares them and return similarity content.
:Argument1: text1 {String} -- Combined original files text.
:Argument1: text2 {String} -- Input file text
:Argument1: index_list {list of integers} -- list of index of end of text file
"""
resultvalue=run(text1,text2)
print ('tile content is ',resultvalue.getTiles())
print (resultvalue.similarity)
print ('tile similarity content copy authenication ')
tmp1=text1.split()
for j in resultvalue.getTiles():
print ('THIS IS FOR ' , j)
lenh=j[2]
for i in range(len(index_list)):
if j[1]<index_list[i][0]:
print ('FROM FILE ',index_list[i][1])
break
for k in range(j[0],j[0]+j[2]):
print (tmp1[k],end=' ')
print('\n')