Source code for textpreforrkr

# -*- coding: cp1252 -*-
from __future__ import division

import string

from plag import main_func
#from plag import *
import codecs
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer



def pre_processing(text, flag=True):
    '''This function cleans the unnecessary information out of the text and does the required pre-processing.

    Pre-processing steps:

    *Sentence segmentation (Seg)* Split the text of the document into sentences, thereby allowing line-by-line processing in the subsequent tests.

    *Tokenisation (Tok)* Determine token (words, punctuation symbols, etc.) boundaries in sentences.

    *Lowercasing (Low)* Substitute every uppercase letter with its lowercase form to generalise the matching.

    *Stop-word removal (Stop)* Remove functional words (articles, pronouns, prepositions, complementisers and determiners).

    *Punctuation removal (Pun)* Remove punctuation symbols.

    *Stemming (Stem)* Transform words into their stems in order to generalise the comparison analysis.

    *Lemmatisation (Lem)* Transform words into their dictionary base forms in order to generalise the comparison analysis.

    :Argument1: text {string} -- text to be pre-processed
    :Argument2: flag {bool} -- whether to remove stop words (default: {True})
    :returns: string -- pre-processed string
    '''
    text = text.lower()
    #sent_tokenize_list = sent_tokenize(text)
    #print sent_tokenize_list

    # Tokenise words.
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    #print words

    # Remove stop words (skipped when flag is False).
    if flag:
        result = [item for item in words if item not in stop_words]
    else:
        result = words
    #print "Filtered", result
    fil = str(result)

    # Remove punctuation: map every punctuation symbol to a space.
    # (This also strips the brackets, quotes and commas introduced by str(result).)
    repstr = " " * len(string.punctuation)
    table = string.maketrans(string.punctuation, repstr)
    s = fil.translate(table)

    # Lemmatise each token to its dictionary base form.
    lemmatizer = WordNetLemmatizer()
    h = ' '.join(lemmatizer.lemmatize(w) for w in word_tokenize(s))

    # Stem each token with the Porter stemmer.
    ps = PorterStemmer()
    list1 = [ps.stem(i) for i in word_tokenize(h)]
    final = ' '.join(list1)

    # Drop the stray 'u' tokens left behind by the unicode prefixes in str(result).
    finallstr = ' '.join(wrd for wrd in final.split() if wrd != 'u')
    return str(finallstr)
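# Illustrative usage sketch (not part of the original module): the sample
# sentence and the output shown are assumptions, included only to make the
# pipeline concrete -- lowercasing, stop-word removal, punctuation stripping,
# lemmatisation and Porter stemming.
#sample = "The cats are running quickly over the fences."
#print pre_processing(sample)    # roughly: "cat run quickli fenc"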
def main_method(file_list, inputFile):
    """This function takes a list of original files to be compared with the input file and displays the similar text and the similarity score.

    :Argument1: file_list (list of files) -- A list of original files.
    :Argument2: inputFile (file) -- Input file which is suspected to contain plagiarism.
    """
    fileLastIndex = []
    combinedFile = ''
    inputText = ''
    # Concatenate every original file and record (cumulative word count, file name)
    # so a match position in the combined text can be traced back to its source file.
    for file in file_list:
        with codecs.open(str(file), 'r', encoding='utf-8', errors='ignore') as rd:
            originalFile = rd.read()
            combinedFile = combinedFile + originalFile + '\n'
            fileLastIndex.append((len(combinedFile.split()), file))
    # Read the suspect document and run the comparison.
    with codecs.open(str(inputFile), 'r', encoding='utf-8', errors='ignore') as rd:
        inputText = rd.read()
    main_func(inputText, combinedFile, fileLastIndex)
#main_method(['orig_taska.txt','g1pA_taska.txt','g2pE_taska.txt'],'g0pA_taska.txt')
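# Sketch of the bookkeeping handed to main_func (the word counts below are
# hypothetical): with the example call above, fileLastIndex would look like
#   [(820, 'orig_taska.txt'), (1510, 'g1pA_taska.txt'), (2194, 'g2pE_taska.txt')]
# i.e. the cumulative word count at which each original file ends inside
# combinedFile, which lets a match offset be mapped back to its source file.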