Source code for bigram_check

from __future__ import division
import glob
import string
import sys

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
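# Note: the tokenizer, stop-word list, and lemmatizer below depend on NLTK
# data packages. A one-time setup sketch (standard NLTK package ids):
# >>> nltk.download('punkt')
# >>> nltk.download('stopwords')
# >>> nltk.download('wordnet')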

def inputfile(path):
    """returns text from each file matched by {0}.txt

    :param path: C:/Python27/corp/*.txt
    :type path: str
    :returns: *.txt.read()
    :rtype: str
    """
    text = ""
    for name in glob.glob(path):
        try:
            f = open(name, "r")
            text = f.read()
            f.close()
        except UnicodeDecodeError:
            # Python 2 workaround for non-ASCII corpus files
            reload(sys)
            sys.setdefaultencoding('utf-8')
    return text

path = "C:/Python27/corp/*.txt"
text1 = open("A.txt", "r").read().lower()
text2 = inputfile(path).lower()
def pre_processing(text):
    """returns pre-processed text: stop words removed, punctuation
    stripped, tokens lemmatized and stemmed

    :param text: text1
    :type text: str
    :returns: finall
    :rtype: str
    """
    # tokenise words
    words = word_tokenize(text)
    # remove stop words
    stop_words = set(stopwords.words('english'))
    result = [item for item in words if item not in stop_words]
    # remove punctuation: map each punctuation character to a space
    repstr = " " * len(string.punctuation)
    table = string.maketrans(string.punctuation, repstr)
    s = " ".join(result).translate(table)
    # lemmatizing and stemming, token by token
    lemmatizer = WordNetLemmatizer()
    ps = PorterStemmer()
    list1 = [ps.stem(lemmatizer.lemmatize(i)) for i in word_tokenize(s)]
    finall = ' '.join(list1)
    return finall
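# Illustrative call on an already lower-cased string (an example added for
# clarity; exact output may vary with the installed NLTK data):
# >>> pre_processing("the dogs are running quickly")
# 'dog run quickli'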
def plagcheck(textt1, textt2):
    """returns Similarity between reference document A.txt and test
    document {0}.txt (Bigrams): the Intersection of Bigrams between the
    documents and the ratio of Plagiarism by Bigram-matching

    :param textt1: text1
    :param textt2: text2
    :type textt1: str
    :type textt2: str
    :returns: a -- output of compare(bigrams1, bigrams2)
    :rtype: list
    """
    tex1 = pre_processing(textt1)
    tex2 = pre_processing(textt2)
    n = 2
    bigrams1 = list(ngrams(tex1.split(), n))
    bigrams2 = list(ngrams(tex2.split(), n))
    a = compare(bigrams1, bigrams2)
    print "SIMILARITY", a
    print "intersection", len(a)
    print "ref file", len(bigrams1)
    print "Ratio", len(a) / len(bigrams1)
    return a

def compare(bigramss1, bigramss2):
    """returns the bigrams common to both input lists

    :param bigramss1: bigrams1
    :param bigramss2: bigrams2
    :type bigramss1: list
    :type bigramss2: list
    :returns: common
    :rtype: list
    """
    common = []
    for gram in bigramss1:
        if gram in bigramss2:
            common.append(gram)
    return common
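# A minimal driver sketch (an addition, not part of the original module):
# compare the reference document A.txt against the corpus text loaded above.
if __name__ == "__main__":
    plagcheck(text1, text2)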