Source code for bigram_check

from __future__ import division
import glob
import string
import sys

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
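# Note: the tokenizer, stop-word list, and lemmatizer below depend on NLTK
# data packages. A one-time setup sketch (standard NLTK package ids):
# >>> nltk.download('punkt')
# >>> nltk.download('stopwords')
# >>> nltk.download('wordnet')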

def inputfile(path):
    """returns text from each file matched by {0}.txt

    :param path: C:/Python27/corp/*.txt
    :type path: str
    :returns: *.txt.read()
    :rtype: str
    """
    text = ""
    for name in glob.glob(path):
        try:
            f = open(name, "r")
            text = f.read()
            f.close()
        except UnicodeDecodeError:
            # Python 2 workaround for non-ASCII corpus files
            reload(sys)
            sys.setdefaultencoding('utf-8')
    return text

path = "C:/Python27/corp/*.txt"
text1 = open("A.txt", "r").read().lower()
text2 = inputfile(path).lower()
def pre_processing(text):
    """returns pre-processed text: stop words removed, punctuation
    stripped, tokens lemmatized and stemmed

    :param text: text1
    :type text: str
    :returns: finall
    :rtype: str
    """
    # tokenise words
    words = word_tokenize(text)
    # remove stop words
    stop_words = set(stopwords.words('english'))
    result = [item for item in words if item not in stop_words]
    # remove punctuation: map each punctuation character to a space
    repstr = " " * len(string.punctuation)
    table = string.maketrans(string.punctuation, repstr)
    s = " ".join(result).translate(table)
    # lemmatizing and stemming, token by token
    lemmatizer = WordNetLemmatizer()
    ps = PorterStemmer()
    list1 = [ps.stem(lemmatizer.lemmatize(i)) for i in word_tokenize(s)]
    finall = ' '.join(list1)
    return finall
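# Illustrative call on an already lower-cased string (an example added for
# clarity; exact output may vary with the installed NLTK data):
# >>> pre_processing("the dogs are running quickly")
# 'dog run quickli'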
def plagcheck(textt1, textt2):
    """returns Similarity between reference document A.txt and test
    document {0}.txt (Bigrams): the Intersection of Bigrams between the
    documents and the ratio of Plagiarism by Bigram-matching

    :param textt1: text1
    :param textt2: text2
    :type textt1: str
    :type textt2: str
    :returns: a -- output of compare(bigrams1, bigrams2)
    :rtype: list
    """
    tex1 = pre_processing(textt1)
    tex2 = pre_processing(textt2)
    n = 2
    bigrams1 = list(ngrams(tex1.split(), n))
    bigrams2 = list(ngrams(tex2.split(), n))
    a = compare(bigrams1, bigrams2)
    print "SIMILARITY", a
    print "intersection", len(a)
    print "ref file", len(bigrams1)
    print "Ratio", len(a) / len(bigrams1)
    return a

def compare(bigramss1, bigramss2):
    """returns the bigrams common to both input lists

    :param bigramss1: bigrams1
    :param bigramss2: bigrams2
    :type bigramss1: list
    :type bigramss2: list
    :returns: common
    :rtype: list
    """
    common = []
    for gram in bigramss1:
        if gram in bigramss2:
            common.append(gram)
    return common
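# A minimal driver sketch (an addition, not part of the original module):
# compare the reference document A.txt against the corpus text loaded above.
if __name__ == "__main__":
    plagcheck(text1, text2)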