PHP Classes
elePHPant
Icontem

File: dictionary.py

Recommend this page to a friend!
  Classes of Ravindu Taveesha  >  Non-Word PHP Spell Checker  >  dictionary.py  >  Download  
File: dictionary.py
Role: Auxiliary data
Content type: text/plain
Description: Auxiliary data
Class: Non-Word PHP Spell Checker
Detect incorrectly spelled words and suggest fixes
Author: Ravindu Taveesha
Last change:
Date: 11 days ago
Size: 3,343 bytes
 

 

Contents

Class file image Download
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import json
import PyPDF2
import textract
from urllib.request import urlopen

# Aggregated output: will hold the 'unigram', 'length' and 'bigram' tables
# that are serialized to dictionary.json at the end of the script.
dictionary = {}
# Raw training corpus text, filled by downloading every URL below.
text = ""

# Project Gutenberg plain-text books used as the training corpus for the
# word-frequency statistics.
urls = [
    'http://www.gutenberg.org/files/56006/56006-0.txt',
    'http://www.gutenberg.org/cache/epub/2776/pg2776.txt',
    'http://www.gutenberg.org/cache/epub/17090/pg17090.txt',
    'http://www.gutenberg.org/files/3400/3400-0.txt',
    'http://www.gutenberg.org/cache/epub/23531/pg23531.txt',
    'http://www.gutenberg.org/files/38046/38046-0.txt',
    'http://www.gutenberg.org/cache/epub/27250/pg27250.txt',
    'http://www.gutenberg.org/cache/epub/41189/pg41189.txt',
    'http://www.gutenberg.org/cache/epub/49739/pg49739.txt',
    'http://www.gutenberg.org/cache/epub/1319/pg1319.txt',
    'http://www.gutenberg.org/files/1289/1289-0.txt',
    'http://www.gutenberg.org/files/98/98-0.txt',
    'http://www.gutenberg.org/cache/epub/2542/pg2542.txt',
    'http://www.gutenberg.org/cache/epub/345/pg345.txt'

]

# Download every corpus file and concatenate the decoded text.
# Collect the chunks and join once at the end: repeated `text +=` over
# many multi-megabyte downloads is quadratic in the total corpus size.
chunks = []
for url in urls:
    # Use the response as a context manager so each HTTP connection is
    # closed promptly instead of being leaked (the original never closed it).
    with urlopen(url) as response:
        chunks.append(response.read().decode('utf8'))
text += ''.join(chunks)

print('==============> text count %s ' % len(text))

# Tokenize the whole corpus, then keep only purely alphabetic tokens,
# lowercased — this strips punctuation and numbers in a single pass.
tokens = word_tokenize(text)
words = [token.lower() for token in tokens if token.isalpha()]
# Running total of corpus words processed (reported at the end of the script).
count = 0

# Unigram table: word -> number of occurrences in the corpus.
print('==============> unigram start')
unigramDict = {}
for word in words:
    print('==============> add word')
    if word not in unigramDict:
        unigramDict[word] = 1
    else:
        print('==============> update word ' + word + ' frequency')
        unigramDict[word] += 1
    count += 1


dictionary['unigram'] = unigramDict
# Bug fix: this summary reports the unigram table (the original message
# wrongly said "bigram").
print('==============> %s word added to unigram ' % len(unigramDict))

# Length dictionary: word length -> list of distinct words of that length.
print('==============> length start')
lengthDict = {}
# Parallel membership sets, one per length. The original tested
# `word not in lengthDict[len(word)]`, an O(n) list scan for every corpus
# word (accidentally quadratic); the sets make that check O(1) while the
# stored lists — and therefore the JSON output and log lines — are unchanged.
seenByLength = {}
# loop through the corpus and group distinct words by their length
for word in words:
    length = len(word)
    if length not in lengthDict:
        print('==============> add length %s ' % length)
        lengthDict[length] = [word]
        seenByLength[length] = {word}
    elif word not in seenByLength[length]:
        print('==============> add word => length %s ' % word)
        lengthDict[length].append(word)
        seenByLength[length].add(word)

dictionary['length'] = lengthDict

# Bigram table: first word -> {second word -> frequency of the pair}.
print('==============> start bigram dictionary')
bigrams = list(nltk.bigrams(words))
bigramDict = {}

for first, second in bigrams:
    print('==============> add word')
    if first in bigramDict:
        inner = bigramDict[first]
        # First word already seen: either bump the pair count or record
        # a brand-new second word under it.
        if second in inner:
            print('==============> update phrase ' + second + ' frequency')
            inner[second] += 1
        else:
            print('==============> add phrase ' + second + ' ==> ' + first)
            inner[second] = 1
    else:
        # Unseen first word: start its inner table with this pair.
        bigramDict[first] = {second: 1}

dictionary['bigram'] = bigramDict

# Serialize the combined unigram/length/bigram tables for the spell
# checker to load.
with open('dictionary.json', 'w') as out_file:
    out_file.write(json.dumps(dictionary))

print('==============> dictionary created')
print(f'==============> total words {count} ')
print(f'==============> {len(unigramDict)} word added')