2015-03-03 16:10:12 -06:00
|
|
|
from bisect import bisect_left
|
2015-06-11 09:52:42 -05:00
|
|
|
import re
|
2015-03-31 06:59:36 -05:00
|
|
|
|
2015-04-01 06:39:59 -05:00
|
|
|
class WhiteList:
    """Filter chat text against a list of allowed words and banned sequences.

    Words that are not in the allowed list are replaced, either by wrapping
    them in the ``WLRed`` control-code markers or, when a garbler object is
    supplied, by whatever ``garbler.garble(av, wordCount)`` returns.
    """

    # Safe defaults so the filter can be used before setWords() is called.
    # They are class-level (and immutable) so that they can never overwrite
    # values a subclass assigned before invoking __init__.
    words = ()
    numWords = 0

    def __init__(self):
        # Looked up by word in getSequenceList(), so setSequenceList() is
        # expected to receive a mapping of first word -> list of follow-up
        # phrases. The empty default simply disables sequence filtering.
        self.sequenceList = []

    def setWords(self, words):
        """Install the allowed words.

        ``isPrefix`` performs a binary search on this collection, so it must
        be an indexable sequence in ascending sorted order. Entries are
        compared against lower-cased text (see ``cleanText``).
        """
        self.words = words
        self.numWords = len(self.words)

    def setSequenceList(self, sequences):
        """Install the banned word sequences, keyed by their first word."""
        self.sequenceList = sequences

    def getSequenceList(self, word):
        """Return the sequences registered for ``word``, or None.

        None is returned for an empty ``word`` and for words that have no
        entry in the sequence container.
        """
        return self.sequenceList[word] if word and word in self.sequenceList else None

    def cleanText(self, text):
        """Strip leading/trailing ``.,?!`` characters and lower-case ``text``."""
        return text.strip('.,?!').lower()

    def isWord(self, text):
        """Return True if the cleaned ``text`` is one of the allowed words."""
        return self.cleanText(text) in self.words

    def isPrefix(self, text):
        """Return True if the cleaned ``text`` is a prefix of an allowed word.

        Relies on ``self.words`` being sorted: the first entry that is not
        smaller than ``text`` is the only candidate that needs checking.
        """
        text = self.cleanText(text)
        i = bisect_left(self.words, text)
        return i != self.numWords and self.words[i].startswith(text)

    def getReplacement(self, text, av=None, garbler=None):
        """Return the substitute for a rejected piece of ``text``.

        Without a garbler the text is wrapped in the ``WLRed`` markers.
        With a garbler, ``garbler.garble`` is called with ``av`` and the
        number of space-separated words in ``text``.
        """
        if garbler:
            return garbler.garble(av, len(text.split(' ')))
        return '\x01WLRed\x01%s\x02' % text

    def processText(self, text, av=None, garbler=None):
        """Replace every word of ``text`` that is not in the allowed list.

        Returns ``text`` unchanged when no words have been installed.
        Empty tokens (produced by consecutive spaces) are kept as they are.
        """
        if not self.words:
            return text

        words = text.split(' ')
        newWords = []
        for word in words:
            if (not word) or self.isWord(word):
                newWords.append(word)
            else:
                newWords.append(self.getReplacement(word, av, garbler))

        # Without a garbler, the last word is accepted if it is merely the
        # beginning of an allowed word; otherwise it is (re)replaced.
        # NOTE(review): presumably because the text may still be being
        # typed -- confirm against the caller.
        if not garbler:
            lastWord = words[-1]
            if (not lastWord) or self.isPrefix(lastWord):
                newWords[-1] = lastWord
            else:
                newWords[-1] = self.getReplacement(lastWord, av, garbler)

        return ' '.join(newWords)

    def processSequences(self, text, av=None, garbler=None):
        """Replace every banned multi-word sequence found in ``text``.

        For each word, the sequences registered under its lower-cased form
        are compared (case-insensitively) with the words that follow it. On
        a match the whole run, first word included, is replaced.
        Returns ``text`` unchanged when no sequences have been installed.
        """
        if not self.sequenceList:
            return text

        words = text.split(' ')

        # ``range`` instead of ``xrange`` keeps this working on Python 3.
        for wordNum in range(len(words)):
            # A replacement may have shortened ``words`` (e.g. a garbler
            # returning fewer words than it replaced); stop instead of
            # indexing past the end.
            if wordNum >= len(words):
                break

            word = words[wordNum].lower()
            sequences = self.getSequenceList(word)
            if not sequences:
                continue

            for phrase in sequences:
                sequence = phrase.split()
                total = wordNum + len(sequence) + 1
                if total > len(words):
                    continue
                following = [w.lower() for w in words[wordNum + 1:total]]
                if sequence == following:
                    matched = ' '.join(words[wordNum:total])
                    words[wordNum:total] = self.getReplacement(matched, av, garbler).split()

        return ' '.join(words)

    def processThroughAll(self, text, av=None, garbler=None):
        """Run ``text`` through the word filter and then the sequence filter.

        Text starting with ``~`` is returned untouched when no garbler is
        given. Otherwise runs of spaces are collapsed to a single space
        before filtering.
        """
        if text.startswith('~') and not garbler:
            return text

        collapsed = re.sub(' +', ' ', text)
        filtered = self.processText(collapsed, av, garbler)
        return self.processSequences(filtered, av, garbler)
|