defworldFilter(keywords, text): for eve in keywords: text = text.replace(eve, "***") return text keywords = ("关键词1", "关键词2", "关键词3") content = "这是一个关键词替换的例子,这里涉及到了关键词1还有关键词2,最后还会有关键词3。" print(worldFilter(keywords, content))
defworldFilter(keywords, text): for eve in keywords: text = text.replace(eve, "***") return text keywords =[ "关键词" + str(i) for i inrange(0,10000)] content = "这是一个关键词替换的例子,这里涉及到了关键词1还有关键词2,最后还会有关键词3。" * 1000 startTime = time.time() worldFilter(keywords, content) print(time.time()-startTime)
此时的输出结果是:0.12426114082336426,可以看到性能非常差。
正则表达方法
与其用replace,还不如通过正则表达re.sub来的更加快速。
1 2 3 4 5 6 7 8 9
import time import re defworldFilter(keywords, text): return re.sub("|".join(keywords), "***", text) keywords =[ "关键词" + str(i) for i inrange(0,10000)] content = "这是一个关键词替换的例子,这里涉及到了关键词1还有关键词2,最后还会有关键词3。" * 1000 startTime = time.time() worldFilter(keywords, content) print(time.time()-startTime)
# 查找敏感词函数 defsearch(self, content): p = self.root result = [] currentposition = 0
while currentposition < len(content): word = content[currentposition] while word in p.next == Falseand p != self.root: p = p.fail
if word in p.next: p = p.next[word] else: p = self.root
if p.isWord: result.append(p.word) p = self.root currentposition += 1 return result
# 加载敏感词库函数 defparse(self, path): withopen(path, encoding='utf-8') as f: for keyword in f: temp_root = self.root for char instr(keyword).strip(): if char notin temp_root.next: temp_root.next[char] = Node() temp_root = temp_root.next[char] temp_root.isWord = True temp_root.word = str(keyword).strip()
# 敏感词替换函数 defwordsFilter(self, text): """ :param ah: AC自动机 :param text: 文本 :return: 过滤敏感词之后的文本 """ result = list(set(self.search(text))) for x in result: m = text.replace(x, '*' * len(x)) text = m return text
# 查找敏感词函数 defsearch(self, content): p = self.root result = [] currentposition = 0
while currentposition < len(content): word = content[currentposition] while word in p.next == Falseand p != self.root: p = p.fail
if word in p.next: p = p.next[word] else: p = self.root
if p.isWord: result.append(p.word) p = self.root currentposition += 1 return result
# 加载敏感词库函数 defparse(self, path): withopen(path, encoding='utf-8') as f: for keyword in f: temp_root = self.root for char instr(keyword).strip(): if char notin temp_root.next: temp_root.next[char] = Node() temp_root = temp_root.next[char] temp_root.isWord = True temp_root.word = str(keyword).strip()
# 敏感词替换函数 defwordsFilter(self, text): """ :param ah: AC自动机 :param text: 文本 :return: 过滤敏感词之后的文本 """ result = list(set(self.search(text))) for x in result: m = text.replace(x, '*' * len(x)) text = m return text
Serverless architecture grew out of cloud computing. It inherits all the advantages of cloud computing while being extremely flexible, cost effective, and maintenance free.