尝试将匹配的单词存入字典,但返回了与 "uppercase" 有关的错误
from __future__ import print_function

import re
import string
import unittest
# You want to build a word cloud, an infographic where the size of a
# word corresponds to how often it appears in the body of text.
# To do this, you'll need data. Write code that takes a long string and
# builds its word cloud data in a dictionary, where the keys are
# words and the values are the number of times the words occurred.
# Think about capitalized words. For example, look at these sentences:
#'After beating the eggs, Dana read the next step:'
#'Add milk and eggs, then add flour and sugar.'
# What do we want to do with "After", "Dana", and "add"? In this
# example, your final dictionary should include one "Add" or "add" with
# a value of 2. Make reasonable (not necessarily perfect) decisions
# about cases like "After" and "Dana".
# Assume the input will only contain words and standard punctuation.
# ignore all punctuation except sentence enders . ! and ?
# lowercase any word that starts a sentence IFF it is also in the
# corpus of words as a lowercase word.
# How?
# split corpus into sentences (strings ending with a . ? or !)
# strip sentences into words
# push all words into a case sensitive, word frequency counting, dict
# scan the dict
# if a cap word is in the dict as cap and downcase then downcase
# return dict
# We do make two passes through the input stream, but if this were a
# real problem, I'd use a lexing and parsing library to implement the
# real-world problem's requirements.
def word_cloud(input):
    """Map a string of words to a dict of word frequencies.

    The text is split into sentences on ``.``, ``!`` and ``?``, and each
    sentence is split into words on any run of characters other than ASCII
    letters, digits and hyphens (so ``sugar-free`` stays one word while
    ``Bill's`` yields ``Bill`` and ``s``).  Words are first counted case
    sensitively; afterwards every word that starts with an uppercase letter
    and whose all-lowercase spelling was also seen is folded into the
    lowercase entry.  Capitalized words without a lowercase twin are kept
    exactly as written.

    Args:
        input: The text to analyse.  The name shadows the builtin, but it is
            kept so that existing keyword callers keep working.

    Returns:
        A dict mapping each word to the number of times it occurred.
    """
    sentence_enders = r"\.|!|\?"
    word_separators = r"[^a-zA-Z0-9-]+"

    freq = {}
    for sentence in re.split(sentence_enders, input):
        for word in re.split(word_separators, sentence):
            # re.split yields '' at the edges of a sentence and for the empty
            # sentences produced by e.g. "..."; those are not words.
            if word:
                freq[word] = freq.get(word, 0) + 1

    # Walk a snapshot of the keys: entries are removed from freq inside the
    # loop, which is not allowed while iterating over the dict itself.
    for word in list(freq):
        lowered = word.lower()
        # str.isupper() replaces string.uppercase, which Python 3 removed.
        if word[0].isupper() and lowered in freq:
            freq[lowered] += freq.pop(word)
    return freq
class TestWordCloud(unittest.TestCase):
    """Checks word_cloud() against hand-written frequency tables."""

    def test_examples(self):
        """Run the two example sentences from the problem statement."""
        text = (
            'After beating the eggs, Dana read the next step:'
            'Add milk and eggs, then add flour and sugar-free diet coke.'
        )
        expected = {
            'After': 1,
            'Dana': 1,
            'add': 2,
            'and': 2,
            'beating': 1,
            'coke': 1,
            'diet': 1,
            'eggs': 2,
            'flour': 1,
            'milk': 1,
            'next': 1,
            'read': 1,
            'step': 1,
            'sugar-free': 1,
            'the': 2,
            'then': 1,
        }
        self.assertDictEqual(expected, word_cloud(text))

    def test_more_examples(self):
        """Run each extra (text, expected frequencies) pair."""
        cases = [
            (
                "We came, we saw, we conquered...then we ate Bill's "
                "(Mille-Feuille) cake."
                "The bill came to five dollars.",
                {
                    'Mille-Feuille': 1,
                    'The': 1,
                    'ate': 1,
                    'bill': 2,
                    'cake': 1,
                    'came': 2,
                    'conquered': 1,
                    'dollars': 1,
                    'five': 1,
                    's': 1,
                    'saw': 1,
                    'then': 1,
                    'to': 1,
                    'we': 4,
                },
            ),
        ]
        for text, expected in cases:
            self.assertDictEqual(expected, word_cloud(text))
if __name__ == "__main__":
    # unittest.main() exits the interpreter once the run finishes, so the
    # verbose TextTestRunner suite that used to follow it could never run.
    # Passing the verbosity to unittest.main() gives one verbose run and an
    # exit status that reflects the test result.
    unittest.main(verbosity=2)
返回字符串的副本,其中每个字符都已通过给定的转换表进行映射。转换表必须是通过 __getitem__() 实现索引的对象,通常是映射或序列。当以 Unicode 码位(整数)作为索引时,转换表对象可以执行以下任一操作:返回一个 Unicode 码位或字符串,将该字符映射为一个或多个其他字符;返回 None,从返回的字符串中删除该字符;或者引发 LookupError 异常,将该字符映射为其自身。
您可以使用 str.maketrans() 根据不同格式的字符到字符映射来创建转换表。 你正在遍历字典 "freq",但随后又从该字典中删除了一项,因此 Python 不知道该如何继续迭代。在迭代过程中不允许修改字典。 我很理解你的思路:你先把所有单词加入字典,然后花了很多功夫检查字典,把以大写字母开头的单词的计数合并到对应的小写单词上,并删除以大写字母开头的那个单词。
我的建议是:先把所有键都转换为小写,这样字典就不需要后处理了。
freq = {}
for sentence in sentences:
    words = re.split(r"[^a-zA-Z0-9-]+", sentence)
    for word in words:
        # Key on the lowercased word so every spelling of a word shares one
        # entry and no merge pass over the dict is needed afterwards.
        count = freq.get(word.lower(), 0)
        # Store the new count under the key; assigning to freq itself would
        # replace the dict with an int and break the next lookup.
        freq[word.lower()] = count + 1
页:
[1]