[已解决]尝试将匹配词导入字典，同时返回错误"大写"

并轨 · 发表于 2021-5-21 04:08:40

from __future__ import print_function
import string
import re
import unittest
# You want to build a word cloud, an infographic where the size of a
# word corresponds to how often it appears in the body of text.
# To do this, you'll need data. Write code that takes a long string and
# builds its word cloud data in a dictionary, where the keys are
# words and the values are the number of times the words occurred.
# Think about capitalized words. For example, look at these sentences:
# 'After beating the eggs, Dana read the next step:'
# 'Add milk and eggs, then add flour and sugar.'
# What do we want to do with "After", "Dana", and "add"? In this
# example, your final dictionary should include one "Add" or "add" with
# a value of 22. Make reasonable (not necessarily perfect) decisions
# about cases like "After" and "Dana".
# Assume the input will only contain words and standard punctuation.
# ignore all punctuation except sentence enders . ! and ?
# lowercase any word that starts a sentence IFF it is also in the
# corpus of words as a lowercase word.
# How?
# split corpus into sentences (strings ending with a . ? or !)
# strip sentences into words
# push all words into a case sensitive, word frequency counting, dict
# scan the dict
# if a cap word is in the dict as cap and downcase then downcase
# return dict
# we do make two passes through the input stream, but if this were a
# real problem, I'd use a lexing and parsing library to implement a
# real world's problem's requirements.
def word_cloud(input):
"""map string of words into a dict of word frequencies"""
sentence_enders = r"\.|!|\?"
sentences = re.split(sentence_enders, input)
freq = {}
for sentence in sentences:
words = re.split(r"[^a-zA-Z0-9-]+", sentence)
for word in words:
count = freq.get(word, 0)
freq[word] = count + 1
def is_cap(word):
ch = word[0:1]
return ch in string.uppercase
for word, count in freq.items():
if is_cap(word) and word.lower() in freq:
count = freq[word]
freq[word.lower()] += count
del freq[word]
return freq
class TestWordCloud(unittest.TestCase):
def test_examples(self):
"""test the given example"""
test = 'After beating the eggs, Dana read the next step:' + \
'Add milk and eggs, then add flour and sugar-free diet coke.'
soln = {
'After': 1,
'Dana': 1,
'add': 2,
'and': 2,
'beating': 1,
'coke': 1,
'diet': 1,
'eggs': 2,
'flour': 1,
'milk': 1,
'next': 1,
'read': 1,
'step': 1,
'sugar-free': 1,
'the': 2,
'then': 1,
}
cloud = word_cloud(test)
self.assertDictEqual(soln, cloud)
def test_more_examples(self):
"test some additional examples"
tests = [
["We came, we saw, we conquered...then we ate Bill's "
"(Mille-Feuille) cake."
"The bill came to five dollars.",
{
'Mille-Feuille': 1,
'The': 1,
'ate': 1,
'bill': 2,
'cake': 1,
'came': 2,
'conquered': 1,
'dollars': 1,
'five': 1,
's': 1,
'saw': 1,
'then': 1,
'to': 1,
'we': 4
}
]
]
for test, soln in tests:
cloud = word_cloud(test)
self.assertDictEqual(soln, cloud)
if __name__ == "__main__":
unittest.main()
suite = unittest.TestLoader().loadTestsFromTestCase(TestWordCloud)
unittest.TextTestRunner(verbosity=2).run(suite)

复制代码

最佳答案

趙小航

2021-5-30 04:27:32

我很好地理解你的逻辑，你首先是在字典中添加所有单词。然后，您正在做大量的工作来检查字典，并将以大写字母开出的单词移动到小写字母中的同一个单词，并删除以大写字母开出的单词。
我的建议是：首先降低所有键的低位。然后字典不需要后处理。

freq = {}
for sentence in sentences:
words = re.split(r"[^a-zA-Z0-9-]+", sentence)
for word in words:
count = freq.get(word.lower(), 0)
freq[word.lower()] = count + 1

复制代码

跳转到最佳答案楼层

韵涵 · 发表于 2021-5-24 14:28:00

返回每个字符在给定的翻译表中映射的字符串的副本。表必须是通过__getitem__（），通常是映射或序列实现索引的对象。当由 Unicode 定向（整数）索引时，表对象可以进行以下任何工作：返回 Unicode 定向或字符串，将字符映射到一个或多个其他字符：返回无，从返回字符串中删除字符：或提出一个查找异常，以映射自己的字符。
您可以使用 str.maketrans（）以不同格式创建不同格式的字符到字符映射的翻译映射。

情參似塰 · 发表于 2021-5-28 13:45:27

你在字典"freq"上翻来翻去。但是，然后你从字典中删除一个项目。因此，Python 不知道如何继续迭代。在迭转时，不允许更改字典

趙小航 · 发表于 2021-5-30 04:27:32

我很好地理解你的逻辑，你首先是在字典中添加所有单词。然后，您正在做大量的工作来检查字典，并将以大写字母开出的单词移动到小写字母中的同一个单词，并删除以大写字母开出的单词。
我的建议是：首先降低所有键的低位。然后字典不需要后处理。

freq = {}
for sentence in sentences:
words = re.split(r"[^a-zA-Z0-9-]+", sentence)
for word in words:
count = freq.get(word.lower(), 0)
freq[word.lower()] = count + 1

复制代码

		自动登录	找回密码
密码			立即注册