python-将英文文章切割为单词,并添加发音和释义

26次阅读
没有评论

20230712,进行了小修改,主要是以下2个:
1、支持选择多个文件进行转换
2、过滤掉了只有 1 个或 2 个字符的单词,如 a、at 等。

成品地址(源码在压缩包里):https://wwch.lanzoul.com/iPQBt124yqmf
PS:这次解压后,需要执行的是 article2words v1.01.exe。

经过多次与 ChatGPT 对话修改后的成品,虽然图形界面有点丑,不过也懒得再调整了。
不足:有些带时态变化或复数形式的单词,无法添加发音和释义。

成品见这里:https://wwch.lanzoul.com/in1Zg11uejab
解压后执行main.exe即可。

源码见这里(如果有人优化了,希望也能给我一份优化后的代码):
[Python] 纯文本查看 复制代码

import os
import re
import tkinter as tk
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from threading import Thread
from tkinter import filedialog
from tkinter import messagebox
from urllib.parse import quote

import pandas as pd
import requests
from lxml import etree
from openpyxl import load_workbook
from openpyxl.styles import Font, NamedStyle

def get_word_info(word, timeout=10):
    """Look up a word on youdao.com and scrape its pronunciations and meaning.

    Args:
        word: The word to look up. It is percent-encoded before being placed
            in the request URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(British_pronunciation, American_pronunciation, paraphrase)``
        tuple, or ``None`` when the request fails or the page does not
        contain both pronunciation entries.
    """
    # Percent-encode the word so characters such as '/', '?', '#' or spaces
    # cannot change the structure of the request URL.
    url = f'https://www.youdao.com/w/eng/{quote(word, safe="")}'
    try:
        # Without a timeout a single stalled connection would block the
        # calling worker (and everything waiting on it) indefinitely.
        data = requests.get(url, timeout=timeout).text
        html = etree.HTML(data)
        # Indexing with [0] raises IndexError when an entry is missing, which
        # is reported below and turns into a ``None`` result.
        british = html.xpath('//*[@id="phrsListTab"]/h2/div/span[1]/span/text()')[0]
        american = html.xpath('//*[@id="phrsListTab"]/h2/div/span[2]/span/text()')[0]
        paraphrase = ""
        # If several <ul> elements match, the text of the last one is kept.
        for ul_element in html.xpath('//*[@id="phrsListTab"]/div/ul'):
            paraphrase = ''.join(ul_element.xpath('.//text()'))
        return british, american, paraphrase
    except Exception as e:
        # Best-effort lookup: any network or parsing problem is logged and
        # reported to the caller as "no information available".
        print(e, word)
        return None

# Characters trimmed from both ends of every token: whitespace plus the
# separator punctuation. The apostrophe is deliberately left alone so that
# contractions are still rejected by the apostrophe filter further down.
_EDGE_PUNCTUATION = ',.:?!()" \t\n\r\x0b\x0c'


def _extract_words(content):
    """Return the sorted, de-duplicated, lowercase words found in *content*."""
    tokens = re.split(r"\b[,.:?!()'\"\s\n\t\r]+?\b", content)
    # The split only happens between two words, so the first and last token
    # can keep punctuation attached (e.g. a trailing "end."); trim it.
    cleaned = {token.lower().strip(_EDGE_PUNCTUATION) for token in tokens}
    # An empty or whitespace-only text would otherwise produce an empty
    # "word" that gets looked up online.
    cleaned.discard('')
    # Drop contractions, tokens containing CJK characters, and tokens
    # containing digits.
    return [
        word for word in sorted(cleaned)
        if "'" not in word
        and not re.search(r'[\u4e00-\u9fff]', word)
        and not re.search(r'\d', word)
    ]


def _base_form_candidates(word):
    """Return possible base forms of an inflected *word*, in lookup order.

    The first candidate is always the form obtained by stripping a final
    "s", "d" or "ing"; the remaining ones cover common spelling changes.
    """
    if word.endswith('ing'):
        stem = word[:-3]
        candidates = [stem, stem + 'e']            # walking -> walk, making -> make
        if len(stem) > 2 and stem[-1] == stem[-2]:
            candidates.append(stem[:-1])           # running -> run
    elif word.endswith(('ied', 'ies')):
        candidates = [word[:-1], word[:-3] + 'y']  # studied/studies -> study
    elif word.endswith('ed'):
        stem = word[:-2]
        candidates = [word[:-1], stem]             # used -> use, walked -> walk
        if len(stem) > 2 and stem[-1] == stem[-2]:
            candidates.append(stem[:-1])           # stopped -> stop
    elif word.endswith('es'):
        candidates = [word[:-1], word[:-2]]        # makes -> make, boxes -> box
    elif word.endswith('s'):
        candidates = [word[:-1]]                   # cats -> cat
    else:
        candidates = []
    # Remove empty strings and duplicates while keeping the lookup order.
    return list(dict.fromkeys(candidate for candidate in candidates if candidate))


def _lookup_base_form(word):
    """Return the info of the first base form of *word* that can be found."""
    for candidate in _base_form_candidates(word):
        word_info = get_word_info(candidate)
        if word_info:
            return word_info
    return None


def _write_word_info(worksheet, row_index, word_info):
    """Write the three lookup values into columns 2-4 of the given row."""
    for column, value in enumerate(word_info, start=2):
        worksheet.cell(row=row_index, column=column).value = value


def process_text_file(file_path):
    """Split a text file into words and export them, annotated, to Excel.

    The words are lower-cased, de-duplicated, sorted and filtered, written to
    an ``.xlsx`` file next to the input, and each row is then completed with
    the values returned by ``get_word_info``. A message box is shown once the
    workbook has been saved.

    Args:
        file_path: Path of the text file to process.

    Errors raised while reading the file or writing the workbook are not
    handled here and propagate to the caller.
    """
    # NOTE(review): the file is decoded with the platform default encoding,
    # as before; confirm whether an explicit encoding is wanted.
    with open(file_path, 'r') as file:
        content = file.read()

    filtered_words = _extract_words(content)

    # Swap only the final extension. A plain str.replace('.txt', '.xlsx')
    # also rewrites ".txt" inside directory names and, when the name does not
    # contain a lowercase ".txt" at all, leaves the path unchanged so the
    # export would overwrite the source file.
    output_file = os.path.splitext(file_path)[0] + '.xlsx'
    df = pd.DataFrame(filtered_words, columns=['Words'])
    df.to_excel(output_file, index=False)

    # Reopen the exported file to add the extra columns.
    workbook = load_workbook(output_file)
    worksheet = workbook.active

    # Bold header cells for the three added columns.
    bold_style = NamedStyle(name="bold_style")
    bold_style.font = Font(bold=True)
    headers = ("British_pronunciation", "American_pronunciation", "paraphrase")
    for column, title in enumerate(headers, start=2):
        worksheet.cell(row=1, column=column, value=title).style = bold_style

    # Fetch all words concurrently; leaving the ``with`` block waits for
    # every request to finish.
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(get_word_info, word) for word in filtered_words]

    # Row 1 holds the headers, so the word at list position i sits in row i + 2.
    for row_index, (word, future) in enumerate(zip(filtered_words, futures), start=2):
        word_info = future.result()
        if not word_info:
            # The inflected form was not found; retry with likely base forms.
            word_info = _lookup_base_form(word)
        if word_info:
            _write_word_info(worksheet, row_index, word_info)

    workbook.save(output_file)
    messagebox.showinfo('Success', 'Process completed successfully.')

def browse_file(file_entry):
    """Ask the user for a .txt file and show the chosen path in *file_entry*.

    The entry is left untouched when the dialog returns no path.
    """
    chosen_path = filedialog.askopenfilename(filetypes=[('Text Files', '*.txt')])
    if not chosen_path:
        return
    # Replace whatever the entry currently shows with the new selection.
    file_entry.delete(0, tk.END)
    file_entry.insert(tk.END, chosen_path)

def execute_function(file_entry):
    """Process the file named in *file_entry* on a background thread.

    Shows an error box and returns when the entry is empty. Otherwise the
    Execute button is disabled while ``process_text_file`` runs and enabled
    again afterwards; a failure during processing is reported in an error box.
    """
    file_path = file_entry.get()
    if not file_path:
        messagebox.showerror('Error', 'Please select a file.')
        return

    def run():
        try:
            process_text_file(file_path)
        except Exception as exc:
            # Top-level boundary of the worker thread: without this the
            # failure would vanish silently with the thread.
            messagebox.showerror('Error', f'Failed to process the file: {exc}')
        finally:
            # Re-enable the button so another file can be processed. The
            # change is scheduled through ``after`` so that it is carried out
            # by the Tk event loop.
            try:
                execute_button.after(0, lambda: execute_button.config(state=tk.NORMAL))
            except (tk.TclError, RuntimeError):
                # The window was closed while the work was still running, so
                # there is no button left to re-enable.
                pass

    execute_button.config(state=tk.DISABLED)
    thread = Thread(target=run)
    thread.start()

# Build the main window.
window = tk.Tk()
window.title('英文文章切割为单词 V1.0')
window['bg'] = 'sky blue'

# Instantiate every widget first; they are laid out together below.
file_label = tk.Label(window, bg='sky blue', text='Select a text file:')
file_entry = tk.Entry(window, width=50)
browse_button = tk.Button(window, command=partial(browse_file, file_entry), text='Browse')
execute_button = tk.Button(window, command=partial(execute_function, file_entry), text='Execute')

# Stack the widgets from top to bottom in the order they should appear.
for widget in (file_label, file_entry, browse_button, execute_button):
    widget.pack()

# Hand control over to the Tk event loop.
window.mainloop()

正文完
 0
116博客
版权声明:本篇文章由 116博客 于2024-11-28发表,共计4201字。
转载说明:除特殊说明外本站文章皆由CC-4.0协议发布,转载请注明出处。
评论(没有评论)
验证码