-
Notifications
You must be signed in to change notification settings - Fork 0
/
rename-pdfs.py
92 lines (78 loc) · 3.39 KB
/
rename-pdfs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import requests
import base64
import urllib.parse
import pandas as pd
import os
import re
import tkinter as tk
from tkinter import filedialog
import configparser
# Load the Baidu API credentials from ``config.ini`` (resolved against the
# current working directory). The file must provide a [BaiduAPI] section with
# APP_ID, API_KEY and SECRET_KEY entries.
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means config.ini was not found. Fail
# with an explicit message instead of an opaque ``KeyError: 'BaiduAPI'``.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini not found in the current working directory; it must "
        "define a [BaiduAPI] section with APP_ID, API_KEY and SECRET_KEY"
    )
# Access credentials for the Baidu API
APP_ID = config['BaiduAPI']['APP_ID']
API_KEY = config['BaiduAPI']['API_KEY']
SECRET_KEY = config['BaiduAPI']['SECRET_KEY']
def select_directory():
    """Ask the user to pick a folder through a directory-chooser dialog.

    Returns:
        Whatever ``filedialog.askdirectory()`` returns: the chosen path, or a
        falsy value (normally an empty string) when the dialog is dismissed.
    """
    root = tk.Tk()
    try:
        root.withdraw()  # hide the empty main window; only the dialog is shown
        return filedialog.askdirectory()  # open the folder selection dialog
    finally:
        # Release the hidden Tk root; otherwise it lingers for the rest of the
        # run after the dialog has closed.
        root.destroy()
# Baidu OCR API endpoints.
# Endpoint that issues the OAuth access token.
TOKEN_URL: str = "https://aip.baidubce.com/oauth/2.0/token"
# Endpoint that runs OCR on VAT invoices.
OCR_URL: str = 'https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice'
# Fetch the access token
def get_access_token():
    """Request an OAuth access token from ``TOKEN_URL``.

    ``API_KEY`` and ``SECRET_KEY`` are sent as the client credentials.

    Returns:
        The ``access_token`` field of the JSON reply, or ``None`` when the
        server answers with an error status or the field is absent.

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            server does not answer within the timeout.
    """
    params = {
        'grant_type': 'client_credentials',
        'client_id': API_KEY,
        'client_secret': SECRET_KEY,
    }
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.post(TOKEN_URL, params=params, timeout=30)
    if not response:  # a Response is falsy for 4xx/5xx status codes
        return None
    return response.json().get('access_token')
# Process a single PDF file

# Characters that are not allowed in Windows file names; "/" is also the POSIX
# path separator, so OCR text containing it would make os.rename() fail.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _extract_invoice_fields(words_result):
    """Map an OCR ``words_result`` dict to the record stored per invoice."""
    # Every field falls back to "none", including the date: a reply without
    # InvoiceDate must not abort the whole run with a KeyError.
    raw_date = words_result.get('InvoiceDate', 'none')
    # "2023年01月05日" -> "2023-01-05"
    invoice_date = re.sub(r'年|月', '-', raw_date).replace('日', '')
    commodity_names = " ".join(
        item['word'] for item in words_result.get('CommodityName', [])
    ) or "none"
    return {
        "发票代码": words_result.get('InvoiceCodeConfirm', 'none'),
        "发票号码": words_result.get('InvoiceNumConfirm', 'none'),
        "开票日期": invoice_date,
        "项目名称": commodity_names,
        "价税合计": words_result.get('AmountInFiguers', 'none'),
        "销售方名称": words_result.get('SellerName', 'none'),
    }


def _build_target_path(file_path, fields):
    """Return a collision-free destination path next to *file_path*."""
    raw_name = (
        f"{fields['发票代码']} {fields['发票号码']} {fields['开票日期']} "
        f"{fields['价税合计']} {fields['销售方名称']}"
    )
    safe_name = _UNSAFE_FILENAME_CHARS.sub('_', raw_name)
    directory = os.path.dirname(file_path)
    candidate = os.path.join(directory, f"{safe_name}.pdf")
    counter = 1
    # os.rename() overwrites an existing target on POSIX and raises on
    # Windows; add a numeric suffix rather than clobber another invoice.
    # A target that is the source file itself (re-run) is not a collision.
    while os.path.exists(candidate) and not os.path.samefile(candidate, file_path):
        candidate = os.path.join(directory, f"{safe_name} ({counter}).pdf")
        counter += 1
    return candidate


def process_pdf(file_path, access_token, invoice_details):
    """Run OCR on one PDF invoice, rename the file and record its fields.

    The file is base64-encoded and posted to ``OCR_URL``. When the reply
    contains ``words_result``, the PDF is renamed in place to
    ``"<code> <number> <date> <amount> <seller>.pdf"`` and a dict with the
    extracted fields is appended to *invoice_details*.

    Args:
        file_path: Path of the PDF to process.
        access_token: Token appended to the OCR request URL.
        invoice_details: List that receives one dict per recognised invoice;
            it is mutated in place.

    Returns:
        None. Files whose OCR request fails or yields no ``words_result`` are
        left untouched and reported on stdout.

    Raises:
        OSError: If the file cannot be read or renamed.
        requests.exceptions.RequestException: On connection failure/timeout.
    """
    with open(file_path, 'rb') as f:
        pdf_content = base64.b64encode(f.read()).decode()
    params = urllib.parse.urlencode({"pdf_file": pdf_content})
    request_url = f"{OCR_URL}?access_token={access_token}"
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    # A timeout keeps one stalled upload from blocking the whole batch.
    response = requests.post(request_url, data=params, headers=headers, timeout=60)
    if not response:  # falsy for 4xx/5xx status codes
        print(f"Skipped {file_path}: HTTP {response.status_code}")
        return
    result = response.json()
    # Extract the invoice information
    if 'words_result' not in result:
        # NOTE(review): error replies presumably carry 'error_msg' - confirm
        # against the Baidu OCR API documentation.
        reason = result.get('error_msg', 'no words_result in response')
        print(f"Skipped {file_path}: {reason}")
        return
    fields = _extract_invoice_fields(result['words_result'])
    new_path = _build_target_path(file_path, fields)
    os.rename(file_path, new_path)
    invoice_details.append(fields)
# Process every PDF file in the given directory
def process_invoices(directory):
    """OCR and rename all PDFs in *directory*, then write an Excel summary.

    Each file whose name ends with ``.pdf`` (case-insensitive) is passed to
    ``process_pdf``. The collected records are saved to ``发票明细.xlsx``
    inside *directory*.

    Args:
        directory: Folder containing the invoice PDFs.

    Raises:
        OSError: If *directory* cannot be listed.
        RuntimeError: If no access token could be obtained.
    """
    # List the folder first so an invalid path fails before any network call.
    # Sorting makes the processing order (and report row order) deterministic.
    pdf_paths = []
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        # Skip sub-directories that merely happen to be named "*.pdf".
        if filename.lower().endswith('.pdf') and os.path.isfile(file_path):
            pdf_paths.append(file_path)

    access_token = get_access_token()
    if not access_token:
        # Without a token every OCR request would be rejected and the report
        # would silently come out empty; stop with a clear error instead.
        raise RuntimeError(
            "Could not obtain a Baidu access token; check API_KEY and "
            "SECRET_KEY in config.ini"
        )

    invoice_details = []
    for file_path in pdf_paths:
        process_pdf(file_path, access_token, invoice_details)

    # Save the extracted information to an Excel file
    df = pd.DataFrame(invoice_details)
    excel_path = os.path.join(directory, '发票明细.xlsx')
    df.to_excel(excel_path, index=False)
# Script entry: let the user choose the invoice folder, then process it.
# To skip the dialog, assign the folder path to ``directory`` directly,
# e.g. "D:\\Documents\\".
directory = select_directory()
if directory:
    print("Selected directory:", directory)
    process_invoices(directory)
else:
    # The dialog was dismissed; an empty path would only make os.listdir()
    # fail inside process_invoices().
    print("No directory selected; nothing to do.")