表格索引多种查询方法
需求: 有时候会遇到在固定表格里面索引的需求,然而这些表格的格式并不统一,需要进行处理后再索引,因此有了这些方案
转换表格为数据
思路一,py 转 pdf 为 excel
展开代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import pdfplumber
from openpyxl import Workbook
def convert_pdf_to_excel(pdf_path, excel_path):
pdf = pdfplumber.open(pdf_path)
wb = Workbook()
ws = wb.active
for page in pdf.pages:
table = page.extract_tables()[0] # 提取第一个表格
for row in table:
ws.append(row)
pdf.close()
wb.save(excel_path)
wb.close()
# 示例用法
pdf_path = r'grid.pdf'
excel_path =r'output.xlsx'
convert_pdf_to_excel(pdf_path, excel_path)
思路二,将图片转为表格,ocr 识别
展开代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import pytesseract
from PIL import Image
pytesseract.pytesseract.tesseract_cmd = r'Tesseract-OCR'
path = ""
# 加载图片
image_path = 'image.png' # 图片文件路径
image = Image.open(path+'\\'+image_path)
# 使用OCR识别图片中的字符串
text = pytesseract.image_to_string(image)
# 打印识别结果
print(text)
思路三,读取 csv 文件,并用 pandas 索引
展开代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import csv
import os
import pandas as pd
# 读取CSV文件并获取指定列和行的数据
def read_csv(filename):
data = []
with open(filename, 'r') as file:
reader = csv.reader(file)
header = next(reader) # 读取CSV文件的标题行
for row in reader:
for element in row: #'1 6 N K J V 9 9 Q N 9 V 8'
cleaned_row = element.replace(" ", "")
#for i in cleaned_row:
#cleaned_row = ",".join(cleaned_row)
data.append(cleaned_row)
#header = ["".join(header[0].replace(" ", ""))]
header = [header[0].replace(" ", "")]
hh =['N']
for h in header[0]:
hh.append(h)
#print(hh)
l = [[0] * 13 for j in range(7)]
#print(l)
for i in range(7):
for j in range(13):
l[i][j] = (data[i][j])
return l,hh
# 示例用法
filename = 'grid.csv' # CSV文件路径
#columns = [N A B C D E F G H I J K L] # 需要获取的列名称列表
path = ""
data,header = read_csv(path + "\\" + filename)
index = [1,2,3,4,5,6,7]
df = pd.DataFrame(data,index=index,columns=header)
df = df.drop(df.columns[0],axis=1)
print(df)
m = 'A'
n = 3
ci = df.columns.get_loc(m)
ri = df.index.get_loc(n)
print(df.iloc[ri,ci])
思路四,读取 pdf 并打印结果
展开代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import pandas as pd
import pdfplumber
import sys
#import getpass
def extract_table_from_pdf(file_path):
with pdfplumber.open(file_path) as pdf:
table = pdf.pages[0].extract_tables()[0]
return table
try:
file = ""
table = extract_table_from_pdf(file)
index = [1,2,3,4,5,6,7]
df = pd.DataFrame(table[1:], columns=table[0], index=index)
#print(df)
ip = sys.stdin.read().rstrip('\n')
ip_list = ip.splitlines()
ip_list = [ip for ip in ip_list if ip.strip()]
for i in range(len(ip_list)):
#print(ip_list[i][0],ip_list[i][1])
ci = df.columns.get_loc(ip_list[i][0])
ri = df.index.get_loc(int(ip_list[i][1]))
#print(ci,ri)
#res.append(df.iloc[ri,ci])
print(df.iloc[ri,ci])
#print(res)
except Exception as e:
print(f"An error occurred: {e}")
Excel 表格处理
优点: 不需要下载工具上传数据,使用便捷,基本都有 excel 和 adobe
缺点: 需要手工操作,公式设置复杂
用 Adobe Reader 打开 pdf 表格,将表格复制到 excel 中,用分列进行初始化排列好顺序
=INDEX(表格区域,MATCH(INT(RIGHT(目标索引,1)),列),MATCH(LEFT(目标索引,1),行,0))
Python 制作本地小工具
优点: 不用再设置公式,本地工具方便操作
缺点: 依赖工具,后续改进需要重新编译
Python 图形化界面模板
展开代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from tkinter import *
import time
from tkinter import filedialog
import ctypes
import tkinter as tk
def run1():
print(1)
# 创建主窗口
root = tk.Tk()
# 设置窗口标题
root.title("简单GUI界面")
root.geometry('460x300')
txt1 = Text(root)
txt1.place(relx=0.1, rely=0.2, relwidth=0.2, relheight=0.4)
txt2 = Text(root)
txt2.place(relx=0.6, rely=0.2, relwidth=0.2, relheight=0.4)
btn1 = Button(root,text ='提交',command=lambda:run1(txt1.get('0.0','end')))
btn1.place(relx=0.4, rely=0.3, relwidth=0.1, relheight=0.1)
myappid = "company.product.version" # 这里可以设置任意文本
ctypes.windll.shell32.SetCurrentProcessExplicitAppUserModelID(myappid)
root.wm_iconbitmap('favicon.ico') # 设置程序图标
# 创建按钮点击事件处理函数
def button_click():
label.config(text="按钮被点击了!")
# 运行主循环
root.mainloop()
pandas 序列化图形化处理
展开代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from tkinter import *
import time
import tkinter as tk
import ctypes
import pandas as pd
import pdfplumber
def extract_table_from_pdf(file_path):
with pdfplumber.open(file_path) as pdf:
table = pdf.pages[0].extract_tables()[0]
return table
def run1(ip):
try:
if txt2.get('0.0','end') != "":
txt2.delete('0.0','end')
file = 'grid.pdf'
table = extract_table_from_pdf(file)
#print(table)
index = [1,2,3,4,5,6,7]
df = pd.DataFrame(table[1:], columns=table[0], index=index)
#print(df)
#ip = sys.stdin.read().rstrip('\n')
ip_list = ip.splitlines()
ip_list = [ip for ip in ip_list if ip.strip()]
for i in range(len(ip_list)):
ci = df.columns.get_loc(ip_list[i][0])
ri = df.index.get_loc(int(ip_list[i][1]))
#print(df.iloc[ri,ci])
txt2.insert(END, df.iloc[ri,ci])
#txt2.insert(END, df.iloc[ri,ci]+'\n')
except Exception as e:
#print(f"An error occurred: {e}")
txt2.insert(END, f"An error occurred: {e}")
# 创建主窗口
root = tk.Tk()
#root.iconbitmap("favicon.ico")
#icon_path = "favicon.ico"
# 设置窗口标题
root.title("grid card转换器")
root.geometry('460x300')
lb1 = Label(root, text='请将密码表名称改为grid.pdf并与本程序放到同个目录下')
lb1.place(relx=0.1, rely=0.05)
lb2 = Label(root, text='复制二次验证网页的字符串到下方第一个窗口中并点击转换\n对应密码会在第二个窗口出现')
lb2.place(relx=0.1, rely=0.15)
txt1 = Text(root)
txt1.place(relx=0.1, rely=0.3, relwidth=0.2, relheight=0.4)
txt2 = Text(root)
txt2.place(relx=0.6, rely=0.3, relwidth=0.2, relheight=0.4)
btn1 = Button(root,text ='转换',command=lambda:run1(txt1.get('0.0','end')))
btn1.place(relx=0.4, rely=0.3, relwidth=0.1, relheight=0.1)
myappid = "company.product.version" # 这里可以设置任意文本
ctypes.windll.shell32.SetCurrentProcessExplicitAppUserModelID(myappid)
root.wm_iconbitmap(r'D:\person\tansfer grid\new version\favicon.ico') # 设置程序图标
# 运行主循环
root.mainloop()
Golang 进行操作
如果想要搭建在线网址,可以用 golang 进行处理
优点: 直接复制表格,索引到网页,设置好参数即可识别返回结果
缺点: 需要上传数据到网站,增加了维护成本
展开代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
package trans
import (
"encoding/csv"
"fmt"
"log"
"os"
"regexp"
"strings"
)
// 读取CSV文件并获取指定列和行的数据
func ReadCSV(filename string) [][]string {
file, err := os.Open(filename)
if err != nil {
log.Fatal(err)
}
defer file.Close()
reader := csv.NewReader(file)
rows, err := reader.ReadAll()
//fmt.Println(rows)
if err != nil {
log.Fatal(err)
}
//fmt.Println(rows)
return rows
}
// 去除空格定义表格,进行取样
func Dealdata(data string, c_if int, r_if int, cc int, rr int) [][]string {
if cc == 0 {
cc = 12
fmt.Println("cc等于12")
}
if rr == 0 {
rr = 7
fmt.Println("rr等于7")
}
if cc >= 20 || rr >= 20 {
defer func() {
if err := recover(); err != nil {
fmt.Println(err)
}
}()
l := make([][]string, 1)
return l
}
l := make([][]string, rr)
if c_if == 1 {
//跳过前缀行
data = data[cc:]
fmt.Println(len(data), data)
}
//通过note判断1个字符还是2个字符
var note int
if len(data) > 91 {
note = 2
} else {
note = 1
}
var sl string
for i := 0; i < rr; i++ {
//跳过行首序列
if r_if == 1 {
if len(data) > cc*note {
sl = data[1 : cc*note+1]
} else if len(data) <= cc*note {
sl = data[1:]
}
fmt.Println(sl)
for n := 0; n+note < len(sl)+1; n = n + note {
l[i] = append(l[i], sl[n:n+note])
}
fmt.Println(l)
if len(data) > cc*note {
data = data[cc*note+1:]
}
} else {
if len(data) > cc*note {
sl = data[:cc*note]
} else if len(data) <= cc*note {
sl = data
}
//fmt.Println(sl)
for n := 0; n+note < len(sl)+1; n = n + note {
l[i] = append(l[i], sl[n:n+note])
}
//fmt.Println(l)
if len(data) > cc*note {
data = data[cc*note:]
}
}
}
return l
}
func PreDeal(st string, q string) (string, []string) {
//表格预处理
//只留下字符数字并将列表结果转为字符串
re := regexp.MustCompile(`\w`)
q = strings.Join(re.FindAllString(q, -1), "")
st = strings.Join(re.FindAllString(st, -1), "")
//fmt.Println(st, q)
//查询转大写
q = strings.ToUpper(q)
//按2个字符方式取表格
goal := splitStringByInterval(q, 2)
return st, goal
}
func SearchData(st string, q string, c_if int, r_if int, cc int, rr int) string {
//只留下字符数字并将列表结果转为字符串
var goal []string
st, goal = PreDeal(st, q)
fmt.Println(st)
fmt.Println(c_if, r_if)
//处理表格
data := Dealdata(st, c_if, r_if, cc, rr)
//fmt.Println(data)
ans := []string{}
for _, idx := range goal {
//列
m := string(idx[0])
ci := int([]rune(m)[0]) - int('A')
//行
n := int(idx[1]) - int('0')
ri := n - 1
//fmt.Println(ri, ci, string(data[ri][ci]))
ans = append(ans, string(data[ri][ci]))
}
revise_ans := strings.Join(ans, "")
//fmt.Println(ans)
return revise_ans
}
func splitStringByInterval(str string, interval int) []string {
var data []string
for i := 0; i < len(str); i += interval {
end := i + interval
if end > len(str) {
end = len(str)
}
data = append(data, str[i:end])
}
return data
}