表格索引多种查询方法

需求: 有时候会遇到在固定表格里面索引的需求，然而这些表格的格式并不统一，需要进行处理后再索引，因此有了这些方案

转换表格为数据

思路一，py 转 pdf 为 excel

展开代码

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21


import pdfplumber
from openpyxl import Workbook

def convert_pdf_to_excel(pdf_path, excel_path):
    pdf = pdfplumber.open(pdf_path)
    wb = Workbook()
    ws = wb.active

    for page in pdf.pages:
        table = page.extract_tables()[0]  # 提取第一个表格
        for row in table:
            ws.append(row)

    pdf.close()
    wb.save(excel_path)
    wb.close()

# 示例用法
pdf_path = r'grid.pdf'
excel_path =r'output.xlsx'
convert_pdf_to_excel(pdf_path, excel_path)

思路二，将图片转为表格，ocr 识别

展开代码

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17


import pytesseract
from PIL import Image


pytesseract.pytesseract.tesseract_cmd = r'Tesseract-OCR'

path = ""

# 加载图片
image_path = 'image.png'  # 图片文件路径
image = Image.open(path+'\\'+image_path)

# 使用OCR识别图片中的字符串
text = pytesseract.image_to_string(image)

# 打印识别结果
print(text)

思路三，读取 csv 文件，并用 pandas 索引

展开代码

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47


import csv
import os
import pandas as pd

# 读取CSV文件并获取指定列和行的数据
def read_csv(filename):
    data = []
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        header = next(reader)  # 读取CSV文件的标题行
        for row in reader:
            for element in row: #'1 6 N K J V 9 9 Q N 9 V 8'
                cleaned_row = element.replace(" ", "")
                #for i in cleaned_row:
                #cleaned_row = ",".join(cleaned_row)
                data.append(cleaned_row)
        #header = ["".join(header[0].replace(" ", ""))]
        header = [header[0].replace(" ", "")]
    hh =['N']
    for h in header[0]:
        hh.append(h)
    #print(hh)
    l = [[0] * 13 for j in range(7)]
    #print(l)
    for i in range(7):
        for j in range(13):
            l[i][j] = (data[i][j])

    return l,hh

# 示例用法
filename = 'grid.csv'  # CSV文件路径
#columns = [N A B C D E F G H I J K L]  # 需要获取的列名称列表
path = ""
data,header = read_csv(path + "\\" + filename)
index = [1,2,3,4,5,6,7]
df = pd.DataFrame(data,index=index,columns=header)
df = df.drop(df.columns[0],axis=1)

print(df)

m = 'A'
n = 3
ci = df.columns.get_loc(m)
ri = df.index.get_loc(n)

print(df.iloc[ri,ci])

思路四，读取 pdf 并打印结果

展开代码

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


import pandas as pd
import pdfplumber
import sys
#import getpass

def extract_table_from_pdf(file_path):
    with pdfplumber.open(file_path) as pdf:
        table = pdf.pages[0].extract_tables()[0]
        return table

try:
    file = ""
    table = extract_table_from_pdf(file)
    index = [1,2,3,4,5,6,7]
    df = pd.DataFrame(table[1:], columns=table[0], index=index)
    #print(df)
    ip = sys.stdin.read().rstrip('\n')
    ip_list = ip.splitlines()
    ip_list = [ip for ip in ip_list if ip.strip()]
    for i in range(len(ip_list)):
        #print(ip_list[i][0],ip_list[i][1])
        ci = df.columns.get_loc(ip_list[i][0])
        ri = df.index.get_loc(int(ip_list[i][1]))
        #print(ci,ri)
        #res.append(df.iloc[ri,ci])
        print(df.iloc[ri,ci])
    #print(res)

except Exception as e:
    print(f"An error occurred: {e}")

Excel 表格处理

优点: 不需要下载工具上传数据，使用便捷，基本都有 excel 和 adobe 缺点: 需要手工操作，公式设置复杂用 Adobe Reader 打开 pdf 表格，将表格复制到 excel 中，用分列进行初始化排列好顺序 =INDEX(表格区域,MATCH(INT(RIGHT(目标索引,1)),列),MATCH(LEFT(目标索引,1),行,0))

Python 制作本地小工具

优点: 不用再设置公式，本地工具方便操作缺点: 依赖工具，后续改进需要重新编译 Python 图形化界面模板

展开代码

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36


from tkinter import *
import time
from tkinter import filedialog
import ctypes
import tkinter as tk


def run1():
    print(1)

# 创建主窗口
root = tk.Tk()

# 设置窗口标题
root.title("简单GUI界面")
root.geometry('460x300')

txt1 = Text(root)
txt1.place(relx=0.1, rely=0.2, relwidth=0.2, relheight=0.4)

txt2 = Text(root)
txt2.place(relx=0.6, rely=0.2, relwidth=0.2, relheight=0.4)

btn1 = Button(root,text ='提交',command=lambda:run1(txt1.get('0.0','end')))
btn1.place(relx=0.4, rely=0.3, relwidth=0.1, relheight=0.1)

myappid = "company.product.version" # 这里可以设置任意文本
ctypes.windll.shell32.SetCurrentProcessExplicitAppUserModelID(myappid)
root.wm_iconbitmap('favicon.ico')  # 设置程序图标
# 创建按钮点击事件处理函数
def button_click():
    label.config(text="按钮被点击了！")


# 运行主循环
root.mainloop()

pandas 序列化图形化处理

展开代码

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70


from tkinter import *
import time
import tkinter as tk
import ctypes
import pandas as pd
import pdfplumber

def extract_table_from_pdf(file_path):
    with pdfplumber.open(file_path) as pdf:
        table = pdf.pages[0].extract_tables()[0]
        return table

def run1(ip):
    try:

        if txt2.get('0.0','end') != "":
            txt2.delete('0.0','end')
        file = 'grid.pdf'

        table = extract_table_from_pdf(file)
        #print(table)
        index = [1,2,3,4,5,6,7]
        df = pd.DataFrame(table[1:], columns=table[0], index=index)
        #print(df)
        #ip = sys.stdin.read().rstrip('\n')
        ip_list = ip.splitlines()
        ip_list = [ip for ip in ip_list if ip.strip()]
        for i in range(len(ip_list)):
            ci = df.columns.get_loc(ip_list[i][0])
            ri = df.index.get_loc(int(ip_list[i][1]))
            #print(df.iloc[ri,ci])
            txt2.insert(END, df.iloc[ri,ci])
            #txt2.insert(END, df.iloc[ri,ci]+'\n')

    except Exception as e:
        #print(f"An error occurred: {e}")
        txt2.insert(END, f"An error occurred: {e}")



# 创建主窗口
root = tk.Tk()
#root.iconbitmap("favicon.ico")
#icon_path = "favicon.ico"
# 设置窗口标题
root.title("grid card转换器")
root.geometry('460x300')

lb1 = Label(root, text='请将密码表名称改为grid.pdf并与本程序放到同个目录下')
lb1.place(relx=0.1, rely=0.05)

lb2 = Label(root, text='复制二次验证网页的字符串到下方第一个窗口中并点击转换\n对应密码会在第二个窗口出现')
lb2.place(relx=0.1, rely=0.15)


txt1 = Text(root)
txt1.place(relx=0.1, rely=0.3, relwidth=0.2, relheight=0.4)

txt2 = Text(root)
txt2.place(relx=0.6, rely=0.3, relwidth=0.2, relheight=0.4)

btn1 = Button(root,text ='转换',command=lambda:run1(txt1.get('0.0','end')))
btn1.place(relx=0.4, rely=0.3, relwidth=0.1, relheight=0.1)

myappid = "company.product.version" # 这里可以设置任意文本
ctypes.windll.shell32.SetCurrentProcessExplicitAppUserModelID(myappid)
root.wm_iconbitmap(r'D:\person\tansfer grid\new version\favicon.ico')  # 设置程序图标

# 运行主循环
root.mainloop()

Golang 进行操作

如果想要搭建在线网址，可以用 golang 进行处理优点: 直接复制表格，索引到网页，设置好参数即可识别返回结果缺点: 需要上传数据到网站，增加了维护成本

展开代码

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163


package trans

import (
	"encoding/csv"
	"fmt"
	"log"
	"os"
	"regexp"
	"strings"
)

// 读取CSV文件并获取指定列和行的数据
func ReadCSV(filename string) [][]string {
	file, err := os.Open(filename)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	reader := csv.NewReader(file)
	rows, err := reader.ReadAll()
	//fmt.Println(rows)
	if err != nil {
		log.Fatal(err)
	}
	//fmt.Println(rows)
	return rows
}

// 去除空格定义表格，进行取样
func Dealdata(data string, c_if int, r_if int, cc int, rr int) [][]string {
	if cc == 0 {
		cc = 12
		fmt.Println("cc等于12")
	}
	if rr == 0 {
		rr = 7
		fmt.Println("rr等于7")
	}
	if cc >= 20 || rr >= 20 {
		defer func() {
			if err := recover(); err != nil {
				fmt.Println(err)
			}
		}()
		l := make([][]string, 1)
		return l
	}
	l := make([][]string, rr)
	if c_if == 1 {
		//跳过前缀行
		data = data[cc:]
		fmt.Println(len(data), data)
	}

	//通过note判断1个字符还是2个字符
	var note int
	if len(data) > 91 {
		note = 2
	} else {
		note = 1
	}

	var sl string

	for i := 0; i < rr; i++ {
		//跳过行首序列
		if r_if == 1 {
			if len(data) > cc*note {
				sl = data[1 : cc*note+1]

			} else if len(data) <= cc*note {
				sl = data[1:]
			}
			fmt.Println(sl)
			for n := 0; n+note < len(sl)+1; n = n + note {
				l[i] = append(l[i], sl[n:n+note])
			}
			fmt.Println(l)
			if len(data) > cc*note {
				data = data[cc*note+1:]
			}
		} else {

			if len(data) > cc*note {
				sl = data[:cc*note]

			} else if len(data) <= cc*note {
				sl = data
			}
			//fmt.Println(sl)
			for n := 0; n+note < len(sl)+1; n = n + note {
				l[i] = append(l[i], sl[n:n+note])
			}
			//fmt.Println(l)
			if len(data) > cc*note {
				data = data[cc*note:]
			}
		}
	}
	return l
}
func PreDeal(st string, q string) (string, []string) {
	//表格预处理
	//只留下字符数字并将列表结果转为字符串
	re := regexp.MustCompile(`\w`)
	q = strings.Join(re.FindAllString(q, -1), "")
	st = strings.Join(re.FindAllString(st, -1), "")
	//fmt.Println(st, q)

	//查询转大写
	q = strings.ToUpper(q)
	//按2个字符方式取表格
	goal := splitStringByInterval(q, 2)

	return st, goal
}

func SearchData(st string, q string, c_if int, r_if int, cc int, rr int) string {

	//只留下字符数字并将列表结果转为字符串
	var goal []string
	st, goal = PreDeal(st, q)
	fmt.Println(st)
	fmt.Println(c_if, r_if)
	//处理表格
	data := Dealdata(st, c_if, r_if, cc, rr)
	//fmt.Println(data)
	ans := []string{}

	for _, idx := range goal {
		//列
		m := string(idx[0])
		ci := int([]rune(m)[0]) - int('A')

		//行
		n := int(idx[1]) - int('0')
		ri := n - 1

		//fmt.Println(ri, ci, string(data[ri][ci]))

		ans = append(ans, string(data[ri][ci]))
	}

	revise_ans := strings.Join(ans, "")
	//fmt.Println(ans)
	return revise_ans

}

func splitStringByInterval(str string, interval int) []string {
	var data []string

	for i := 0; i < len(str); i += interval {
		end := i + interval
		if end > len(str) {
			end = len(str)
		}
		data = append(data, str[i:end])
	}

	return data
}