5

分享合并多个 excel 文件的方法,支持两种格式

 2 years ago
source link: https://www.v2ex.com/t/844013
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.
neoserver,ios ssh client

V2EX  ›  Python

分享合并多个 excel 文件的方法,支持两种格式

  yasea · 17 小时 46 分钟前 · 612 次点击

最开始在网上找的代码只支持一种格式的 EXCEL 合并,经过本菜鸟的优化以后,可以同时对两种格式的 excel 合并,忍不住分享一波。欢迎 V2er 老铁们拍砖!

import pandas as pd
import os,re
from xml.sax import ContentHandler, parse
# 解析 xml 格式的 excel 文件
class ExcelHandler(ContentHandler):
    def __init__(self):
        self.chars = [  ]
        self.cells = [  ]
        self.rows = [  ]
        self.tables = [  ]
    def characters(self, content):
        self.chars.append(content)
    def startElement(self, name, atts):
        if name=="Cell":
            self.chars = [  ]
        elif name=="Row":
            self.cells=[  ]
        elif name=="Table":
            self.rows = [  ]
    def endElement(self, name):
        if name=="Cell":
            self.cells.append(''.join(self.chars))
        elif name=="Row":
            self.rows.append(self.cells)
        elif name=="Table":
            self.tables.append(self.rows)

# 替换 XML 中的非法字符 
def clearContent(srcfile):
    outfile = srcfile + "-output.xls"
    if os.path.isfile(srcfile):
        fin  = open(srcfile, 'r',  encoding='UTF-8', errors='ignore')
        fout = open(outfile, "wt", encoding='UTF-8',errors='ignore')
        reg  = re.compile("\<\Data.*\>(.*)\<\/Data\>") # 正则-取出需要处理的字符串          
        for line in fin:
            if i == 0 and line.find("xml version") < 1:
                print(srcfile, "不是 xml 格式!")
                return False
            if line.find('<Data ss:Type="String">') > 0:
                row = reg.findall(line) 
                if len(row) >0 and row[0] != "":
                    aline = '<Cell ss:StyleID="s_title" ><Data ss:Type="String">%s</Data></Cell>'% (row[0].replace("&"," ").replace(">"," ").replace("<"," "))
                    fout.write("\n" + aline)
                else:
                    fout.write(line)
            else:
                fout.write(line)
        fin.close()
        fout.close()
        os.remove(srcfile)
        os.rename(outfile, srcfile)
        return True
# 存放 EXCEL 原始文件的目录        
src_dir = "D:\\Develop\\Python\\关单原始数据表格"
 
filename_excel = []
frames = []
for root, dirs, files in os.walk(src_dir):
    for file in files:
        filename_excel.append(os.path.join(root,file))

new_list = []
excelHandler = ExcelHandler()  
for xls_file in filename_excel: 
    print("读取文件.........",xls_file)
    try:
        dataframe = pd.read_excel(xls_file, converters={'报关单号':str}) # 可设定指定列的格式
        new_list.append(dataframe)
    except:
        print("不是标准格式的 EXCEL ,尝试 XML 读取............")        
        clearContent(xls_file)             
        parse(xls_file, excelHandler)
        df = pd.DataFrame(excelHandler.tables[0][4:], columns = excelHandler.tables[0][3])
        df.columns = excelHandler.tables[0][0] 
        new_list.append(df)               
# 将合并的结果写入文件
dfnew = pd.concat(new_list)
dfnew.to_excel("D:\\output\\alldata.xlsx",index=False)


About Joyk


Aggregate valuable and interesting links.
Joyk means Joy of geeK