自动提取特定格式的txt文本数据到excel中

技术 > 编程语言 Python

发布时间 : 2022-05-28 15:13

字数:1.4k 阅读 :

评论:

前言
1、参考
2、Python中有序字典用法
3、Python中excel操作方法总结
4、Python实战
1. 4.1、示例1
2. 4.2、示例2

本文目录

前言
1、参考
2、Python中有序字典用法
3、Python中excel操作方法总结
4、Python实战
1. 4.1、示例1
2. 4.2、示例2

前言

Python作为当前最流行最火的编程语言，主要有三大作用：

web开发：比如web框架Django
数据科学：包括机器学习、数据分析和数据可视化
脚本（自动化处理，比如数据处理）

本文主要针对Python中有序字典使用方法和excel操作方法进行总结，并给出实例。

1、参考

【Python系列】Python处理csv文件（读写操作)
【Python系列】创建excel文档.csv
【Python系列】excel读写操作以及对应插件xlwd/xlrd安装方法

python创建有序字典OrderedDict()

2、Python中有序字典用法

from collections import OrderedDict  
dict =OrderedDict()  
dict['foo']=3  
dcit['aol']=1

3、Python中excel操作方法总结

【Python系列】excel读写操作以及对应插件xlwd/xlrd安装方法
 【Python系列】Python处理csv文件
 【Python系列】创建excel文档.csv

4、Python实战

==提取特定格式的txt文本数据到excel中==

4.1、示例1

#-*- coding:utf-8 -*-
# Function: get data from multiple txt with certern format to excel

import os
import re
import sys
import glob
import shutil
import commands
import subprocess
import subprocess as sub
import csv
import codecs
from collections import OrderedDict

space = ' '
delimiter = '/'
MAX_FILE_NAME = 100
count = [0,0]

#new excel file
def create_excel(excel_name):
    file = open(excel_name,'wb')
    writer=csv.writer(file)
    file.close()

#extract file name
def get_file_name(fullfilename):
    tmp = fullfilename.strip()
    name = os.path.split(tmp)[-1]  #提取出文件名，不包含路径
    return os.path.splitext(name)[0]

#获取目录以及子目录下的所有文件
allfiles = []
def get_all_files(rawdir):
    allfilelist = os.listdir(rawdir)
    for f in allfilelist:
        filepath = os.path.join(rawdir, f)
        if os.path.isdir(filepath):
            get_all_files(filepath)
        allfiles.append(filepath)
    return allfiles

#获取特定类型的文件
def get_raw_log（rawdir）：
    isfile = 0
    if os.path.isdir(rawdir):
        get_all_files(rawdir)
        files = [f for f in allfiles if re.research('txt$',f)]
    elif os.path.isfile(rawdir)
        iffile = 1;
        files = [rawdir]
    else:
        files = []
        print("Error: " + sys.argv[1] + " is not a dir or file!")
    files.sort(key=str.lower)
    return [files, isfile]


# collect data
def collect_data_to_excel(excelname, inputfile):
    pFile = open(inputfile, 'a+')
    lines = pFile.readlines()
    data = OrderedDict()
    splitValue = []
    filename = get_file_name(inputfile)
    for i in range(len(lines)):
        if lines[i].find('Anchor') != -1:
            splitValue = ((lines[i].split()).split(':')[3]  ##此处根据具体文本数据格式进行进行分割提取
            data[filename] = [filename, splitValue[0], splitValue[1], splitValue[2], splitValue[3], 0]
        if lines[i].find('Anchor') != -1:
            splitValue2 = ((lines[i].split()).split(':')[3]  ##此处根据具体文本数据格式进行进行分割提取
            data[filename][5] = splitValue2
    pFile = open(excelname, 'a+')
    pFile.write(codecs.BOM_UTF8)
    csv.writer(pFile, dialect='excel')
    if count[0]==0:
        title=['name', 'key1', 'key2', 'key3', 'key4', 'key5']
        csv_writer.writerow(title)
        count[0]=count[0]+1
    for key, value in data.items():
        csv_writer.writerow(value)
    pFile.close()


########main Function Entry##########
if __name__=='__main__':
    if(len(sys.argv) < 3):
        print('Usage: '+ '<CollectDataDir> \n')
        print('For example: python autoCollectData.py E:\test\')
        os._exit(0)
    collectDataDir = sys.argv[1]
    outExcelData = collectDataDir + '__result.csv'
    create_excel(outExcelData)
    [files,isfile] = get_raw_log(collectDataDir) #获取当前目录中指定格式的文件
    #遍历每个指定格式的文件进行数据提取
    for collectData in files:
        ret = collect_data_to_excel(outExcelData,collectData)
    if(ret!=0):
        print("--------Process finished!--------")
        os._exit(0)

4.2、示例2

#-*- coding:utf-8 -*-
# Function: get data from multiple txt with certern format to excel

import os
import re
import sys
import glob
import shutil
import commands
import subprocess
import subprocess as sub
import csv
import codecs
from collections import OrderedDict

space = ' '
delimiter = '/'
MAX_FILE_NAME = 100

#new excel file
def create_excel(excel_name):
    file = open(excel_name,'wb')
    writer=csv.writer(file)
    file.close()

#extract file name
def get_file_name(fullfilename):
    tmp = fullfilename.strip()
    name = os.path.split(tmp)[-1]  #提取出文件名，不包含路径
    return os.path.splitext(name)[0]

#获取目录以及子目录下的所有文件
allfiles = []
def get_all_files(rawdir):
    allfilelist = os.listdir(rawdir)
    for f in allfilelist:
        filepath = os.path.join(rawdir, f)
        if os.path.isdir(filepath):
            get_all_files(filepath)
        allfiles.append(filepath)
    return allfiles

def make_all_dir(path):
    path = path.strip()
    isExist = os.path.exists(path)
    if (not isExist):
        os.makedirs(path)
        print(path+' Successful Create!')
        return True

#获取特定类型的文件
def get_raw_log(rawdir):
    isfile = 0
    if os.path.isdir(rawdir):
        get_all_files(rawdir)
        files = [f for f in allfiles if re.search('txt$',f)]
    elif os.path.isfile(rawdir):
        iffile = 1
        files = [rawdir]
    else:
        files = []
        print("Error: " + sys.argv[1] + " is not a dir or file!")
    files.sort(key=str.lower)
    return [files, isfile]


count = [0,0]
# collect data
def collect_data_to_excel(excelname, inputfile):
    pFile = open(inputfile, 'a+')
    lines = pFile.readlines()
    data = OrderedDict()
    splitValue = []
    filename = get_file_name(inputfile)
    for i in range(len(lines)):
        #print lines
        if lines[i].find('"mac":') != -1 and i==8:
            splitValue = (lines[i].split())[0].split(",")[0].split("\"")[-2]
            print splitValue
            data[filename] = [filename, splitValue, 0, 0, 0, 0, 0, 0]
        if lines[i].find('"imei":') != -1 and i==43:
            splitValue3 = (lines[i].split())[0].split(",")[0].split("\"")[-2]
            print splitValue3
            data[filename][3] = str(splitValue3)+'\t'
        if lines[i].find('"imsi":') != -1 and i==44:
            splitValue4 = (lines[i].split())[0].split(",")[0].split("\"")[-2]
            print splitValue4
            data[filename][4] = str(splitValue4)+'\t'
        if lines[i].find('"iccid":') != -1 and i==45:
            splitValue2 = (lines[i].split())[0].split(",")[0].split("\"")[-2]
            print splitValue2
            data[filename][2] = str(splitValue2)+'\t'
        if lines[i].find('"imei":') != -1 and i==95:
            splitValue5 = (lines[i].split())[0].split(",")[0].split("\"")[-2]
            print splitValue5
            data[filename][5] = str(splitValue5)+'\t'
        if lines[i].find('"imsi":') != -1 and i==96:
            splitValue6 = (lines[i].split())[0].split(",")[0].split("\"")[-2]
            print splitValue6
            data[filename][6] = str(splitValue6)+'\t'
        if lines[i].find('"iccid":') != -1 and i==97:
            splitValue7 = (lines[i].split())[0].split(",")[0].split("\"")[-2]
            print splitValue7
            data[filename][7] = str(splitValue7)+'\t'
    pFile = open(excelname, 'ab+')  # newline=''
    pFile.write(codecs.BOM_UTF8)
    csv_writer = csv.writer(pFile, dialect='excel')
    if count[0]==0:
        title=['filename', 'MAC', 'ICCID', 'IMEI', 'IMSI', 'ICCID', 'IMEI', 'IMSI']
        csv_writer.writerow(title)
        count[0]=count[0]+1
    for key, value in data.items():
        csv_writer.writerow(value)
    pFile.close()


########main Function Entry##########
if __name__ == '__main__':
    if(len(sys.argv) < 3):
        print("Usage: autoExtractToExcel.py targetDir outResultDir\n")
        sys.exit(1)
    collectDataDir = sys.argv[1]
    outResultDir   = sys.argv[2]

    if(not os.path.exists(outResultDir)):
        make_all_dir(outResultDir)
    outExcelData   = outResultDir + delimiter + '__result.xls'
    create_excel(outExcelData)
    [files,isfile] = get_raw_log(collectDataDir) #获取当前目录中指定格式的文件

    #遍历每个指定格式的文件进行数据提取
    for collectData in files:
        print collectData
        ret = collect_data_to_excel(outExcelData,collectData)
    if(ret!=0):
        print("--------Process finished!--------")
        os._exit(0)

THE END!

在这里插入图片描述