zl程序教程

您现在的位置是:首页 >  后端

当前栏目

python解析日志的代码

2023-09-11 14:15:06 时间
#!/usr/bin/env python

import datetime
import gzip
import json
import os
import os.path
import re
import time

try:
        # Python 3: the URL helpers live in urllib.parse.
        from urllib.parse import parse_qs, urlparse
except ImportError:
        # Python 2: fall back to the original urlparse module.
        from urlparse import *

DICT = {}

def print_time(s):
        """Print the current local time followed by ``s``.

        The output line has the form ``HH:MM:SS.ffffff <str(s)>``.

        Args:
                s: Any object; it is converted with ``str`` before printing.
        """
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(datetime.datetime.now().strftime("%H:%M:%S.%f") + " " + str(s))

def remove_flag(s):
        """Return ``s`` with every "[" and "]" character removed."""
        for bracket in ("[", "]"):
                s = s.replace(bracket, "")
        return s

def remove_session_prefix(s):
        """Return ``s`` with the text "session:" removed.

        Despite the name, every occurrence of "session:" is dropped, not
        only one at the start of the string.
        """
        pieces = s.split("session:")
        return "".join(pieces)

def parse_url(url):
        """Split a URL into its host, path and query parameters.

        Args:
                url: The URL text; presumably logged without a scheme, since
                        "http://" is always prepended before parsing.

        Returns:
                A dict with three keys: "domain" (the network location),
                "path", and "params" (a dict mapping each query parameter
                name to its first value).
        """
        # urlparse only fills in netloc when the text starts with a scheme
        # (or "//"), hence the prepended "http://".
        parts = urlparse("http://" + url)

        # parse_qs maps each name to a list of values; keep the first one.
        first_values = {}
        for name, values in parse_qs(parts.query).items():
                first_values[name] = values[0]

        return {
                "domain": parts.netloc,
                "path": parts.path,
                "params": first_values,
        }

def get_info(a_dict):
        """Count one parsed log record in the module-level ``DICT`` tally.

        A summary of the record is built from its user identifier, the URL's
        domain and path, and the "fr", "ct" and "ac" query parameters. The
        summary is serialised with ``json.dumps`` and that string is used as
        the key whose counter in ``DICT`` is incremented.

        Args:
                a_dict: A record with the keys "userid", "session" and "url";
                        "url" holds a dict with "domain", "path" and "params".
        """
        summary = {}

        # An empty or "0" userid falls back to the session value.
        user = a_dict["userid"]
        if user == "" or user == "0":
                user = a_dict["session"]
        summary["userid"] = user

        url = a_dict["url"]
        summary["domain"] = url["domain"]
        summary["path"] = url["path"]

        # Query parameters absent from the URL are recorded as "-".
        query = url["params"]
        for name in ("fr", "ct", "ac"):
                summary[name] = query[name] if name in query else "-"

        key = json.dumps(summary)
        DICT[key] = DICT.get(key, 0) + 1


def _tally_log_file(f, bracketed):
        """Feed every line of the open log file ``f`` to ``get_info``.

        Args:
                f: An open file object yielding log lines (str or bytes).
                bracketed: Compiled regex matching one "[...]" field.

        Returns:
                The number of lines skipped because they contained fewer than
                the eight bracketed fields the parser indexes into.
        """
        skipped = 0
        i = 0
        while True:
                # Timing output is emitted for every 10000th line only.
                report = i % 10000 == 0
                if report:
                        print_time(i)

                line = f.readline()
                if report:
                        print_time("read to memory")
                if not line:
                        break
                i += 1

                # gzip yields bytes on Python 3; the str regex and the URL
                # helpers need text. On Python 2 the line is already a str.
                if not isinstance(line, str):
                        line = line.decode("utf-8", "replace")

                fields = bracketed.findall(line)
                if report:
                        print_time("split to items")

                # Fields 2, 4 and 7 are read below; a blank or truncated line
                # would otherwise abort the whole run with an IndexError.
                if len(fields) < 8:
                        skipped += 1
                        continue

                urlDict = parse_url(remove_flag(fields[4]))
                if report:
                        print_time("url fommat to dict")

                aDict = {
                        "userid": remove_flag(fields[2]),
                        "session": remove_session_prefix(remove_flag(fields[7])),
                        "url": urlDict,
                }
                if report:
                        print_time("create new dict")

                get_info(aDict)
                if report:
                        print_time("save and diff")

        return skipped


def read_logs(path):
        """Tally every record in the gzip-compressed log files under ``path``.

        Each entry of the directory is opened with ``gzip.open`` and read line
        by line. The bracketed ("[...]") fields of a line are extracted; the
        third is used as the user id, the fifth as the URL and the eighth as
        the session, and the resulting record is passed to ``get_info``.
        Lines with fewer than eight bracketed fields are skipped, and the
        number skipped is reported per file through ``print_time``.

        Args:
                path: Directory containing the gzip log files.

        Raises:
                OSError/IOError: If the directory or one of its entries cannot
                        be opened or read (raised by ``os.listdir`` or ``gzip``).
        """
        # Compiled once instead of being looked up per line; a raw string so
        # the backslashes are regex escapes, not invalid string escapes.
        bracketed = re.compile(r'\[.*?\]')

        for item in os.listdir(path):
                # gzip.open raises on failure and never returns None, so no
                # explicit "could not open" check is needed.
                f = gzip.open(os.path.join(path, item), "rb")
                # try/finally rather than `with`: GzipFile only became a
                # context manager in Python 2.7.
                try:
                        skipped = _tally_log_file(f, bracketed)
                finally:
                        f.close()

                if skipped:
                        print_time("skipped " + str(skipped) + " malformed line(s) in " + item)



if __name__ == "__main__":
        # Parse one day's logs, then dump the tally to the file "data".
        read_logs("logs/20130908")

        # `with` closes the output file even if a write fails.
        with open("data", "w") as f:
                for key in DICT:
                        # One "<json summary> <count>" record per line; without
                        # the newline all records run together on a single line.
                        f.write(key + " " + str(DICT[key]) + "\n")