-- Hits per request URL, most-requested first, limited to the first 100 rows.
-- DESC is required so the ordering matches the descending sort used by the
-- pandas version of this statistic.
SELECT request_url, count(*)
FROM log
GROUP BY request_url
ORDER BY count(*) DESC
LIMIT 0, 100;
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""pd_ng_log_stat.py: count page hits per request URL with pandas."""

from ng_line_parser import NgLineParser

import pandas as pd
import socket
import struct


class PDNgLogStat(object):
    """Load parsed log lines into a DataFrame and compute hit statistics.

    Attributes set at runtime:
        ng_line_parser: the ``NgLineParser`` used to parse each log line.
        df: the DataFrame built by :meth:`load_data` (one row per log line).
    """

    def __init__(self):
        self.ng_line_parser = NgLineParser()

    def _log_line_iter(self, pathes):
        """Parse every line of every file and yield one dict per line.

        Args:
            pathes: iterable of file paths to read, in order.

        Yields:
            Whatever ``NgLineParser.to_dict()`` returns after parsing the
            line (presumably a dict of log fields; confirm against
            ``ng_line_parser``).
        """
        for path in pathes:
            with open(path, 'r') as f:
                for line in f:
                    self.ng_line_parser.parse(line)
                    yield self.ng_line_parser.to_dict()

    def load_data(self, path):
        """Build ``self.df`` from the log file(s) at the given path(s).

        Args:
            path: an iterable of file paths, or a single path string.
        """
        # A bare string would otherwise be iterated character by character,
        # with every character being opened as if it were a file name.
        if isinstance(path, str):
            path = [path]
        self.df = pd.DataFrame(self._log_line_iter(path))

    def url_req_stat(self):
        """Count hits per page.

        Returns:
            A DataFrame indexed by ``request_url`` with a single ``count``
            column, sorted by ``count`` in descending order.
        """
        # Count rows per URL directly. Grouping the column by itself and then
        # counting that same column depends on pandas keeping the grouping
        # key among the aggregated columns; size() has no such dependency.
        hits = self.df.groupby('request_url').size().to_frame('count')
        return hits.sort_values(by='count', ascending=False)


def main():
    """Load the access log and print the per-page hit counts."""
    file_pathes = ['www.trustauth.cn.access.log']

    pd_ng_log_stat = PDNgLogStat()
    pd_ng_log_stat.load_data(file_pathes)

    # Page hit statistics. The parenthesised form runs on Python 2 and 3.
    print(pd_ng_log_stat.url_req_stat())


if __name__ == '__main__':
    main()
运行统计和输出结果
python pd_ng_log_stat.py
                                   count
request_url
/wp-admin/admin-ajax.php          246361
/tag/                             126012
/                                  57325
......
/chufang/2016/06/25/8634.html       2312
/chufang/2015/03/26/4686.html       2293
/jiaju/2014/12/05/1348.html         2230

[29205 rows x 1 columns]
文章转载来自:trustauth.cn
上一篇:外链点击数-Pandas-Python数据分析(9)
下一篇:时刻PV-Pandas-Python数据分析(5)