------------------------------------------------------------ revno: 11139 revision-id: squid3@treenet.co.nz-20101226013826-rtx0u7bh43v1tfwd parent: squid3@treenet.co.nz-20101224140009-tx1hoi0imghvup8o committer: Amos Jeffries branch nick: trunk timestamp: Sun 2010-12-26 14:38:26 +1300 message: Author: Marcello Romani Author: Amos Jeffries Database Logging Daemon Original code by Marcello Romani, this version has some additions to initialize any missing database tables depended on during its startup phase and some additional polish to fit within the current Squid release. COPYRIGHT AND LICENSE Copyright (C) 2008 by Marcello Romani This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.8 or, at your option, any later version of Perl 5 you may have available. ------------------------------------------------------------ # Bazaar merge directive format 2 (Bazaar 0.90) # revision_id: squid3@treenet.co.nz-20101226013826-rtx0u7bh43v1tfwd # target_branch: http://www.squid-cache.org/bzr/squid3/trunk/ # testament_sha1: d6ef16dfab973621bb18853d28997b473078b160 # timestamp: 2010-12-26 02:53:35 +0000 # source_branch: http://bzr.squid-cache.org/bzr/squid3/trunk/ # base_revision_id: squid3@treenet.co.nz-20101224140009-\ # tx1hoi0imghvup8o # # Begin patch === modified file 'CREDITS' --- CREDITS 2010-10-06 03:50:45 +0000 +++ CREDITS 2010-12-26 01:38:26 +0000 @@ -524,3 +524,13 @@ * . */ +============================================================================== + +helpers/log_daemon/DB/doc/: +helpers/log_daemon/DB/log_db_daemon.pl.in: + + Copyright (C) 2008 by Marcello Romani + + This library is free software; you can redistribute it and/or modify + it under the same terms as Perl itself, either Perl version 5.8.8 or, + at your option, any later version of Perl 5 you may have available. === added directory 'helpers/log_daemon/DB' === added file 'helpers/log_daemon/DB/Makefile.am' --- helpers/log_daemon/DB/Makefile.am 1970-01-01 00:00:00 +0000 +++ helpers/log_daemon/DB/Makefile.am 2010-12-26 01:38:26 +0000 @@ -0,0 +1,17 @@ +include $(top_srcdir)/src/Common.am + +libexec_SCRIPTS = log_db_daemon +CLEANFILES += log_db_daemon log_db_daemon.8 +man_MANS = log_db_daemon.8 +EXTRA_DIST= \ + config.test \ + doc/views.sql \ + doc/date_day_column.sql \ + log_db_daemon.8 \ + log_db_daemon.pl.in + +log_db_daemon.8: log_db_daemon + pod2man log_db_daemon log_db_daemon.8 + +log_db_daemon: log_db_daemon.pl.in + $(subst_perlshell) === added file 'helpers/log_daemon/DB/config.test' --- helpers/log_daemon/DB/config.test 1970-01-01 00:00:00 +0000 +++ helpers/log_daemon/DB/config.test 2010-12-26 01:38:26 +0000 @@ -0,0 +1,7 @@ +#!/bin/sh + +## Test: do we have perl to build the helper scripts? +## Test: do we have pod2man to build the manual? +perl --version >/dev/null && echo | pod2man >/dev/null + +exit $? 
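The helper built by this Makefile is driven entirely through its standard input, using the same one-letter command prefix that Squid's logfile daemon interface sends, so it can be smoke-tested by hand once installed. A minimal sketch of such a test follows; the install path, the DSN path and the sample log values are illustrative assumptions, not part of this patch:

    #!/usr/bin/perl
    # Feed one fabricated access.log record to the daemon over a pipe,
    # the same way Squid's log module would drive it.
    use strict;
    use warnings;

    my $helper = '/usr/local/squid/libexec/log_db_daemon';       # assumed install location
    my $dsn    = '/localhost/squid_log/access_log/squid/squid';  # /host/database/table/username/password

    open(my $daemon, '|-', $helper, $dsn)
        or die "cannot start $helper: $!";

    # 'L' means "append this line"; the rest is one record in Squid's native log format
    print {$daemon} "L1293328731.123    250 192.168.0.1 TCP_MISS/200 2048 "
                  . "GET http://www.example.com/ - HIER_DIRECT/192.0.2.10 text/html\n";

    close($daemon) or warn "helper exited with status $?";

If the database is reachable, the record should appear as a new row in the access_log table.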
=== added directory 'helpers/log_daemon/DB/doc' === added file 'helpers/log_daemon/DB/doc/date_day_column.sql' --- helpers/log_daemon/DB/doc/date_day_column.sql 1970-01-01 00:00:00 +0000 +++ helpers/log_daemon/DB/doc/date_day_column.sql 2010-12-26 01:38:26 +0000 @@ -0,0 +1,28 @@ +-- we need a separate column to store the date and time of the request +ALTER TABLE access_log ADD COLUMN date_day DATE; +ALTER TABLE access_log ADD COLUMN date_time TIME; + +-- let's populate the new columns, in case some rows already exist; +-- the date and time values should be set by a trigger +UPDATE access_log SET date_day = DATE(FROM_UNIXTIME(time_since_epoch)); +UPDATE access_log SET date_time = TIME(FROM_UNIXTIME(time_since_epoch)); + +-- let's create a view that uses the date column +CREATE VIEW requests_per_day_2 AS SELECT date_day, COUNT(*) AS num_of_requests FROM access_log GROUP BY 1 ORDER BY 1; + +-- that view needs an index on the group by column +CREATE INDEX date_day_idx ON access_log(date_day); + + +-- a trigger that automatically extracts the date value from the time_since_epoch column +-- and stores it in the date_day column +DELIMITER // +CREATE TRIGGER extract_date_day_bi BEFORE INSERT ON access_log FOR EACH ROW +BEGIN + SET NEW.date_day = DATE(FROM_UNIXTIME(NEW.time_since_epoch)); + SET NEW.date_time = TIME(FROM_UNIXTIME(NEW.time_since_epoch)); +END // + + +-- Note: after running this script against an already populated access_log, +-- views have to be recreated, or the new date_day column will not show up. === added file 'helpers/log_daemon/DB/doc/views.sql' --- helpers/log_daemon/DB/doc/views.sql 1970-01-01 00:00:00 +0000 +++ helpers/log_daemon/DB/doc/views.sql 2010-12-26 01:38:26 +0000 @@ -0,0 +1,405 @@ +-- +-- Sample views +-- + +-- ip address of hosts accessing the cache +CREATE OR REPLACE VIEW cache_clients AS + SELECT DISTINCT ip_client FROM access_log ORDER BY 1; + +-- this index helps view 'cache_clients' +CREATE INDEX client_ip_idx ON access_log(ip_client); + + +-- traffic by client +CREATE OR REPLACE VIEW traffic_by_client AS + SELECT + ip_client, + SUM(http_reply_size) AS total_bytes, + SUM(http_reply_size)/1024 AS total_kilobytes, + SUM(http_reply_size)/1048576 AS total_megabytes + FROM access_log + GROUP BY 1 + ORDER BY 1; + +-- most active clients +-- same as before, but sorted by traffic; +-- show only the 10 most active clients +CREATE OR REPLACE VIEW most_active_clients AS + SELECT + ip_client, + SUM(http_reply_size) AS total_bytes, + SUM(http_reply_size)/1024 AS total_kilobytes, + SUM(http_reply_size)/1048576 AS total_megabytes + FROM access_log + GROUP BY 1 + ORDER BY 2 DESC + LIMIT 10; + + +-- traffic per day +CREATE OR REPLACE VIEW traffic_per_day AS + SELECT + date_day, + SUM(http_reply_size) AS total_bytes, + SUM(http_reply_size)/1024 AS total_kilobytes, + SUM(http_reply_size)/1048576 AS total_megabytes + FROM access_log + GROUP BY 1 + ORDER BY 1; + +-- traffic by client per day +CREATE OR REPLACE VIEW traffic_per_day_per_client AS + SELECT + date_day, + ip_client, + SUM(http_reply_size) AS total_bytes, + SUM(http_reply_size)/1024 AS total_kilobytes, + SUM(http_reply_size)/1048576 AS total_megabytes + FROM access_log + GROUP BY 1,2 + ORDER BY 1,2 DESC; + +CREATE OR REPLACE VIEW traffic_per_month_per_client AS + SELECT + YEAR(date_day) AS date_year, + MONTH(date_day) AS date_month, + ip_client, + SUM(http_reply_size) AS total_bytes, + SUM(http_reply_size)/1024 AS total_kilobytes, + SUM(http_reply_size)/1048576 AS total_megabytes + FROM access_log + GROUP BY 2,3 + 
ORDER BY 1,2,3; + +-- list of clients with some stats +CREATE OR REPLACE VIEW cache_clients_with_infos AS +SELECT + a.ip_client, + COUNT(*) AS total_requests, + (COUNT(*)/(SELECT COUNT(*) FROM access_log))*100 AS requests_perc, + SUM(a.http_reply_size) AS total_traffic, + (SUM(a.http_reply_size)/(SELECT SUM(http_reply_size) FROM access_log))*100 AS traffic_perc, + (SELECT COUNT(*) FROM access_log a1 WHERE a1.ip_client=a.ip_client AND squid_request_status LIKE '%HIT%') + / + (SELECT COUNT(*) FROM access_log) + * 100 AS hit_perc, + (SELECT COUNT(*) FROM access_log a1 WHERE a1.ip_client=a.ip_client AND squid_request_status LIKE '%MISS%') + / + (SELECT COUNT(*) FROM access_log) + * 100 AS miss_perc, + MIN(date_day) AS first_access_date, + MIN(date_time) AS first_access_time, + MAX(date_day) AS last_access_date, + MAX(date_time) AS last_access_time +FROM access_log a +GROUP BY 1 +ORDER BY 1; + +-- this index helps view 'cache_clients_with_infos' +CREATE INDEX client_req_status_idx ON access_log(ip_client, squid_request_status); + + +-- number of requests per day +CREATE OR REPLACE VIEW requests_per_day AS + SELECT + DATE(FROM_UNIXTIME(time_since_epoch)) AS date_day, + COUNT(*) AS num_of_requests + FROM access_log + GROUP BY 1 + ORDER BY 1; + +-- number of requests per minute +CREATE OR REPLACE VIEW requests_per_minute AS + SELECT + DATE(FROM_UNIXTIME(time_since_epoch)) AS date_day, + HOUR(FROM_UNIXTIME(time_since_epoch)) AS date_hour, + MINUTE(FROM_UNIXTIME(time_since_epoch)) AS date_minute, + COUNT(*) AS num_of_requests + FROM access_log + GROUP BY 1,2,3 + ORDER BY 1,2,3; + +-- number of requests per day of each cache client +CREATE OR REPLACE VIEW requests_per_day_per_client AS + SELECT + DATE(FROM_UNIXTIME(time_since_epoch)) AS date_day, + ip_client, + COUNT(*) AS num_of_requests + FROM access_log + GROUP BY 1,2 + ORDER BY 1,2; + +-- percentage of each request status +CREATE OR REPLACE VIEW requests_status_perc AS + SELECT + squid_request_status, + (COUNT(*)/(SELECT COUNT(*) FROM access_log)*100) AS percentage + FROM access_log + GROUP BY squid_request_status + ORDER BY 2 DESC; + +-- this index helps view 'requests_status_perc' +CREATE INDEX req_status_idx ON access_log(squid_request_status); + +-- request hits and misses, in percentage +CREATE OR REPLACE VIEW hits_misses_perc AS + SELECT + 'hits', + (SELECT COUNT(*) FROM access_log WHERE squid_request_status LIKE '%HIT%') + / + (SELECT COUNT(*) FROM access_log)*100 + AS percentage +UNION + SELECT + 'misses', + (SELECT COUNT(*) FROM access_log WHERE squid_request_status LIKE '%MISS%') + / + (SELECT COUNT(*) FROM access_log)*100 + AS pecentage; + +-- response times +CREATE OR REPLACE VIEW time_response_ranges AS + SELECT + '0..500', + COUNT(*) / (SELECT COUNT(*) FROM access_log)*100 AS percentage + FROM access_log + WHERE time_response >= 0 AND time_response < 500 +UNION + SELECT + '500..1000', + COUNT(*) / (SELECT COUNT(*) FROM access_log)*100 AS percentage + FROM access_log + WHERE time_response >= 500 AND time_response < 1000 +UNION + SELECT + '1000..2000', + COUNT(*) / (SELECT COUNT(*) FROM access_log)*100 AS percentage + FROM access_log + WHERE time_response >= 1000 AND time_response < 2000 +UNION + SELECT + '>= 2000', + COUNT(*) / (SELECT COUNT(*) FROM access_log)*100 AS percentage + FROM access_log + WHERE time_response >= 2000; + +-- this index helps view 'time_response_ranges' +CREATE INDEX time_response_idx ON access_log(time_response); + +-- response time graph +CREATE OR REPLACE VIEW time_response_graph AS + SELECT + 
time_response, + COUNT(*) AS num_req + FROM access_log + GROUP BY 1 + ORDER BY 1; + +-- traffic by mime type +CREATE OR REPLACE VIEW traffic_by_http_mime_type AS + SELECT + http_mime_type, + SUM(http_reply_size) as total_bytes + FROM access_log + GROUP BY http_mime_type + ORDER BY 2 DESC; + +-- last 10 queries +CREATE OR REPLACE VIEW last_10_queries AS + SELECT * + FROM access_log + WHERE + id > (SELECT MAX(id) FROM access_log) - 10 + ORDER BY id DESC; + +-- id of the last query of each client +-- this view is required by the "last n queries by ip" view +CREATE OR REPLACE VIEW last_query_by_client AS + SELECT + ip_client, + MAX(id) AS last_query_id + FROM access_log + GROUP BY ip_client; + + +-- last 10 queries of each client +-- NOTE: this query is conceptually wrong because it assumes that no holes exist +-- in the values of column 'id'. +-- This can be false if e.g. some access_log entries get deleted... +CREATE OR REPLACE VIEW last_10_queries_by_client AS + SELECT * + FROM access_log a + WHERE + id > ( + SELECT l.last_query_id + FROM last_query_by_client l + WHERE l.ip_client = a.ip_client + ) - 10 + ORDER BY a.ip_client, a.id DESC; + +-- this index helps the "last_10_queries_by_client" view +CREATE INDEX client_ip_record_id_idx ON access_log(ip_client, id); + + +-- number of HIT requests per day +CREATE OR REPLACE VIEW hits_per_day AS + SELECT + date_day, + COUNT(*) AS num_hits + FROM access_log + WHERE squid_request_status LIKE '%HIT%' + GROUP BY 1; + +-- HIT requests per day, percent (100% = total number of requests that day) +CREATE OR REPLACE VIEW hits_per_day_perc AS + SELECT + r.date_day, + h.num_hits/r.num_of_requests*100 AS hits_per_day_perc + FROM requests_per_day r + JOIN + hits_per_day h + ON r.date_day = h.date_day; + + +-- request methods (count) +CREATE OR REPLACE VIEW http_methods AS + SELECT + http_method, + COUNT(*) + FROM access_log + GROUP BY 1 + ORDER BY 1; + +-- request methods by percent +CREATE OR REPLACE VIEW http_methods_perc AS + SELECT + http_method, + COUNT(*) / (SELECT COUNT(*) FROM access_log) * 100 AS perc + FROM access_log + GROUP BY 1 + ORDER BY 2 DESC; + + +-- slowest queries +CREATE OR REPLACE VIEW slowest_requests AS + SELECT * + FROM access_log + ORDER BY time_response DESC + LIMIT 10; + + +CREATE OR REPLACE VIEW slowest_request_by_method AS + SELECT * + FROM access_log + GROUP BY http_method + ORDER BY http_method, time_response DESC; + + +-- requests with the biggest reply size +CREATE OR REPLACE VIEW biggest_requests AS + SELECT * + FROM access_log + ORDER BY http_reply_size DESC + LIMIT 10; + + + +-- list each day which has at least one request, with some statistics +CREATE OR REPLACE VIEW days_with_infos AS + SELECT + date_day, + MIN(date_time) AS first_req_time, + MAX(date_time) AS last_req_time, + COUNT(*) AS number_of_requests, + SUM(http_reply_size) AS total_traffic_bytes, + SUM(http_reply_size) / 1048576 AS total_traffic_megabytes, + COUNT(DISTINCT ip_client) AS number_of_clients, + AVG(time_response) AS avg_time_response, + MAX(time_response) AS max_time_response, + + -- client that has made the highest number of requests that day + ( + SELECT ip_client + FROM requests_per_day_per_client r + WHERE r.date_day = a.date_day + ORDER BY r.num_of_requests DESC LIMIT 1 + ) AS most_active_client_r, + + -- the number of requests that client actually made + ( + SELECT r.num_of_requests + FROM requests_per_day_per_client r + WHERE r.date_day = a.date_day + ORDER BY r.num_of_requests DESC LIMIT 1 + ) AS most_active_client_r_nr, + + -- same info but 
as percentage on the total number of requests that day + -- we have to repeat the whole query because we cannot reference aliases + -- defined in previous columns + -- a date_day column with an index on it would help here; a view would probably help too... + ( + ( + SELECT r.num_of_requests + FROM requests_per_day_per_client r + WHERE r.date_day = a.date_day + ORDER BY 1 DESC LIMIT 1 + ) / ( + SELECT COUNT(*) + FROM access_log a1 + WHERE a.date_day = a1.date_day + ) * 100 + ) AS most_active_client_r_pc, + + -- client that has generated the highest traffic that day + ( + SELECT t.ip_client + FROM traffic_per_day_per_client t + WHERE t.date_day = a.date_day + ORDER BY t.total_bytes DESC LIMIT 1 + ) AS most_active_client_t, + + -- the actual traffic generated by that client + ( + SELECT t.total_bytes + FROM traffic_per_day_per_client t + WHERE t.date_day = a.date_day + ORDER BY t.total_bytes DESC LIMIT 1 + ) AS most_active_client_t_b, + + -- same info expressed in megabytes + ( + SELECT t.total_bytes + FROM traffic_per_day_per_client t + WHERE t.date_day = a.date_day + ORDER BY t.total_bytes DESC LIMIT 1 + ) / 1048576 AS most_active_client_t_mb, + + -- same info in percentage on the total traffic that day + -- see previous comments + ( + ( + SELECT t.total_bytes + FROM traffic_per_day_per_client t + WHERE t.date_day = a.date_day + ORDER BY t.total_bytes DESC LIMIT 1 + ) / ( + SELECT SUM(http_reply_size) + FROM access_log a1 + WHERE a.date_day = a1.date_day + ) * 100 + ) AS most_active_client_t_pc + + FROM access_log a + GROUP BY 1 + ORDER BY 1; + +-- this index helps the "days_with_infos" view +CREATE INDEX date_day_idx ON access_log(date_day); + + +CREATE OR REPLACE VIEW requests_in_last_minute AS + select * from access_log where time_since_epoch >= ( (select max(time_since_epoch) from access_log) - 60); + + +CREATE OR REPLACE VIEW avg_req_per_minute AS + SELECT COUNT(*) FROM requests_in_last_minute; === added file 'helpers/log_daemon/DB/log_db_daemon.pl.in' --- helpers/log_daemon/DB/log_db_daemon.pl.in 1970-01-01 00:00:00 +0000 +++ helpers/log_daemon/DB/log_db_daemon.pl.in 2010-12-26 01:38:26 +0000 @@ -0,0 +1,453 @@ +#!@PERL@ +use strict; +use warnings; +use DBI; +use English qw( -no_match_vars ); +use Getopt::Long; +use Pod::Usage; + +$|=1; + +=pod + +=head1 NAME + +log_db_daemon - Database logging daemon for Squid + +=head1 SYNOPSIS + +log_db_daemon DSN [options] + +=head1 DESCRIPTION + +This program writes Squid access.log entries to a database. +Presently it only accepts the B<squid> native format. + +=over 8 + +=item B<DSN> + +Database DSN encoded as a path. This is sent as the access_log file path.
+ +Sample configuration: + access_log daemon:/host/database/table/username/password squid + + to leave a parameter unspecified use a double slash: + access_log daemon://database/table/username/password squid + +Default "DBI:mysql:database=squid_log" + +=item B<--debug> + +Write debug messages to Squid stderr or cache.log + +=cut + +# the first argument to this script is the log file path describing the DSN +my $log_file = shift; + +# others may be options +my $debug = 0; +GetOptions( + 'debug' => \$debug, + ); + + +# utility routine to print messages on stderr (so they appear in cache log) +# without using warn, which would clutter the log with source line numbers +sub log_info { + my $msg = shift; + print STDERR "$msg\n"; +} + +# we use logfile to pass database access information to this script +# sample configuration: +# access_log daemon:/host/database/table/username/password squid +# to leave a parameter unspecified, e.g. the database host, use a double slash: +# access_log daemon://database/table/username/password squid +my ( $host, $database, $table, $user, $pass ) = $log_file =~ / \/(.*?)\/(.*?)\/(.*?)\/(.*?)\/(.*?) \z /xms; + +if ( !$host ) { + $host = 'localhost'; + log_info("Database host not specified. Using $host."); +} + +if ( !$database ) { + $database = 'squid_log'; + log_info("Database name not specified. Using $database."); +} + +if ( !$table ) { + $table = 'access_log'; + log_info("Table parameter not specified. Using $table."); +} + +if ( !$user ) { + $user = 'squid'; + log_info("User parameter not specified. Using $user."); +} + +if ( !$pass ) { + log_info('No password specified. Connecting with NO password.'); +} + +# fields that we should have in the table +# Order here must match the order of fields in the Log format and parse() output array. +my @db_fields = qw( + id + time_since_epoch + time_response + ip_client + squid_request_status + http_status_code + http_reply_size + http_method + http_url + http_username + squid_hier_status + ip_server + http_mime_type +); + +# perform db connection +my $dsn = "DBI:mysql:database=$database" . ($host ne "localhost" ? ":$host" : ""); +my $dbh; +my $sth; +eval { + warn "Connecting... dsn='$dsn', username='$user', password='...'"; + $dbh = DBI->connect($dsn, $user, $pass, { AutoCommit => 1, RaiseError => 1, PrintError => 1 }); +}; +if ($EVAL_ERROR) { + die "Cannot connect to database: $DBI::errstr"; +} + + +# a simple test to ensure the specified table exists +eval { + my $q = 'SELECT ' . join(',',@db_fields) . " FROM $table LIMIT 1"; + my $sth = $dbh->prepare($q); + $sth->execute; +}; +if ($EVAL_ERROR) { + # run a query to create the table of required syntax + my $create_query = 'CREATE TABLE ' . $table . ' (' . + " id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY," . + " time_since_epoch DECIMAL(15,3)," . + " time_response INTEGER," . + " ip_client CHAR(15)," . + " ip_server CHAR(15)," . + " http_status_code VARCHAR(10)," . + " http_reply_size INTEGER," . + " http_method VARCHAR(20)," . + " http_url TEXT," . + " http_username VARCHAR(20)," . + " http_mime_type VARCHAR(50)," . + " squid_request_status VARCHAR(50)," . + " squid_hier_status VARCHAR(20)" . + ");" ; + + $sth = $dbh->prepare($create_query); + $sth->execute; + # test again and fail hard if it is still broken. + eval { + my $q = 'SELECT ' . join(',',@db_fields) .
" FROM $table LIMIT 1"; + my $sth = $dbh->prepare($q); + $sth->execute; + }; + if ($EVAL_ERROR) { + die "Error initializing database table: $EVAL_ERROR"; + }; +} +# test + +# for better performance, prepare the statement at startup +eval { + my $q = "INSERT INTO $table (" . join(',',@db_fields) . ") VALUES(NULL" . ',?' x (scalar(@db_fields)-1) . ')'; + #$sth = $dbh->prepare("INSERT INTO $table VALUES(NULL,?,?,?,?,?,?,?,?,?,?,?,?)"); + $sth = $dbh->prepare($q); +}; +if ($EVAL_ERROR) { + die "Error while preparing sql statement: $EVAL_ERROR"; +} + +sub parse($) { + my ($line) = @_; + my (@t) = $line =~ /^L(\d+\.\d+) *(\d+?) (.*?) (.*?)\/(\d+?) (\d+?) (.*?) (.*?) (.*?) (.*?)\/(.*?) (.*)$/; +} + +# main loop +while (my $line = <>) { + chomp $line; + + my $cmd = substr($line, 0, 1); # extract command byte + + if ( $cmd eq 'L' ) { + my @log_entry = parse($line); + eval { # we catch db errors to avoid crashing squid in case something goes wrong... + $sth->execute(@log_entry) or die $sth->errstr + }; + if ( $EVAL_ERROR ) { # leave a trace of the error in the logs + warn $EVAL_ERROR . " values=(" . join(', ', @log_entry) . ')'; + } + } +} + +$dbh->disconnect(); + +__END__ + +=head1 DESCRIPTION + +This module exploits the new logfile daemon support available in squid 2.7 and 3.2 to store access log entries in a MySQL database. + +=head1 CONFIGURATION + +=head2 Squid configuration + +=head3 access_log directive + +The path to the access log file is used to provide the database connection parameters. + + access_log daemon:/mysql_host:port/database/table/username/password squid + +The 'daemon' prefix is mandatory and tells squid that the B<log_db_daemon> helper is to be used instead of the normal file logging. + +The last parameter tells squid which log format to use when writing lines to the log daemon. +Presently only the B<squid> format is supported. + +=over 4 + +=item mysql_host:port + +Host where the mysql server is running. If left empty, 'localhost' is assumed. + +=item database + +Name of the database to connect to. If left empty, 'squid_log' is assumed. + +=item table + +Name of the database table where log lines are stored. If left empty, 'access_log' is assumed. + +=item username + +Username to use when connecting to the database. If left empty, 'squid' is assumed. + +=item password + +Password to use when connecting to the database. If left empty, no password is used. + +=back + +To leave all fields to their default values, you can use a single slash: + + access_log daemon:/ squid + +To specify only the database password, which by default is empty, you must leave unspecified all the other parameters by using null strings: + + access_log daemon://///password squid + +=head3 logfile_daemon directive + +This is the current way of telling squid where the logfile daemon resides. + + logfile_daemon /path/to/squid/libexec/log_db_daemon + +The script must be copied to the location specified in the directive. + +=head2 Database configuration + +Let's call the database 'squid_log' and the log table 'access_log'. The username and password for the db connection will both be 'squid'. + +=head3 Database + +Create the database: + + CREATE DATABASE squid_log; + +=head3 User + +Create the user: + + GRANT INSERT,SELECT,CREATE ON squid_log.* TO 'squid'@'localhost' IDENTIFIED BY 'squid'; + FLUSH PRIVILEGES; + +Note that only CREATE, INSERT and SELECT privileges are granted to the 'squid' user. This ensures that the logfile daemon script cannot change or modify the log entries.
+ +=head3 Table + +The daemon will attempt to create this table at startup if it does not already exist. + +The table created should look like: + + CREATE TABLE access_log ( + id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, + time_since_epoch DECIMAL(15,3), + time_response INTEGER, + ip_client CHAR(15), + ip_server CHAR(15), + http_status_code VARCHAR(10), + http_reply_size INTEGER, + http_method VARCHAR(20), + http_url TEXT, + http_username VARCHAR(20), + http_mime_type VARCHAR(50), + squid_hier_status VARCHAR(20), + squid_request_status VARCHAR(20) + ); + +=head1 VERSION INFORMATION + +This document refers to the C<log_db_daemon> script, version 0.5. + +The script has been developed and tested in the following environment: + +=over 4 + +=item Squid-2.7 and Squid-3.2 + +=item mysql 5.0.26 and 5.1 + +=item perl 5.8.8 + +=item OpenSUSE 10.2 + +=back + +=head1 DATA EXTRACTION + +=head2 Sample queries + +=over 4 + +=item Clients accessing the cache + + SELECT DISTINCT ip_client FROM access_log; + +=item Number of requests per day + + SELECT + DATE(FROM_UNIXTIME(time_since_epoch)) AS date_day, + COUNT(*) AS num_of_requests + FROM access_log + GROUP BY 1 + ORDER BY 1; + +=item Request status count + +To obtain the raw count of each request status: + + SELECT squid_request_status, COUNT(*) AS n + FROM access_log + GROUP BY squid_request_status + ORDER BY 2 DESC; + +To calculate the percentage of each request status: + + SELECT + squid_request_status, + (COUNT(*)/(SELECT COUNT(*) FROM access_log)*100) AS percentage + FROM access_log + GROUP BY squid_request_status + ORDER BY 2 DESC; + +To distinguish only between HITs and MISSes: + + SELECT + 'hits', + (SELECT COUNT(*) + FROM access_log + WHERE squid_request_status LIKE '%HIT%') + / + (SELECT COUNT(*) FROM access_log)*100 + AS percentage + UNION + SELECT + 'misses', + (SELECT COUNT(*) + FROM access_log + WHERE squid_request_status LIKE '%MISS%') + / + (SELECT COUNT(*) FROM access_log)*100 + AS percentage; + +=item Response time ranges + + SELECT + '0..500', + COUNT(*)/(SELECT COUNT(*) FROM access_log)*100 AS percentage + FROM access_log + WHERE time_response >= 0 AND time_response < 500 + UNION + SELECT + '500..1000', + COUNT(*)/(SELECT COUNT(*) FROM access_log)*100 AS percentage + FROM access_log + WHERE time_response >= 500 AND time_response < 1000 + UNION + SELECT + '1000..2000', + COUNT(*)/(SELECT COUNT(*) FROM access_log)*100 AS percentage + FROM access_log + WHERE time_response >= 1000 AND time_response < 2000 + UNION + SELECT + '>= 2000', + COUNT(*)/(SELECT COUNT(*) FROM access_log)*100 AS percentage + FROM access_log + WHERE time_response >= 2000; + +=item Traffic by mime type + + SELECT + http_mime_type, + SUM(http_reply_size) as total_bytes + FROM access_log + GROUP BY http_mime_type + ORDER BY 2 DESC; + +=item Traffic by client + + SELECT + ip_client, + SUM(http_reply_size) AS total_bytes + FROM access_log + GROUP BY 1 + ORDER BY 2 DESC; + +=back + +=head2 Speed issues + +The MyISAM storage engine is known to be faster than the InnoDB one, so although it doesn't support transactions and referential integrity, it might be more appropriate in this scenario. You might want to append "ENGINE=MYISAM" at the end of the table creation code in the above SQL script. + +Indexes should be created according to the queries that are more frequently run. The DDL script only creates an implicit index for the primary key column. + +=head1 TODO + +=head2 Table cleanup + +This script currently implements only the C<L> (i.e.
"append a line to the log") command, therefore the log lines are never purged from the table. This approach has an obvious scalability problem. + +One solution would be to implement e.g. the "rotate log" command in a way that would calculate some summary values, put them in a "summary table" and then delete the lines used to calculate those values. + +Similar cleanup code could be implemented in an external script and run periodically, independently of squid log commands. + +=head2 Testing + +This script has only been tested in low-volume scenarios (single client, less than 10 req/s). Tests in high-volume environments could reveal performance bottlenecks and bugs. + +=head1 AUTHOR + +Marcello Romani, marcello.romani@libero.it +Amos Jeffries, amosjeffries@squid-cache.org + +=head1 COPYRIGHT AND LICENSE + +Copyright (C) 2008 by Marcello Romani + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself, either Perl version 5.8.8 or, +at your option, any later version of Perl 5 you may have available. + +=cut === modified file 'helpers/log_daemon/Makefile.am' --- helpers/log_daemon/Makefile.am 2009-11-22 20:37:27 +0000 +++ helpers/log_daemon/Makefile.am 2010-12-26 01:38:26 +0000 @@ -1,5 +1,6 @@ ## Alphabetical list of sub-directories to distribute with Squid: DIST_SUBDIRS = \ + DB \ file SUBDIRS = $(LOG_DAEMON_HELPERS)