Friday, January 15, 2010

MySQL TOP

This blog entry has been updated:
  • added NCQRS, NCQRS/s columns
  • fixed issue with dtrace dropping variables if the script was running for extended time periods
  • cosmetic changes re output

I need to observe MySQL load from time to time and DTrace is one of the tools to use. Usually I'm using one-liners or I come up with a short script. This time I thought it would be nice to write a script so other people like DBAs could use without having to understand how it actually works. The script prints basic statistics for each client connecting to a database. It gives a nice overview for all clients using a database.

CLIENT IP CONN CONN/s QRS QRS/s NCQRS NCQRS/s TIME VTIME
10.10.10.35 10 0 61 0 32 0 0 0
10.10.10.30 17 0 73 0 73 0 0 0
10.10.10.100 52 0 90 0 90 0 0 0
xx-www-11.portal 92 0 249 0 48 0 0 0
xx-cms-1.portal 95 0 1795 5 1669 4 48 48
xx-www-9.portal 198 0 634 1 278 0 0 0
xx-www-13.portal 239 0 986 2 366 1 0 0
xx-www-3.portal 266 0 1028 2 455 1 1 0
xx-www-12.portal 266 0 1070 2 561 1 3 2
xx-www-5.portal 300 0 1431 3 593 1 2 2
xx-www-10.portal 333 0 1221 3 678 1 3 2
xx-www-6.portal 334 0 989 2 446 1 1 0
xx-www-8.portal 358 0 1271 3 497 1 1 0
xx-www-4.portal 395 1 1544 4 744 2 0 0
xx-www-2.portal 445 1 1729 4 764 2 3 2
xx-www-1.portal 962 2 3555 9 1670 4 22 21
xx-www-7.portal 1016 2 3107 8 1643 4 117 115
====== ===== ====== ===== ====== ===== ===== =====
5378 14 20833 58 10607 29 207 199
Running for 359 seconds.

CONN total number of connections
CONN/s average number of connections per second
QRS total number of queries
QRS/s average number of queries per second
NCQRS total number of executed queries
NCQRS/s average number of executed queries per second
TIME total clock time in seconds for all queries
VTIME total CPU time in seconds for all queries

The NCQRS column represents the number of queries which were not served from the MySQL Query Cache while QRS represents all queries issued to MySQL (cached, non-cached or even non-valid queries). If values of VTIME are very close to values of TIME it means that queries are mostly CPU bound. On the other hand the bigger the difference between them the more time is spent on I/O. Another interesting thing to watch is how evenly load is coming from different clients especially in environments where clients are identical www servers behind load balancer and should be generating about the same traffic to a database.

All values are measured since the script was started. There might be some discrepancies with totals in the summary line - this is due to rounding errors. The script should work for MySQL versions 5.0.x, 5.1.x and perhaps for other versions as well. The script doesn't take into account connections made over a socket file - only tcp/ip connections.

The script requires PID of a mysql database as its first argument and a frequency at which output should be refreshed as a second argument, for example to monitor mysql instance with PID 12345 and refresh output every 10s:

./mysql_top.d 12345 10s



# cat mysql_top.d
#!/usr/sbin/dtrace -qCs

/*
Robert Milkowski
*/

#define CLIENTS self->client_ip == "10.10.10.11" ? "xx-www-1.portal" : \
self->client_ip == "10.10.10.12" ? "xx-www-2.portal" : \
self->client_ip == "10.10.10.13" ? "xx-www-3.portal" : \
self->client_ip == "10.10.10.14" ? "xx-www-4.portal" : \
self->client_ip == "10.10.10.15" ? "xx-www-5.portal" : \
self->client_ip == "10.10.10.16" ? "xx-www-6.portal" : \
self->client_ip == "10.10.10.17" ? "xx-www-7.portal" : \
self->client_ip == "10.10.10.18" ? "xx-www-8.portal" : \
self->client_ip == "10.10.10.19" ? "xx-www-9.portal" : \
self->client_ip == "10.10.10.20" ? "xx-www-10.portal" : \
self->client_ip == "10.10.10.21" ? "xx-www-11.portal" : \
self->client_ip == "10.10.10.22" ? "xx-www-12.portal" : \
self->client_ip == "10.10.10.23" ? "xx-www-13.portal" : \
self->client_ip == "10.10.10.29" ? "xx-cms-1.portal" : \
self->client_ip

#define LEGEND "\n \
CONN total number of connections \n \
CONN/s average number of connections per second \n \
QRS total number of queries \n \
QRS/s average number of queries per second \n \
NCQRS total number of executed queries \n \
NCQRS/s average number of executed queries per second \n \
TIME total clock time in seconds for all queries \n \
VTIME total CPU time in seconds for all queries\n"

BEGIN
{
start = timestamp;
total_queries = 0;
total_nc_queries = 0;
total_conn = 0;
total_time = 0;
total_vtime = 0;
}

syscall::getpeername:entry
/ pid == $1 /
{
self->in = 1;

self->arg0 = arg0; /* int s */
self->arg1 = arg1; /* struct sockaddr * */
self->arg2 = arg2; /* size_t len */
}

syscall::getpeername:return
/ self->in /
{
this->len = *(socklen_t *) copyin((uintptr_t)self->arg2, sizeof(socklen_t));
this->socks = (struct sockaddr *) copyin((uintptr_t)self->arg1, this->len);
this->hport = (uint_t)(this->socks->sa_data[0]);
this->lport = (uint_t)(this->socks->sa_data[1]);
this->hport <<= 8; this->port = this->hport + this->lport;

this->a1 = lltostr((uint_t)this->socks->sa_data[2]);
this->a2 = lltostr((uint_t)this->socks->sa_data[3]);
this->a3 = lltostr((uint_t)this->socks->sa_data[4]);
this->a4 = lltostr((uint_t)this->socks->sa_data[5]);
this->s1 = strjoin(this->a1, ".");
this->s2 = strjoin(this->s1, this->a2);
this->s1 = strjoin(this->s2, ".");
this->s2 = strjoin(this->s1, this->a3);
this->s1 = strjoin(this->s2, ".");
self->client_ip = strjoin(this->s1, this->a4);

@conn[CLIENTS] = count();
@conn_ps[CLIENTS] = count();

total_conn++;

self->arg0 = 0;
self->arg1 = 0;
self->arg2 = 0;
}

pid$1::*close_connection*:entry
/ self->in /
{
self->in = 0;
self->client_ip = 0;
}

pid$1::*mysql_parse*:entry
/ self->in /
{
self->t = timestamp;
self->vt = vtimestamp;

@query[CLIENTS] = count();
@query_ps[CLIENTS] = count();

total_queries++;
}

pid$1::*mysql_parse*:return
/ self->in /
{
@time[CLIENTS] = sum(timestamp-self->t);
@vtime[CLIENTS] = sum(vtimestamp-self->vt);

total_time += (timestamp - self->t);
total_vtime += (vtimestamp - self->vt);

self->t = 0;
self->vt = 0;
}

pid$1::*mysql_execute_command*:entry
/ self-> in /
{
@nc_query[CLIENTS] = count();
@nc_query_ps[CLIENTS] = count();

total_nc_queries++;
}

tick-$2
{
/* clear the screen and move cursor to top left corner */
printf("\033[H\033[J");

this->seconds = (timestamp - start) / 1000000000;

normalize(@conn_ps, this->seconds);
normalize(@query_ps, this->seconds);
normalize(@nc_query_ps, this->seconds);
normalize(@time, 1000000000);
normalize(@vtime, 1000000000);

printf("%-16s %s %s %s %s %s %s %s %s\n", \
"CLIENT IP", "CONN", "CONN/s", "QRS", "QRS/s", "NCQRS", "NCQRS/s", "TIME", "VTIME");
printa("%-16s %@6d %@5d %@6d %@5d %@6d %@5d %@5d %@5d\n", \
@conn, @conn_ps, @query, @query_ps, @nc_query, @nc_query_ps, @time, @vtime);
printf("%-16s %s %s %s %s %s %s %s %s\n", \
"", "======", "=====", "======", "=====", "======", "=====", "=====", "=====");
printf("%-16s %6d %5d %6d %5d %6d %5d %5d %5d\n", "", \
total_conn, total_conn/this->seconds, total_queries, total_queries/this->seconds, \
total_nc_queries, total_nc_queries/this->seconds, \
total_time/1000000000, total_vtime/1000000000);

/*
denormalize(@conn_ps);
denormalize(@query_ps);
denormalize(@nc_query_ps);
denormalize(@total_time);
denormalize(@total_vtime);
*/

printf("Running for %d seconds.\n", this->seconds);

printf(LEGEND);
}

1 comment:

xtrondo said...

Awesome!