转载

阿里云收集服务器性能指标的 Python 脚本

下面是阿里云服务器用于收集性能指标的 Python 脚本(sampler.py):

#!/usr/bin/python
#########################################
# Function: sample linux performance indices
# Usage:    python sampler.py
# Author:   CMS DEV TEAM
# Company:  Aliyun Inc.
# Version:  1.1
#########################################
"""Sample Linux performance indices and POST them to a monitoring endpoint.

The script reads CPU, memory, disk, disk I/O, network, load, TCP state and
process count figures from /proc and a few command line tools, renders them
as one text line per metric and sends the result with an HTTP POST.

The code is written to run on Python 2.6+ as well as Python 3.
"""
import os
import os.path
import sys
import time
import operator
import logging
import socket
import random
from shutil import copyfile
from subprocess import Popen, PIPE
from logging.handlers import RotatingFileHandler

try:
    import httplib  # Python 2
except ImportError:
    import http.client as httplib  # Python 3

# A real logger object is available as soon as the module is imported, so the
# helpers below can log even when they are used outside of main().
logger = logging.getLogger('sampler')

REMOTE_HOST = None
REMOTE_PORT = None
REMOTE_MONITOR_URI = None
UUID = None

MONITOR_DATA_FILE_DIR = "/tmp"
LOG_FILE = "/tmp/" + "vm.log"
LOG_LEVEL = logging.INFO
LOG_FILE_MAX_BYTES = 1024 * 1024
LOG_FILE_MAX_COUNT = 3

CONFIG_FILE = "../cmscfg"
UUID_FILE = "../aegis_quartz/conf/uuid"

# Previous /proc/stat sample, kept between runs so that CPU usage can be
# computed over the interval between two invocations.
CPU_STAT_SNAPSHOT = '/tmp/cpu_stat'
DISK_IO_FILE = '/tmp/disk_io'

# bytes * 8 / 1024 -> kilobits
BYTES_TO_KILOBITS = 0.0078125

# Filesystem types that are never reported by check_disk().
black_list = ('iso9660',)

_CLOCK_TICKS = os.sysconf("SC_CLK_TCK")


def usage_percent(use, total):
    """Return ``use`` as a percentage of ``total``.

    Raises:
        Exception: if ``total`` is zero.
    """
    try:
        ret = (float(use) / total) * 100
    except ZeroDivisionError:
        raise Exception("ERROR - zero division error")
    return ret


def get_mem_usage_percent():
    """Return ``(physical_percent, swap_percent)`` read from /proc/meminfo.

    Physical usage excludes buffers and page cache. Returns None when the
    file cannot be read or does not contain a usable MemTotal/MemFree pair.
    """
    wanted = ('MemTotal:', 'MemFree:', 'Buffers:', 'Cached:',
              'SwapTotal:', 'SwapFree:')
    values = {}
    try:
        with open('/proc/meminfo', 'r') as f:
            for line in f:
                fields = line.split()
                if len(fields) >= 2 and fields[0] in wanted:
                    values[fields[0]] = int(fields[1])
    except (IOError, OSError, ValueError):
        return None

    mem_total = values.get('MemTotal:', 0)
    if mem_total <= 0 or 'MemFree:' not in values:
        return None
    reclaimable = (values['MemFree:'] + values.get('Buffers:', 0)
                   + values.get('Cached:', 0))
    physical_percent = usage_percent(mem_total - reclaimable, mem_total)

    virtual_percent = 0
    vmem_total = values.get('SwapTotal:', 0)
    if vmem_total > 0:
        vmem_free = values.get('SwapFree:', 0)
        virtual_percent = usage_percent(vmem_total - vmem_free, vmem_total)
    return physical_percent, virtual_percent


def get_disk_partition():
    """Return the mount points whose filesystem type is a non-"nodev" type.

    Filesystem types come from /proc/filesystems (minus ``black_list``) and
    mounts from /etc/mtab. Returns None when either file cannot be read.
    """
    fs_types = []
    mount_points = []
    try:
        with open("/proc/filesystems", "r") as f:
            for line in f:
                if line.startswith("nodev"):
                    continue
                fs_type = line.strip()
                if fs_type and fs_type not in black_list:
                    fs_types.append(fs_type)
        with open('/etc/mtab', "r") as f:
            for line in f:
                if line.startswith('none'):
                    continue
                fields = line.split()
                # device, mount point, fs type, ... ; skip malformed lines
                if len(fields) < 3 or fields[2] not in fs_types:
                    continue
                mount_points.append(fields[1])
    except (IOError, OSError):
        return None
    return mount_points


def check_disk():
    """Return ``{mount_point: (used_percent, total_size)}`` as '%.2f' strings.

    A mount point that cannot be inspected is skipped instead of discarding
    the figures of every other mount point. Returns None when the list of
    mount points is unavailable.
    """
    mount_points = get_disk_partition()
    if mount_points is None:
        return None
    usage = {}
    for mount_point in mount_points:
        try:
            stat = os.statvfs(mount_point)
        except OSError as ex:
            logger.warning("statvfs failed for %s: %s" % (mount_point, ex))
            continue
        total = stat.f_blocks * stat.f_frsize
        if total <= 0:
            continue
        used = (stat.f_blocks - stat.f_bfree) * stat.f_frsize
        # NOTE(review): the divisor mixes 1024 and 1000000; kept unchanged so
        # that reported sizes stay comparable with earlier samples.
        usage[mount_point] = ('%.2f' % (usage_percent(used, total),),
                              '%.2f' % (total * 1.0 / (1024 * 1000000)))
    return usage


def _read_cpu_times(path):
    """Return ``(total, idle, iowait)`` from the first line of ``path``.

    Raises ValueError/IndexError when the line is not a /proc/stat cpu line.
    """
    with open(path, 'r') as f:
        fields = f.readline().split()
    counters = [int(value) for value in fields[1:]]
    return sum(counters), counters[3], counters[4]


def _cpu_busy_percent(before, after, clamp=False):
    """Return the busy percentage between two samples, or None if implausible.

    With ``clamp`` set, an out-of-range result is forced into 0..100 instead
    of being rejected; None is then only returned when no time has elapsed.
    """
    total = after[0] - before[0]
    idle = after[1] - before[1]
    iowait = after[2] - before[2]
    if total <= 0:
        return None
    percent = int(100.0 * (total - idle - iowait) / total)
    plausible = idle >= 0 and iowait >= 0 and 0 <= percent <= 100
    if plausible:
        return percent
    if clamp:
        return max(0, min(100, percent))
    return None


def get_cpu_time():
    """Return the CPU utilisation percentage (int) since the last snapshot.

    The previous /proc/stat sample is kept in CPU_STAT_SNAPSHOT. When no
    usable snapshot exists, one is taken and the function waits one second
    before sampling again. If the computed figures are implausible (counter
    reset, stale snapshot) a fresh one-second interval is measured instead.
    """
    need_sleep = False
    if (not os.path.isfile(CPU_STAT_SNAPSHOT)
            or os.path.getsize(CPU_STAT_SNAPSHOT) == 0):
        copyfile('/proc/stat', CPU_STAT_SNAPSHOT)
        need_sleep = True
    try:
        before = _read_cpu_times(CPU_STAT_SNAPSHOT)
    except (ValueError, IndexError):
        # Corrupt snapshot: re-seed it rather than failing on every run.
        copyfile('/proc/stat', CPU_STAT_SNAPSHOT)
        need_sleep = True
        before = _read_cpu_times(CPU_STAT_SNAPSHOT)
    if need_sleep:
        time.sleep(1)

    after = _read_cpu_times('/proc/stat')
    cpu_percentage = _cpu_busy_percent(before, after)
    if cpu_percentage is None:
        # compensate logic: measure over a fresh one-second window
        time.sleep(1)
        latest = _read_cpu_times('/proc/stat')
        cpu_percentage = _cpu_busy_percent(after, latest, clamp=True)
        if cpu_percentage is None:
            logger.warning("cpu counters did not advance, reporting 0")
            cpu_percentage = 0

    copyfile('/proc/stat', CPU_STAT_SNAPSHOT)
    return cpu_percentage


def _read_net_dev():
    """Return ``{interface: (recv_kbit, sent_kbit)}`` totals from /proc/net/dev."""
    with open("/proc/net/dev", "r") as f:
        lines = f.readlines()
    totals = {}
    # The first two lines are column headers.
    for line in lines[2:]:
        name, colon, counters = line.partition(':')
        fields = counters.split()
        if not colon or len(fields) < 9:
            continue
        # Field 0 is bytes received, field 8 is bytes transmitted.
        received = float('%.4f' % (float(fields[0]) * BYTES_TO_KILOBITS))
        sent = float('%.4f' % (float(fields[8]) * BYTES_TO_KILOBITS))
        totals[name.strip()] = (received, sent)
    return totals


def network_io_kbitps():
    """Return ``{interface: (rx_kbit_per_s, tx_kbit_per_s)}``.

    /proc/net/dev is sampled twice, one second apart, and the difference of
    the cumulative counters is reported for every interface present in the
    second sample.
    """
    before = _read_net_dev()
    time.sleep(1)
    after = _read_net_dev()
    # Ignore interfaces that vanished in between: their cumulative counters
    # would otherwise be reported as if they were a per-second rate.
    still_present = dict((name, value) for name, value in before.items()
                         if name in after)
    return merge_with(after, still_present)


def disk_io_Kbps():
    """Return ``{device: (read_kB_per_s, write_kB_per_s)}`` taken from iostat.

    iostat prints two reports; the later one (the one-second interval)
    overwrites the first for each device. Returns None when iostat reports
    an error or its output file cannot be read.
    """
    iostat = Popen("iostat -d -k 1 2 | sed '/Device\\|Linux\\|^$/d' > /tmp/disk_io",
                   shell=True, stdout=PIPE, stderr=PIPE,
                   universal_newlines=True)
    iostat_error = iostat.communicate()[1].strip()
    if iostat_error:
        logger.error("iostat not exists, %s" % iostat_error)
        return None

    try:
        with open(DISK_IO_FILE, 'r') as f:
            lines = f.readlines()
    except (IOError, OSError) as ex:
        logger.error(ex)
        return None

    retdict = {}
    for line in lines:
        fields = line.split()
        # Device, tps, kB_read/s, kB_wrtn/s, ... (newer iostat versions add
        # further columns, so only a minimum width is required).
        if len(fields) < 4:
            continue
        try:
            retdict[fields[0]] = (float(fields[2]), float(fields[3]))
        except ValueError:
            continue
    return retdict


def merge_with(d1, d2, fn=lambda x, y: tuple(map(operator.sub, x, y))):
    """Return a copy of ``d1`` with every entry of ``d2`` merged in.

    For keys present in both, the result is ``fn(d1[key], d2[key])`` (by
    default the element-wise difference of two tuples). Keys only present in
    ``d2`` are copied unchanged.
    """
    res = d1.copy()
    for key, val in d2.items():
        try:
            res[key] = fn(res[key], val)
        except KeyError:
            res[key] = val
    return res


def get_load():
    """Return the 1, 5 and 15 minute load averages, or None on failure."""
    try:
        with open('/proc/loadavg', 'r') as f:
            fields = f.readline().split()
        return float(fields[0]), float(fields[1]), float(fields[2])
    except (IOError, OSError, ValueError, IndexError):
        return None


def get_tcp_status():
    """Return TCP connection counts per state as ``"STATE=count ..."``.

    ``ss`` is used when available, ``netstat`` otherwise.
    """
    check_proc = Popen("command -v ss", shell=True, stdout=PIPE,
                       universal_newlines=True)
    ss = check_proc.communicate()[0].rstrip('\n')
    if ss:
        cmd = ("ss -ant | awk '{if(NR != 1) print $1}' | "
               "awk '{state=$1;arr[state]++} END{for(i in arr){printf \"%s=%s \", i,arr[i]}}' | "
               "sed 's/-/_/g' | sed 's/ESTAB=/ESTABLISHED=/g' | sed 's/FIN_WAIT_/FIN_WAIT/g'")
    else:
        cmd = ("netstat -anp | grep tcp | awk '{print $6}' | "
               "awk '{state=$1;arr[state]++} END{for(i in arr){printf \"%s=%s \", i,arr[i]}}' | "
               "tail -n 1")
    tcp_proc = Popen(cmd, shell=True, stdout=PIPE, universal_newlines=True)
    return tcp_proc.communicate()[0].rstrip('\n')


def get_proc_number():
    """Return the line count of ``ps axu`` as a string.

    NOTE(review): the count includes the ps header line; kept unchanged so
    the reported series stays continuous.
    """
    proc_func = Popen("ps axu | wc -l | tail -n 1", shell=True, stdout=PIPE,
                      universal_newlines=True)
    return proc_func.communicate()[0].strip()


def all_index():
    """Collect every index once and return them as a tuple.

    Order: timestamp (ms), cpu, memory, disk usage, disk I/O, network I/O,
    load, TCP status, process count.
    """
    return (
        int(time.time() * 1000),
        get_cpu_time(),
        get_mem_usage_percent(),
        check_disk(),
        disk_io_Kbps(),
        network_io_kbitps(),
        get_load(),
        get_tcp_status(),
        get_proc_number()
    )


def format_metric(name, timestamp, value, unit, uuid=None, extra=None):
    """Render one newline-terminated metric line.

    Layout: ``name timestamp value ns=ACS/ECS unit=<unit>`` followed by an
    optional ``instanceId=<uuid>`` tag and an optional ``extra`` tag.
    """
    parts = [name, str(timestamp), str(value), 'ns=ACS/ECS', 'unit=%s' % unit]
    if uuid:
        parts.append('instanceId=%s' % uuid)
    if extra:
        parts.append(extra)
    return ' '.join(parts) + '\n'


def build_report(timestamp, cpu, mem, disk, disk_io, net, load, tcp_status,
                 process_number, uuid=None):
    """Return the request body for one sample: one line per metric.

    Any index that is None or empty is left out of the report instead of
    aborting it.
    """
    lines = [format_metric('vm.CPUUtilization', timestamp, cpu, 'Percent', uuid)]
    if mem:
        lines.append(format_metric('vm.MemoryUtilization', timestamp, mem[0],
                                   'Percent', uuid))
    if load:
        for value, period in zip(load, ('1min', '5min', '15min')):
            lines.append(format_metric('vm.LoadAverage', timestamp, value,
                                       'count', uuid, 'period=%s' % period))
    if disk:
        for name, value in disk.items():
            lines.append(format_metric('vm.DiskUtilization', timestamp,
                                       value[0], 'Percent', uuid,
                                       'mountpoint=%s' % name))
    if disk_io:
        entries = list(disk_io.items())
        # All read lines first, then all write lines.
        for name, value in entries:
            lines.append(format_metric('vm.DiskIORead', timestamp, value[0],
                                       'Kilobytes/Second', uuid,
                                       'diskname=%s' % name))
        for name, value in entries:
            lines.append(format_metric('vm.DiskIOWrite', timestamp, value[1],
                                       'Kilobytes/Second', uuid,
                                       'diskname=%s' % name))
    if net:
        entries = list(net.items())
        for name, value in entries:
            lines.append(format_metric('vm.InternetNetworkRX', timestamp,
                                       value[0], 'Kilobits/Second', uuid,
                                       'netname=%s' % name))
        for name, value in entries:
            lines.append(format_metric('vm.InternetNetworkTX', timestamp,
                                       value[1], 'Kilobits/Second', uuid,
                                       'netname=%s' % name))
    if tcp_status:
        for element in tcp_status.split():
            state, separator, count = element.partition('=')
            if not separator:
                continue
            lines.append(format_metric('vm.TcpCount', timestamp, count,
                                       'Count', uuid, 'state=%s' % state))
    if process_number:
        lines.append(format_metric('vm.ProcessCount', timestamp,
                                   process_number, 'Count', uuid))
    return ''.join(lines)


def collector():
    """Sample all indices, print the report and POST it to the monitor.

    The POST is delayed by a random 0-5 seconds. HTTP failures are logged
    and not raised.
    """
    (timestamp, cpu, mem, disk, disk_io, net, load, tcp_status,
     process_number) = all_index()
    data_post = build_report(timestamp, cpu, mem, disk, disk_io, net, load,
                             tcp_status, process_number, uuid=UUID)
    print(data_post)

    # Random delay before sending.
    interval = random.randint(0, 5000)
    time.sleep(interval / 1000.0)

    body = data_post
    if not isinstance(body, bytes):
        body = body.encode('utf-8')
    headers = {"Content-Type": "text/plain", "Accept": "text/plain"}
    http_client = None
    try:
        http_client = httplib.HTTPConnection(REMOTE_HOST, REMOTE_PORT)
        http_client.request(method="POST", url=REMOTE_MONITOR_URI, body=body,
                            headers=headers)
        response = http_client.getresponse()
        if response.status != 200:
            logger.warning("response code %d" % response.status)
            logger.warning("response code %s" % response.read())
    except Exception as ex:
        logger.error(ex)
    finally:
        if http_client:
            http_client.close()


def _load_properties(path):
    """Return the ``key=value`` pairs of ``path``; other lines are ignored."""
    props = {}
    with open(path, 'r') as prop_file:
        for line in prop_file:
            key, separator, value = line.partition('=')
            if separator:
                props[key.strip()] = value.strip()
    return props


def _configure_logging():
    """Attach the rotating file handler to the module logger."""
    logger.setLevel(LOG_LEVEL)
    handler = RotatingFileHandler(filename=LOG_FILE, mode='a',
                                  maxBytes=LOG_FILE_MAX_BYTES,
                                  backupCount=LOG_FILE_MAX_COUNT)
    formatter = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)


def main():
    """Read the optional configuration, then run one collection cycle."""
    global REMOTE_HOST, REMOTE_PORT, REMOTE_MONITOR_URI, UUID

    REMOTE_HOST = 'open.cms.aliyun.com'
    REMOTE_PORT = 80
    REMOTE_MONITOR_URI = "/metrics/putLines"

    _configure_logging()

    # get report address
    if os.path.isfile(CONFIG_FILE):
        props = _load_properties(CONFIG_FILE)
        if props.get('report_domain'):
            REMOTE_HOST = props.get('report_domain')
        if props.get('report_port'):
            try:
                REMOTE_PORT = int(props.get('report_port'))
            except ValueError:
                logger.error("invalid report_port %r, using %d"
                             % (props.get('report_port'), REMOTE_PORT))

    # get uuid
    if os.path.isfile(UUID_FILE):
        with open(UUID_FILE, 'r') as uuid_file:
            # strip(): a trailing newline would otherwise end up inside
            # every instanceId tag and break the line format.
            UUID = uuid_file.readline().strip().lower()

    socket.setdefaulttimeout(10)
    try:
        collector()
    except Exception as e:
        logger.error(e)
        sys.exit(1)


if __name__ == '__main__':
    main()
正文到此结束
Loading...