priteau · March 26, 2012 19:43
diff --git a/exp.py b/exp.py
 #!/usr/bin/env python

 import random
 import sys

 # Print one sample from an exponential distribution whose mean is the integer
 # given as the first command-line argument (expovariate takes the rate, 1/mean).
 # The parenthesised call form runs under both Python 2 and Python 3.
 print(random.expovariate(1.0 / int(sys.argv[1])))
diff --git a/total_impact.rb b/total_impact.rb
 #!/usr/bin/env ruby

 require 'json'
 require 'logger'
 require 'net/ssh'
 require 'net/ssh/multi'
 require 'pp'
 require 'time'
 require 'yaml'

 # Number of instances the EPU is told to keep (engine_conf.preserve_n); the
 # study only starts once exactly this many are reported as 600-RUNNING.
 PRESERVE_N=10
 # Argument handed to ./exp.py, which draws each instance's time-to-failure
 # from an exponential distribution with this mean (used as seconds below).
 MTBF=120
 # Length of the failure-injection window, in seconds.
 STUDY_TIME=600

 # Refuse to start unless every required environment variable is present.
 %w[RABBITMQ_HOST RUN_NAME].each do |required|
   abort "#{required} is not set!" unless ENV.key?(required)
 end

 # Host the EPU management logs are copied from later, and the run name
 # passed to every ceictl invocation.
 rabbitmq_host, run_name = ENV.values_at('RABBITMQ_HOST', 'RUN_NAME')

 # Log to standard output at INFO level and above.
 logger = Logger.new(STDOUT).tap { |log| log.level = Logger::INFO }

 # Push the health-monitoring and capacity settings to the epu1 domain: one
 # `ceictl ... epu reconfigure` call per setting, in the order listed.
 ceictl_reconfigure = "source ~/.virtualenvs/ceiclient/bin/activate; ceictl -n #{run_name} -Y epu reconfigure"
 [
   '--bool health.monitor_health=true',
   '--int health.boot_timeout=15',
   '--int health.missing_timeout=15',
   '--int health.really_missing_timeout=5',
   '--int health.zombie_seconds=15',
   "--int engine_conf.preserve_n=#{PRESERVE_N}"
 ].each do |setting|
   `#{ceictl_reconfigure} #{setting} epu1`
 end

 # Block until exactly PRESERVE_N instances are reported as 600-RUNNING.
 # The 10 second pause must come after the check, outside the `if`: placed
 # after `break` inside the `if` it was unreachable and the loop called
 # ceictl back-to-back without ever waiting.
 while true
   y = YAML.load(`source ~/.virtualenvs/ceiclient/bin/activate; ceictl -n #{run_name} -Y epu describe epu1`)
   running_instances = y['instances'].select { |i| i['state'] == '600-RUNNING' }
   logger.info("Found #{running_instances.count} running instances")
   break if running_instances.count == PRESERVE_N

   sleep 10
 end

 logger.info "Starting total impact study for #{STUDY_TIME} seconds"
 study_start = Time.now
 study_end = study_start + STUDY_TIME

 # Failure injection. Every running instance that has not been killed yet gets
 # a failure time drawn by ./exp.py. On each pass the instance whose failure is
 # due first has the process named in supervisord.pid killed over SSH, and is
 # then remembered in killed_instances so it is never scheduled again.
 killed_instances = []
 next_failures = {} # instance_id => Time at which that instance is due to fail
 while Time.now < study_end
   # Check for new instances
   y = YAML.load(`source ~/.virtualenvs/ceiclient/bin/activate; ceictl -n #{run_name} -Y epu describe epu1`)
   running_instances = y['instances'].select { |i| i['state'] == '600-RUNNING' }
   running_instances.reject { |i| killed_instances.include?(i['instance_id']) }.each do |i|
     next unless next_failures[i['instance_id']].nil?

     now = Time.now
     t = `./exp.py #{MTBF}`.to_i
     logger.info("Next failure of node #{i['instance_id']} is in #{t} seconds")
     next_failures[i['instance_id']] = now + t
   end

   if next_failures.empty?
     sleep 10
     next
   end

   # Pick the failure that is due first.
   failed_instance_id, fail_time = next_failures.min_by { |_id, due| due }

   # A failure due after the study window is never injected: wait out the
   # remaining time and stop, instead of sleeping past study_end.
   if fail_time >= study_end
     remaining = study_end - Time.now
     sleep(remaining) if remaining > 0
     break
   end

   next_failures.delete(failed_instance_id)
   failed_instance = y['instances'].detect { |i| i['instance_id'] == failed_instance_id }
   if failed_instance.nil?
     # Scheduled on an earlier poll but absent from the latest listing, so
     # there is no address to connect to.
     logger.info("Instance #{failed_instance_id} is no longer listed, skipping it")
     next
   end

   now = Time.now
   if now < fail_time
     sleep(fail_time - now)
   else
     logger.info("I am already late to kill instance #{failed_instance_id}")
   end
   Net::SSH.start("#{failed_instance['public_ip']}", "ubuntu", :keys => [ File.expand_path('~/.ssh/ooi') ]) do |ssh|
     ssh.exec!("sudo kill `cat /home/cc/appmonitor/supervisord.pid`")
     logger.info("Killed EPU agent on instance #{failed_instance_id}")
   end
   killed_instances << failed_instance_id
 end

 logger.info("Ending total impact study for #{STUDY_TIME} seconds")

 # Copy the EPU management logs from the host named by RABBITMQ_HOST into the
 # local run directory, then load the main log file as an array of lines.
 `mkdir -p ~/.epumgmt/runlogs/#{run_name}/epum/`
 `scp -r ubuntu@#{rabbitmq_host}:/home/epu/epum/logs ~/.epumgmt/runlogs/#{run_name}/epum`

 logfile_path = File.expand_path("~/.epumgmt/runlogs/#{run_name}/epum/logs/logfile.txt")
 lines = File.readlines(logfile_path)

 # Turn log lines shaped like "<date> <time> <message>" into event hashes,
 # keeping only those stamped inside the study window (both bounds inclusive).
 # Lines that do not start with such a timestamp are ignored; a timestamp that
 # matches the pattern but cannot be parsed is reported and re-raised.
 events = []
 lines.each do |line|
   parsed = line.match(/^([0-9-]* [0-9:.]*) (.*)$/)
   next unless parsed

   begin
     time = Time.parse(parsed[1])
   rescue
     puts "Failed to parse timestamp in line '#{line}'"
     raise
   end
   events << { :time => time, :data => parsed[2] } if time.between?(study_start, study_end)
 end

 # Get all heartbeats
 # The payload after "Got node heartbeat: " uses u'...' strings and single
 # quotes (it looks like a Python 2 dict repr), so it is rewritten into JSON
 # before being parsed. Unparseable payloads are reported and re-raised.
 heartbeats = []
 events.each do |e|
   m = e[:data].match(/Got node heartbeat: (.*)$/)
   next unless m

   # Strip the u prefix only where it starts a token (word boundary). Without
   # the anchor, a value ending in "u" such as u'cpu' lost its last letter.
   json = m[1].gsub(/\bu'/, "'").tr("'", '"')
   begin
     heartbeats << JSON.parse(json)
   rescue
     puts "Could not parse JSON in line #{e[:data]}"
     raise
   end
 end

 # With no heartbeat in the study window there is nothing to measure; stop
 # with a clear message instead of failing on nil arithmetic below.
 abort "No heartbeats found in the study window" if heartbeats.empty?

 # Run time is the span between the earliest and the latest heartbeat.
 first_hb, last_hb = heartbeats.map { |h| h['timestamp'] }.minmax
 run_time = (last_hb - first_hb).round(0)
 puts "Run time was #{run_time}"

 total_times = {}
 # Group by node ID
 hb_per_node = heartbeats.group_by { |h| h['node_id'] }
 hb_per_node.each do |node, hbs|
   # Add up the gaps between consecutive heartbeats of this node, in the
   # order they were logged (the first heartbeat contributes a gap of zero).
   total_time = 0
   previous_hb = hbs.first
   hbs.each do |hb|
     total_time += hb['timestamp'] - previous_hb['timestamp']
     previous_hb = hb
   end
   total_times[node] = total_time
   puts "Node #{node} sent heartbeats during #{total_time} seconds"
 end

 # Compare the heartbeat time actually observed with the ideal of PRESERVE_N
 # nodes sending heartbeats for the whole run.
 sum = total_times.values.inject(:+).round(2)
 puts "#{sum} seconds of heartbeats over #{run_time * PRESERVE_N} total (#{(100.0 * sum / (run_time * PRESERVE_N)).round(2)}%)"
	#!/usr/bin/env python

	import random
	import sys

	# Print one sample from an exponential distribution whose mean is the integer
	# given as the first command-line argument (expovariate takes the rate, 1/mean).
	# The parenthesised call form runs under both Python 2 and Python 3.
	print(random.expovariate(1.0 / int(sys.argv[1])))
	#!/usr/bin/env ruby

	require 'json'
	require 'logger'
	require 'net/ssh'
	require 'net/ssh/multi'
	require 'pp'
	require 'time'
	require 'yaml'

	# Number of instances the EPU is told to keep (engine_conf.preserve_n).
	PRESERVE_N=10
	# Argument handed to ./exp.py: mean of the exponential time-to-failure.
	MTBF=120
	# Length of the failure-injection window, in seconds.
	STUDY_TIME=600

	# Refuse to start unless every required environment variable is set.
	# (Block parameters use plain pipes; backslash-escaped pipes do not parse.)
	[ 'RABBITMQ_HOST', 'RUN_NAME' ].each do |var|
	  abort "#{var} is not set!" if ENV[var].nil?
	end

	rabbitmq_host = ENV['RABBITMQ_HOST']
	run_name = ENV['RUN_NAME']

	# Log to standard output at INFO level and above.
	logger = Logger.new(STDOUT)
	logger.level = Logger::INFO

	# Push the health-monitoring and capacity settings to the epu1 domain: one
	# `ceictl ... epu reconfigure` call per setting, in the order listed.
	ceictl_reconfigure = "source ~/.virtualenvs/ceiclient/bin/activate; ceictl -n #{run_name} -Y epu reconfigure"
	[
	  '--bool health.monitor_health=true',
	  '--int health.boot_timeout=15',
	  '--int health.missing_timeout=15',
	  '--int health.really_missing_timeout=5',
	  '--int health.zombie_seconds=15',
	  "--int engine_conf.preserve_n=#{PRESERVE_N}"
	].each do |setting|
	  `#{ceictl_reconfigure} #{setting} epu1`
	end

	# Block until exactly PRESERVE_N instances are reported as 600-RUNNING.
	# The 10 second pause must come after the check, outside the `if`: placed
	# after `break` inside the `if` it was unreachable and the loop called
	# ceictl back-to-back without ever waiting.
	while true
	  y = YAML.load(`source ~/.virtualenvs/ceiclient/bin/activate; ceictl -n #{run_name} -Y epu describe epu1`)
	  running_instances = y['instances'].select { |i| i['state'] == '600-RUNNING' }
	  logger.info("Found #{running_instances.count} running instances")
	  break if running_instances.count == PRESERVE_N

	  sleep 10
	end

	logger.info "Starting total impact study for #{STUDY_TIME} seconds"
	study_start = Time.now
	study_end = study_start + STUDY_TIME

	# Failure injection. Every running instance that has not been killed yet gets
	# a failure time drawn by ./exp.py. On each pass the instance whose failure is
	# due first has the process named in supervisord.pid killed over SSH, and is
	# then remembered in killed_instances so it is never scheduled again.
	killed_instances = []
	next_failures = {} # instance_id => Time at which that instance is due to fail
	while Time.now < study_end
	  # Check for new instances
	  y = YAML.load(`source ~/.virtualenvs/ceiclient/bin/activate; ceictl -n #{run_name} -Y epu describe epu1`)
	  running_instances = y['instances'].select { |i| i['state'] == '600-RUNNING' }
	  running_instances.reject { |i| killed_instances.include?(i['instance_id']) }.each do |i|
	    next unless next_failures[i['instance_id']].nil?

	    now = Time.now
	    t = `./exp.py #{MTBF}`.to_i
	    logger.info("Next failure of node #{i['instance_id']} is in #{t} seconds")
	    next_failures[i['instance_id']] = now + t
	  end

	  if next_failures.empty?
	    sleep 10
	    next
	  end

	  # Pick the failure that is due first.
	  failed_instance_id, fail_time = next_failures.min_by { |_id, due| due }

	  # A failure due after the study window is never injected: wait out the
	  # remaining time and stop, instead of sleeping past study_end.
	  if fail_time >= study_end
	    remaining = study_end - Time.now
	    sleep(remaining) if remaining > 0
	    break
	  end

	  next_failures.delete(failed_instance_id)
	  failed_instance = y['instances'].detect { |i| i['instance_id'] == failed_instance_id }
	  if failed_instance.nil?
	    # Scheduled on an earlier poll but absent from the latest listing, so
	    # there is no address to connect to.
	    logger.info("Instance #{failed_instance_id} is no longer listed, skipping it")
	    next
	  end

	  now = Time.now
	  if now < fail_time
	    sleep(fail_time - now)
	  else
	    logger.info("I am already late to kill instance #{failed_instance_id}")
	  end
	  Net::SSH.start("#{failed_instance['public_ip']}", "ubuntu", :keys => [ File.expand_path('~/.ssh/ooi') ]) do |ssh|
	    ssh.exec!("sudo kill `cat /home/cc/appmonitor/supervisord.pid`")
	    logger.info("Killed EPU agent on instance #{failed_instance_id}")
	  end
	  killed_instances << failed_instance_id
	end

	logger.info("Ending total impact study for #{STUDY_TIME} seconds")

	# Copy the EPU management logs from the host named by RABBITMQ_HOST into the
	# local run directory, then load the main log file as an array of lines.
	`mkdir -p ~/.epumgmt/runlogs/#{run_name}/epum/`
	`scp -r ubuntu@#{rabbitmq_host}:/home/epu/epum/logs ~/.epumgmt/runlogs/#{run_name}/epum`

	lines = []
	File.open(File.expand_path("~/.epumgmt/runlogs/#{run_name}/epum/logs/logfile.txt")) do |f|
	  lines = f.readlines
	end

	# Turn log lines shaped like "<date> <time> <message>" into event hashes,
	# keeping only those stamped inside the study window (both bounds inclusive).
	# The time part and the message are both variable-length, so each needs its
	# `*` quantifier; without them only a one-character time followed by a
	# one-character message would match.
	events = []
	lines.each do |l|
	  t = l.match(/^([0-9-]* [0-9:.]*) (.*)$/)
	  next unless t

	  begin
	    time = Time.parse(t[1])
	  rescue
	    puts "Failed to parse timestamp in line '#{l}'"
	    raise
	  end
	  if time >= study_start && time <= study_end
	    events << { :time => time, :data => t[2] }
	  end
	end

	# Get all heartbeats
	# The payload after "Got node heartbeat: " uses u'...' strings and single
	# quotes (it looks like a Python 2 dict repr), so it is rewritten into JSON
	# before being parsed. Unparseable payloads are reported and re-raised.
	heartbeats = []
	events.each do |e|
	  m = e[:data].match(/Got node heartbeat: (.*)$/)
	  next unless m

	  # Strip the u prefix only where it starts a token (word boundary). Without
	  # the anchor, a value ending in "u" such as u'cpu' lost its last letter.
	  json = m[1].gsub(/\bu'/, "'").tr("'", '"')
	  begin
	    heartbeats << JSON.parse(json)
	  rescue
	    puts "Could not parse JSON in line #{e[:data]}"
	    raise
	  end
	end

	# With no heartbeat in the study window there is nothing to measure; stop
	# with a clear message instead of failing on nil arithmetic below.
	abort "No heartbeats found in the study window" if heartbeats.empty?

	# Run time is the span between the earliest and the latest heartbeat.
	first_hb, last_hb = heartbeats.map { |h| h['timestamp'] }.minmax
	run_time = (last_hb - first_hb).round(0)
	puts "Run time was #{run_time}"

	total_times = {}
	# Group by node ID
	hb_per_node = heartbeats.group_by { |h| h['node_id'] }
	hb_per_node.each do |node, hbs|
	  # Add up the gaps between consecutive heartbeats of this node, in the
	  # order they were logged (the first heartbeat contributes a gap of zero).
	  total_time = 0
	  previous_hb = hbs.first
	  hbs.each do |hb|
	    total_time += hb['timestamp'] - previous_hb['timestamp']
	    previous_hb = hb
	  end
	  total_times[node] = total_time
	  puts "Node #{node} sent heartbeats during #{total_time} seconds"
	end

	# Compare the heartbeat time actually observed with the ideal of PRESERVE_N
	# nodes sending heartbeats for the whole run.
	sum = total_times.values.inject(:+).round(2)
	puts "#{sum} seconds of heartbeats over #{run_time * PRESERVE_N} total (#{(100.0 * sum / (run_time * PRESERVE_N)).round(2)}%)"