Skip to content

Instantly share code, notes, and snippets.

@priteau
Created March 26, 2012 19:43
Show Gist options
  • Select an option

  • Save priteau/2209115 to your computer and use it in GitHub Desktop.

Select an option

Save priteau/2209115 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""Print one random delay drawn from an exponential distribution.

Usage: exp.py MEAN

MEAN is parsed with int() and is the mean of the distribution; the sample is
written to stdout as a float.
"""
import random
import sys


def sample_delay(mean):
    """Return one draw from an exponential distribution with the given mean.

    random.expovariate takes the rate (1 / mean), hence the inversion.
    """
    return random.expovariate(1.0 / mean)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        sys.exit("usage: %s MEAN" % sys.argv[0])
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the bare `print x` statement is a SyntaxError on Python 3.
    print(sample_delay(int(sys.argv[1])))
#!/usr/bin/env ruby
require 'json'
require 'logger'
require 'net/ssh'
require 'net/ssh/multi'
require 'pp'
require 'time'
require 'yaml'
# Experiment parameters.
PRESERVE_N = 10   # value sent as engine_conf.preserve_n and the running-instance count waited for
MTBF = 120        # argument handed to ./exp.py when drawing a failure delay
STUDY_TIME = 600  # length of the failure-injection phase, in seconds

# Both variables are mandatory; stop immediately if either one is missing.
%w[RABBITMQ_HOST RUN_NAME].each do |var|
  abort "#{var} is not set!" unless ENV[var]
end
rabbitmq_host, run_name = ENV.values_at('RABBITMQ_HOST', 'RUN_NAME')

# Progress messages go to standard output at INFO level and above.
logger = Logger.new(STDOUT)
logger.level = Logger::INFO
# Push the settings for this run to epu1, one `ceictl ... epu reconfigure`
# invocation per setting, in the order listed. Each row is
# [value type flag, setting key, value].
[
  ['bool', 'health.monitor_health', true],
  ['int', 'health.boot_timeout', 15],
  ['int', 'health.missing_timeout', 15],
  ['int', 'health.really_missing_timeout', 5],
  ['int', 'health.zombie_seconds', 15],
  ['int', 'engine_conf.preserve_n', PRESERVE_N]
].each do |type, key, value|
  `source ~/.virtualenvs/ceiclient/bin/activate; ceictl -n #{run_name} -Y epu reconfigure --#{type} #{key}=#{value} epu1`
end
# Block until the description of epu1 lists exactly PRESERVE_N instances in
# the '600-RUNNING' state, polling `ceictl ... epu describe` every 10 seconds.
loop do
  y = YAML.load(`source ~/.virtualenvs/ceiclient/bin/activate; ceictl -n #{run_name} -Y epu describe epu1`)
  running_instances = y['instances'].select { |i| i['state'] == '600-RUNNING' }
  logger.info("Found #{running_instances.count} running instances")
  break if running_instances.count == PRESERVE_N

  # Pause between polls so ceictl is not invoked back-to-back.
  sleep 10
end
# Failure-injection phase. For STUDY_TIME seconds, every instance reported as
# '600-RUNNING' is given a failure time drawn by ./exp.py, and when the
# earliest scheduled time is reached the supervisord process on that instance
# is killed over SSH.
logger.info "Starting total impact study for #{STUDY_TIME} seconds"
study_start = Time.now
study_end = study_start + STUDY_TIME
killed_instances = [] # IDs of instances that have already been killed
next_failures = {}    # instance ID => Time at which it is scheduled to fail
# NOTE: the deadline is only checked at the top of each iteration, so the last
# kill can still happen after study_end.
while Time.now < study_end
  # Check for new instances
  y = YAML.load(`source ~/.virtualenvs/ceiclient/bin/activate; ceictl -n #{run_name} -Y epu describe epu1`)
  running_instances = y['instances'].select { |i| i['state'] == '600-RUNNING' }
  running_instances.each do |i|
    instance_id = i['instance_id']
    next if killed_instances.include?(instance_id) || next_failures.key?(instance_id)

    now = Time.now
    # Float() raises when the sampler printed nothing or garbage; String#to_i
    # would silently yield 0 and schedule an immediate kill.
    t = Float(`./exp.py #{MTBF}`).to_i
    logger.info("Next failure of node #{instance_id} is in #{t} seconds")
    next_failures[instance_id] = now + t
  end
  if next_failures.empty?
    sleep 10
    next
  end
  # Take the instance with the earliest scheduled failure off the schedule.
  failed_instance_id, fail_time = next_failures.min_by { |_id, time| time }
  next_failures.delete(failed_instance_id)
  failed_instance = y['instances'].find { |i| i['instance_id'] == failed_instance_id }
  if failed_instance.nil?
    # The instance was scheduled in an earlier iteration and is absent from
    # the latest description, so there is no address to connect to.
    logger.warn("Instance #{failed_instance_id} is no longer listed, skipping it")
    next
  end
  now = Time.now
  if now < fail_time
    sleep(fail_time - now)
  else
    logger.info("I am already late to kill instance #{failed_instance_id}")
  end
  Net::SSH.start(failed_instance['public_ip'].to_s, 'ubuntu', :keys => [File.expand_path('~/.ssh/ooi')]) do |ssh|
    ssh.exec!("sudo kill `cat /home/cc/appmonitor/supervisord.pid`")
    logger.info("Killed EPU agent on instance #{failed_instance_id}")
  end
  killed_instances << failed_instance_id
end
logger.info("Ending total impact study for #{STUDY_TIME} seconds")
# Copy the EPU management logs from the RabbitMQ host into this run's local
# log directory, then load the main log file as an array of lines.
runlog_dir = "~/.epumgmt/runlogs/#{run_name}/epum"
`mkdir -p #{runlog_dir}/`
`scp -r ubuntu@#{rabbitmq_host}:/home/epu/epum/logs #{runlog_dir}`
lines = File.readlines(File.expand_path("#{runlog_dir}/logs/logfile.txt"))
# Turn the timestamped log lines that fall inside the study window into
# events of the form { :time => Time, :data => rest of the line }.
# Lines that do not start with a "<date> <time> " prefix are ignored.
events = []
lines.each do |l|
  # Both timestamp parts must be non-empty. With `*` quantifiers an indented
  # continuation line (for example "  File ...") matched with a blank
  # timestamp, and Time.parse then raised and aborted the whole analysis.
  t = l.match(/^([0-9-]+ [0-9:.]+) (.*)$/)
  next unless t

  begin
    time = Time.parse(t[1])
  rescue StandardError
    puts "Failed to parse timestamp in line '#{l}'"
    raise
  end
  events << { :time => time, :data => t[2] } if time.between?(study_start, study_end)
end
# Get all heartbeats: every event whose text contains "Got node heartbeat: "
# carries a dict printed with single quotes and u'' prefixes; rewrite the
# quoting so the payload can be read by the JSON parser.
heartbeats = []
events.each do |e|
  payload = e[:data][/Got node heartbeat: (.*)$/, 1]
  next unless payload

  json = payload.gsub(/u'/, "'").gsub(/'/, '"')
  begin
    heartbeats.push(JSON.parse(json))
  rescue
    puts "Could not parse JSON in line #{e[:data]}"
    raise
  end
end
# Run time is the span between the smallest and the largest heartbeat
# timestamp, rounded to a whole number.
# Without any heartbeat there is nothing to measure; stop with a clear message
# instead of failing on nil arithmetic.
abort 'No heartbeats found in the study window' if heartbeats.empty?
first_hb, last_hb = heartbeats.map { |h| h['timestamp'] }.minmax
run_time = (last_hb - first_hb).round(0)
puts "Run time was #{run_time}"
# For each node, add up the gaps between consecutive heartbeats (in the order
# they were collected) and remember the total under the node's ID.
total_times = {}
# Group by node ID
heartbeats.group_by { |h| h['node_id'] }.each do |node, hbs|
  elapsed = 0
  # Walk the list while carrying the previous heartbeat along; the first
  # element is paired with itself and therefore contributes a zero gap.
  hbs.inject(hbs.first) do |earlier, later|
    elapsed += later['timestamp'] - earlier['timestamp']
    later
  end
  total_times[node] = elapsed
  puts "Node #{node} sent heartbeats during #{elapsed} seconds"
end
# Final report: total heartbeat time over all nodes, compared with
# run_time * PRESERVE_N and expressed as a percentage.
heartbeat_seconds = total_times.values.reduce(:+).round(2)
expected_seconds = run_time * PRESERVE_N
coverage = (100.0 * heartbeat_seconds / expected_seconds).round(2)
puts "#{heartbeat_seconds} seconds of heartbeats over #{expected_seconds} total (#{coverage}%)"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment