Created
March 26, 2012 19:43
-
-
Save priteau/2209115 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Print one random failure delay drawn from an exponential distribution.

Usage: exp.py MTBF

MTBF is a positive integer; the printed value is a float sampled with
rate 1/MTBF, so its expected value is MTBF.
"""
import random
import sys


def sample_failure_delay(mtbf):
    """Return one sample of an exponential distribution with rate 1/mtbf."""
    return random.expovariate(1.0 / mtbf)


def main(argv):
    """Validate the command line and print a single sample."""
    if len(argv) != 2:
        sys.exit("usage: %s MTBF" % argv[0])
    mtbf = int(argv[1])
    if mtbf <= 0:
        # A zero MTBF would otherwise fail with ZeroDivisionError.
        sys.exit("MTBF must be a positive integer")
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(sample_failure_delay(mtbf))


if __name__ == "__main__":
    main(sys.argv)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env ruby | |
| require 'json' | |
| require 'logger' | |
| require 'net/ssh' | |
| require 'net/ssh/multi' | |
| require 'pp' | |
| require 'time' | |
| require 'yaml' | |
# Experiment parameters.
PRESERVE_N = 10   # pushed to engine_conf.preserve_n; also the running-instance count waited for
MTBF = 120        # argument passed to ./exp.py when drawing the delay before a failure
STUDY_TIME = 600  # duration of the study, in seconds

# Both environment variables are mandatory; stop immediately if one is missing.
%w[RABBITMQ_HOST RUN_NAME].each do |var|
  abort "#{var} is not set!" if ENV[var].nil?
end

rabbitmq_host = ENV['RABBITMQ_HOST']
run_name = ENV['RUN_NAME']

logger = Logger.new(STDOUT)
logger.level = Logger::INFO

# Settings applied to epu1, one ceictl invocation per entry, in this order.
[
  '--bool health.monitor_health=true',
  '--int health.boot_timeout=15',
  '--int health.missing_timeout=15',
  '--int health.really_missing_timeout=5',
  '--int health.zombie_seconds=15',
  "--int engine_conf.preserve_n=#{PRESERVE_N}"
].each do |setting|
  `source ~/.virtualenvs/ceiclient/bin/activate; ceictl -n #{run_name} -Y epu reconfigure #{setting} epu1`
  # Backticks never raise on a non-zero exit, so report failures explicitly
  # instead of letting a rejected setting go unnoticed.
  unless $?.success?
    logger.warn("ceictl reconfigure #{setting} exited with status #{$?.exitstatus}")
  end
end
# Poll the EPU until exactly PRESERVE_N instances report the 600-RUNNING state.
loop do
  y = YAML.load(`source ~/.virtualenvs/ceiclient/bin/activate; ceictl -n #{run_name} -Y epu describe epu1`)
  running_instances = y['instances'].select { |i| i['state'] == '600-RUNNING' }
  logger.info("Found #{running_instances.count} running instances")
  break if running_instances.count == PRESERVE_N

  # Pause between polls so ceictl is not invoked in a tight loop.
  sleep 10
end
# Failure injection: until the study window closes, schedule a random failure
# time for every running instance and kill the supervisord process on each
# instance over SSH when its time comes.
logger.info "Starting total impact study for #{STUDY_TIME} seconds"
study_start = Time.now
study_end = study_start + STUDY_TIME
killed_instances = [] # IDs of instances that have already been killed
next_failures = {}    # instance ID => Time at which it is scheduled to fail

while Time.now < study_end
  # Check for new instances
  y = YAML.load(`source ~/.virtualenvs/ceiclient/bin/activate; ceictl -n #{run_name} -Y epu describe epu1`)
  running_instances = y['instances'].select { |i| i['state'] == '600-RUNNING' }
  running_instances.reject { |i| killed_instances.include?(i['instance_id']) }.each do |i|
    next unless next_failures[i['instance_id']].nil?

    now = Time.now
    t = `./exp.py #{MTBF}`.to_i
    logger.info("Next failure of node #{i['instance_id']} is in #{t} seconds")
    next_failures[i['instance_id']] = now + t
  end

  if next_failures.empty?
    sleep 10
    next
  end

  # Handle the failure that is scheduled soonest.
  failed_instance_id, fail_time = next_failures.min_by { |_instance_id, time| time }
  next_failures.delete(failed_instance_id)

  # A failure scheduled after the end of the window lies outside the study:
  # wait out the remaining time and stop instead of killing late.
  if fail_time >= study_end
    remaining = study_end - Time.now
    sleep(remaining) if remaining > 0
    break
  end

  failed_instance = y['instances'].detect { |i| i['instance_id'] == failed_instance_id }
  if failed_instance.nil?
    # The schedule can outlive the instance; skip it rather than crash on nil.
    logger.warn("Instance #{failed_instance_id} is no longer listed, skipping it")
    next
  end

  now = Time.now
  if now < fail_time
    sleep(fail_time - now)
  else
    logger.info("I am already late to kill instance #{failed_instance_id}")
  end

  begin
    Net::SSH.start(failed_instance['public_ip'].to_s, "ubuntu", :keys => [ File.expand_path('~/.ssh/ooi') ]) do |ssh|
      ssh.exec!("sudo kill `cat /home/cc/appmonitor/supervisord.pid`")
      logger.info("Killed EPU agent on instance #{failed_instance_id}")
    end
  rescue StandardError => e
    # One unreachable instance must not abort the whole study. It is not
    # recorded as killed, so it gets a new failure time on the next pass.
    logger.error("Could not kill instance #{failed_instance_id}: #{e.class}: #{e.message}")
    next
  end
  killed_instances << failed_instance_id
end
logger.info("Ending total impact study for #{STUDY_TIME} seconds")

# Copy the logs directory from the RabbitMQ host into the local run directory.
`mkdir -p ~/.epumgmt/runlogs/#{run_name}/epum/`
`scp -r ubuntu@#{rabbitmq_host}:/home/epu/epum/logs ~/.epumgmt/runlogs/#{run_name}/epum`

lines = File.readlines(File.expand_path("~/.epumgmt/runlogs/#{run_name}/epum/logs/logfile.txt"))

# Keep every timestamped log line that falls inside the study window as
# { :time => Time, :data => rest of the line }.
events = []
lines.each do |l|
  # Both the date and the time part must be non-empty. With `*` quantifiers a
  # line that merely starts with two spaces (e.g. an indented traceback line)
  # matched with a blank timestamp and made Time.parse raise.
  t = l.match(/^([0-9-]+ [0-9:.]+) (.*)$/)
  next unless t

  begin
    time = Time.parse(t[1])
  rescue
    puts "Failed to parse timestamp in line '#{l}'"
    raise
  end
  events << { :time => time, :data => t[2] } if time >= study_start && time <= study_end
end
# Get all heartbeats
heartbeats = []
events.each do |e|
  m = e[:data].match(/Got node heartbeat: (.*)$/)
  next unless m

  # The payload appears to be a Python dict repr; rewrite it into JSON on a
  # best-effort basis.
  # Strip the u'' string prefix only where it starts a literal. A plain /u'/
  # also ate the final letter of values ending in "u" ('ubuntu' => 'ubunt').
  json = m[1].gsub(/(?<![\w'])u'/, "'")
  json.gsub!(/'/, '"')
  # Python literals that JSON spells differently.
  json.gsub!(/\b(?:True|False|None)\b/, 'True' => 'true', 'False' => 'false', 'None' => 'null')
  begin
    heartbeats << JSON.parse(json)
  rescue
    puts "Could not parse JSON in line #{e[:data]}"
    raise
  end
end
# Summarise heartbeat coverage: the overall span between the first and the last
# heartbeat, then per node the time between its first and last heartbeat.
# Without any heartbeat every figure below would be computed on nil.
abort 'No heartbeats found in the study window' if heartbeats.empty?

first_hb, last_hb = heartbeats.map { |h| h['timestamp'] }.minmax
run_time = (last_hb - first_hb).round(0)
puts "Run time was #{run_time}"

total_times = {}
# Group by node ID
hb_per_node = heartbeats.group_by { |h| h['node_id'] }
hb_per_node.each do |node, hbs|
  # Sum the gaps between consecutive heartbeats, in the order they were logged.
  total_time = 0
  previous_hb = hbs[0]
  hbs.each do |hb|
    total_time += hb['timestamp'] - previous_hb['timestamp']
    previous_hb = hb
  end
  total_times[node] = total_time
  puts "Node #{node} sent heartbeats during #{total_time} seconds"
end

sum = total_times.values.inject(:+).round(2)
puts "#{sum} seconds of heartbeats over #{run_time * PRESERVE_N} total (#{(100.0 * sum / (run_time * PRESERVE_N)).round(2)}%)"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment