Pastie now auto-senses if line-wrap is a bad or good idea. Feedback?
## mark a section (Learn more)
# ARGV[0] = filename of sequences to analyze # ARGV[1] = window size # ARGV[2] = output CSV file require 'rubygems' require 'benchmark' require 'fileutils' require 'inline' # using inline, commenting this out #class String # def hamming_distance(s) # sum = 0 # for i in 0...length # sum = sum + 1 if slice(i) != s.slice(i) # end # return sum # end #end class String inline do |builder| if test ?d, "/opt/local" then builder.add_compile_flags "-I/opt/local/include" builder.add_link_flags "-L/opt/local/lib" end builder.prefix <<-"END" int rb_str_hamming_distance(char *x, char *y) { int sum = 0; while (*x != '\\0') { if (*x != *y) sum++; ++x; ++y; } return sum; } END builder.prefix <<-"END" VALUE rb_str_sub_sequence(VALUE str, long offset, long window_size) { return rb_str_substr(str, offset, offset + window_size); } END builder.prefix <<-"END" int rb_str_hamming_inject(VALUE sequences, VALUE sequence, long index, long offset, long window_size) { VALUE sub_sequence, sub_sub_sequence; char *cstr_sequence = StringValueCStr(sequence); int sum = 0, i = index; for (i; i < RARRAY(sequences)->len; i++) { sub_sequence = RARRAY(sequences)->ptr[i]; sub_sub_sequence = rb_str_sub_sequence(RARRAY(sequences)->ptr[i], offset, window_size); sum += rb_str_hamming_distance(cstr_sequence, StringValueCStr(sub_sub_sequence)); } return sum; } END builder.c <<-"END" int hamming_distance(char *s) { return rb_str_hamming_distance(StringValueCStr(self), s); } END builder.c <<-"END" int hamming_inject(VALUE sequences, long index, long offset, long window_size) { return rb_str_hamming_inject(sequences, rb_str_sub_sequence(self, offset, window_size), index, offset, window_size); } END end end class Array inline do |builder| if test ?d, "/opt/local" then builder.add_compile_flags "-I/opt/local/include" builder.add_link_flags "-L/opt/local/lib" end builder.c <<-"END" int hamming_sum(long offset, long window_size) { VALUE sequence; int sum = 0, i = 0; for (i; i < RARRAY(self)->len; i++) { sequence = rb_str_sub_sequence(RARRAY(self)->ptr[i], offset, window_size); sum += rb_str_hamming_inject(self, sequence, i, offset, window_size); } return sum; } END end end module DNA class Cache attr_reader :filename def initialize(filename = 'data.cache') @filename = filename + (File.extname(filename) != 'cache' ? '.cache' : '') create end def handle @handle || open end def exists? File.file?(filename) end def clear delete end def data begin @data = Marshal.load(open('rb').read) rescue @data = 0 ensure close end @data end def save(object) handle.write(Marshal.dump(object)) close end protected def create FileUtils.touch(filename) end def open(flag = 'wb') create unless exists? @handle = File.open(filename, flag) end def close if @handle @handle.close @handle = nil end end def delete close FileUtils.rm(filename) end end end Benchmark.benchmark(" "*7 + Benchmark::CAPTION, 7, Benchmark::FMTSTR, "total:", "avg:") do |x| results = [] 10.times do results << x.report { sequences = IO.readlines(ARGV[0]) window_size = ARGV[1].to_i num_of_seqs = sequences.length genome_len = sequences[0].length cache = DNA::Cache.new(ARGV[2]) File.open(ARGV[2], cache.exists? ? 'a' : 'w') do |out| out.puts "window center,mean pairwise hamming distance" unless cache.exists? for offset in cache.data...(genome_len - window_size + 1) # the next line does all of the for loops in the inline code above sum = sequences.hamming_sum(offset, window_size) # commenting out the ruby version, using inline instead #sum = 0 #for i in 0...(num_of_seqs - 1) # temp = sequences[i][offset, offset + window_size] # for j in (i + 1)...(num_of_seqs) # # the next line uses inline for hamming_distance # sum = sum + temp.hamming_distance(sequences[j][offset, offset + window_size]) # end #end out.puts "#{offset + window_size / 2},#{sum.quo(num_of_seqs * (num_of_seqs - 1) / 2).to_f}" cache.save((offset + window_size / 2) - 1) # next two lines are optional, but might help if ruby locks up out.fsync # forces a write to the disk out.flush # flushes the ruby buffer end end cache.clear } end reports = results.inject { |total,report| total + report } [reports, (reports) / results.length] end
This paste will be private.
From the Design Piracy series on my blog: