#!/usr/bin/ruby -w
# -*- ruby -*-

# Heuristics for deciding whether a file on disk holds text or binary data.
#
# The decision is made from the file's extension when it is a well-known one,
# and otherwise from a sample of the first TEST_LENGTH bytes of the file.
module FileTester

  # the fraction (0.0 - 1.0) of sampled bytes that may be non-ASCII before the
  # file is considered binary
  ODD_FACTOR = 0.3

  # how many bytes of a file we test
  TEST_LENGTH = 1024

  # extensions associated with files that are always text:
  KNOWN_TEXT = %w{ txt c cpp mk h hpp html java }

  # extensions associated with files that are never text:
  KNOWN_NONTEXT = %w{ a o obj class elc gif gz jar jpg jpeg png pdf tar Z }

  # Returns whether the given file looks like text.
  #
  # file:: path of the file to examine (a String).
  #
  # Returns false for paths that do not exist and for directories. A file
  # whose extension is listed in KNOWN_TEXT or KNOWN_NONTEXT is classified by
  # the extension alone, without reading it. Otherwise up to TEST_LENGTH bytes
  # are sampled: any NUL byte means binary, and the file is text only when the
  # share of non-ASCII bytes is below ODD_FACTOR. An empty file has nothing to
  # sample and is reported as not text.
  #
  # Errors raised while opening or reading the file (for example
  # Errno::EACCES) are not rescued here.
  def self.text?(file)
    # Don't waste our time if it doesn't even exist, or cannot be read as a
    # stream of bytes:
    return false unless File.exist?(file)
    return false if File.directory?(file)

    suffix = file[/\.(\w+)\s*$/, 1]
    if suffix
      return true  if KNOWN_TEXT.include?(suffix)
      return false if KNOWN_NONTEXT.include?(suffix)
    end

    # Binary mode avoids newline/encoding translation, the block form closes
    # the handle, and read returns nil at EOF (empty file), hence to_s.
    sample = File.open(file, "rb") { |f| f.read(TEST_LENGTH) }.to_s
    bytes = sample.bytes

    # never allow null in a text file
    return false if bytes.include?(0)

    nodd = bytes.count { |byte| !ascii?(byte) }
    summary(nodd, bytes.size)
  end

  # Returns whether nodd odd (non-ASCII) bytes out of ntested sampled bytes is
  # few enough for text, i.e. strictly below the ODD_FACTOR share. With
  # ntested == 0 this is always false.
  def self.summary(nodd, ntested)
    nodd < ntested * ODD_FACTOR
  end

  # Returns whether the given character is ASCII (fits in 7 bits).
  #
  # c:: an Integer byte/character code, or a String, in which case its first
  #     byte is tested (an empty String counts as code 0).
  def self.ascii?(c)
    # String#to_i would parse digits rather than yield the character code, so
    # strings are examined by their first byte instead.
    code = c.is_a?(String) ? c.getbyte(0).to_i : c.to_i

    # from ctype.h
    (code & ~0x7f) == 0
  end

end


# When run as a script, classify every path given on the command line and
# print one "<path>: text" or "<path>: binary" line per argument.
if $0 == __FILE__
  ARGV.each do |path|
    label = FileTester.text?(path) ? "text" : "binary"
    puts "#{path}: #{label}"
  end
end

