# Ruby/Google - a high level interface to the Google Web API
#
# $Id: google.rb,v 1.37 2006/02/08 00:28:18 ianmacd Exp $
# 
# Version : 0.6.0
# Author  : Ian Macdonald <ian@caliban.org>
#
# Copyright (C) 2002-2006 Ian Macdonald
#
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2, or (at your option)
#   any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software Foundation,
#   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

=begin

= NAME
Ruby/Google
= SYNOPSIS
  require 'google'

  KEY = File.open("#{ENV['HOME']}/.google_key") {|kf| kf.readline.chomp}

  query = ARGV.shift || 'ruby programming language'
  google = Google::Search.new(KEY)
  google.utf8('iso-8859-15')

  i = 0
  q = google.search(query)
  q.resultElements.each do |result|
    printf "\nResult # %d\n\n", i += 1
    result.each do |key|
      printf("%s = %s\n", key, result.send(key))
    end
  end

  puts '---------------------------------'
  i = 0
  q.directoryCategories.each do |category|
    printf "\nCategory # %d\n\n", i += 1
    category.each do |key|
      printf("%s = %s\n", key, category.send(key))
    end
  end

  printf "Estimated number of results is %d.\n", q.estimatedTotalResultsCount
  printf "Your query took %6f seconds.\n", q.searchTime

= DESCRIPTION
Ruby/Google allows you to programmatically query the
((<Google search-engine|URL:http://www.google.com/>)). It is currently in the
alpha stage and the interface is liable to change at any time.
= CLASS METHODS
--- Search.new(key)
    This constructs a new Google::Search object. The key parameter is the key
    that Google assigned to you when you registered for the Web API download.
    If you don't yet have a key, go to
    ((<Google|URL:https://www.google.com/accounts/NewAccount?continue=http://api.google.com/createkey&followup=http://api.google.com/createkey>))
    and obtain one.
--- Search.query_length_ok?(query)
    This checks whether the length of ((|query|)) does not exceed
    ((|MAX_QUERY_LENGTH|)) characters. It returns either ((*true*)) or
    ((*false*)).
--- Search.query_words_ok?(query)
    This checks whether the number of words in ((|query|)) does not exceed
    ((|MAX_QUERY_WORDS|)). It returns either ((*true*)) or ((*false*)).
--- Search.query_sites_ok?(query)
    This checks whether the number of ((*site:*)) restrict terms in
    ((|query|)) does not exceed ((|MAX_SITE_RESTRICTS|)). It returns either
    ((*true*)) or ((*false*)).
== -
The above 3 methods check for compliance with the limitations of the Google
Web API, as defined in section 2.7 of the APIs_Reference.html file that came
with your Google API archive.
--- Search.query_ok?(query)
    This encapsulates the above 3 methods and can be used when one wishes to
    know if ((|query|)) is bad, but not necessarily why it is bad. It returns
    either ((*true*)) or ((*false*)).
--- Search.restrict(type, *data)
    This assembles a query term, based on the restrict ((|type|)) and its
    parameter(s), passed as ((|*data|)). A full list of query terms is
    given in section 2.2 of APIs_Reference.html.
* If ((|type|)) is ((*phrase*)), a double-quoted copy of each string passed as ((|*data|)) is returned.
*   If ((|type|)) is ((*daterange*)), the first three parameters of ((|*data|)) must be the year, month, and day of a start date. The next three parameters, if given, form the year, month, and day of an end date.  If these last three parameters are not given, today's date will be substituted.
*   Other supported restrict types are ((*site*)), ((*intitle*)), ((*allintitle*)), ((*inurl*)), ((*allinurl*)), ((*allintext*)), ((*allinlinks*)), ((*filetype*)), ((*notfiletype*)), ((*info*)), ((*link*)), ((*related*)), ((*cache*)), ((*include*)) and ((*exclude*)). Some of these names differ slightly from those given in section 2.2 of APIs_Reference.html in order to simplify their use and memorability.
= INSTANCE METHODS
--- Search#utf8(source)
    If ((|source|)) is not ((*nil*)), subsequent invocations of Search#search
    and Search#spell will convert their first argument from encoding
    ((|source|)) to UTF-8 prior to sending the request to Google.

    You should use this if your query string is not already UTF-8 and contains
    8 bit characters. Otherwise, an XSD::ValueSpaceError exception may be
    thrown.
--- Search#search(query, start, max, filter, restrict, safe, lr, ie, oe)
    This performs a standard Google query search. Only the ((|query|))
    parameter is mandatory.

    The meaning of the other parameters can be obtained from section 2.1 of
    APIs_Reference.html, although the ((|ie|)) and ((|oe|)) parameters are
    now deprecated and should not be used. A warning will be issued if Ruby
    is run in verbose mode and either of these parameters is used.

    This method returns a Struct::Response object, the members of which are
    described in section 3.1 of APIs_Reference.html.

    The ((|resultElements|)) member is an Array of Struct::ResultElement
    objects. Members of the Struct::ResultElement object are described in
    section 3.2 of APIs_Reference.html. Note that the ((*URL*)) parameter is
    actually represented by the ((|url|)) member, since Ruby does not allow
    a variable name to begin with a capital letter.

    The ((|directoryCategories|)) member is an Array of
    Struct::DirectoryCategory. Members of the Struct::DirectoryCategory object
    are described in section 3.3 of APIs_Reference.html.
--- Search#spell(phrase)
    This performs a Google spell-check on ((|phrase|)). If Google has a
    spelling suggestion to make, a String is returned. Otherwise, ((*nil*)) is
    returned.
--- Search#cache(url)
    This attempts to retrieve a copy of the page corresponding to ((|url|))
    from Google's cache. If Google has not cached the URL in question, a page
    containing a message to this effect will be returned instead.

    This method always returns a String.
= ENVIRONMENT
: HTTP_PROXY or http_proxy
  If this is defined, the named system will be used as an HTTP proxy.
= AUTHOR
Written by Ian Macdonald <ian@caliban.org>
= COPYRIGHT
  Copyright (C) 2002-2006 Ian Macdonald

  This is free software; see the source for copying conditions.
  There is NO warranty; not even for MERCHANTABILITY or FITNESS
  FOR A PARTICULAR PURPOSE.
= SEE ALSO
((<"Ruby/Google home page - http://www.caliban.org/ruby/"|URL:http://www.caliban.org/ruby/>))
((<"Google Web APIs - http://www.google.com/apis/"|URL:http://www.google.com/apis/>))
= BUGS
Send all bug reports, enhancement requests and patches to the author.
= HISTORY
$Id: google.rb,v 1.37 2006/02/08 00:28:18 ianmacd Exp $
=end

require 'date'
require 'soap/wsdlDriver'

module Google
  # Release number of this library.
  VERSION = '0.6.0'

  # Location of the WSDL document describing the Google search service.
  WSDL = 'http://api.google.com/GoogleSearch.wsdl'

  # Result of one search request. Because a class name is passed to
  # Struct.new, the class is also reachable as Struct::Response.
  Response = Struct.new(
    'Response',
    :directoryCategories,
    :documentFiltering,
    :endIndex,
    :estimatedTotalResultsCount,
    :estimateIsExact,
    :searchComments,
    :searchQuery,
    :searchTime,
    :searchTips,
    :startIndex,
    :resultElements
  )

  # A single hit within a Response (also Struct::ResultElement).
  ResultElement = Struct.new(
    'ResultElement',
    :url,
    :snippet,
    :title,
    :cachedSize,
    :relatedInformationPresent,
    :directoryTitle,
    :summary,
    :hostName,
    :fullViewableName,
    :specialEncoding
  )

  # A directory category within a Response (also Struct::DirectoryCategory).
  DirectoryCategory = Struct.new(
    'DirectoryCategory',
    :fullViewableName,
    :specialEncoding
  )
  
  # Mixin for Struct-like classes: makes #each walk the receiver's member
  # names (as returned by #members) rather than its values.
  module Each
    # Passes every member name of the receiver to the given block, in
    # declaration order, and returns whatever members.each returns.
    def each(&visitor)
      member_names = members
      member_names.each(&visitor)
    end
  end

  # Raised by Search.restrict when the arguments of a 'daterange' restrict
  # are unusable, e.g. when the start date is missing.
  class DateRangeError < RuntimeError
  end
  
  # High-level client for the Google Web API.
  #
  # The class methods validate and assemble query strings. The instance
  # methods send search, spelling and cached-page requests through a SOAP
  # driver that is built once from WSDL and shared by all instances.
  class Search
    # Query limits checked by the query_*_ok? methods.
    MAX_QUERY_LENGTH = 2048
    MAX_QUERY_WORDS = 10
    MAX_SITE_RESTRICTS = 1

    # Maps each supported restrict type to the prefix that is placed in
    # front of every term of that restrict.
    RESTRICTS = {
      'site'        => 'site:',
      'intitle'     => 'intitle:',
      'allintitle'  => 'allintitle:',
      'inurl'       => 'inurl:',
      'allinurl'    => 'allinurl:',
      'allintext'   => 'allintext:',
      'allinlinks'  => 'allinlinks:',
      'filetype'    => 'filetype:',
      'notfiletype' => '-filetype:',
      'info'        => 'info:',
      'link'        => 'link:',
      'related'     => 'related:',
      'cache'       => 'cache:',
      'include'     => '+',
      'exclude'     => '-'
    }

    # Members of the SOAP result that #search copies verbatim into the
    # Response. directoryCategories and resultElements are wrapped separately.
    RESPONSE_FIELDS = %w(documentFiltering endIndex estimatedTotalResultsCount
                         estimateIsExact searchComments searchQuery searchTime
                         searchTips startIndex).freeze

    # Matches a byte string that consists solely of well-formed UTF-8
    # sequences. Taken from
    # http://www.w3.org/International/questions/qa-forms-utf-8
    #
    # \A and \z anchor the whole string; Ruby's ^ and $ are line anchors and
    # would accept a multi-line string as soon as one line is clean. The /n
    # option makes this a byte-oriented pattern.
    UTF8_BYTES = /\A(?:
        [\x09\x0A\x0D\x20-\x7E]            # ASCII
      | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
      |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
      | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
      |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
      |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
      | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
      |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
    )*\z/nx

    # Returns true if +q+ is at most MAX_QUERY_LENGTH characters long.
    def Search.query_length_ok?(q)
      q.length <= MAX_QUERY_LENGTH
    end

    # Returns true if +q+ contains at most MAX_QUERY_WORDS words (runs of
    # word characters).
    def Search.query_words_ok?(q)
      q.scan(/\w+/).length <= MAX_QUERY_WORDS
    end

    # Returns true if +q+ contains at most MAX_SITE_RESTRICTS occurrences of
    # the site restrict prefix (matched case-insensitively).
    def Search.query_sites_ok?(q)
      q.scan(/#{RESTRICTS['site']}/i).length <= MAX_SITE_RESTRICTS
    end

    # Returns true only if +q+ passes the length, word and site checks.
    def Search.query_ok?(q)
      query_length_ok?(q) && query_words_ok?(q) && query_sites_ok?(q)
    end

    # Assembles a query term from a restrict +type+ and its +data+.
    #
    # type:: restrict name (String or Symbol, any case). 'phrase' and
    #        'daterange' are handled specially; every other name must be a
    #        key of RESTRICTS.
    # data:: for 'phrase', the strings to double-quote; for 'daterange',
    #        year, month, day of the start date, optionally followed by
    #        year, month, day of the end date (today is used when the end
    #        date is absent or incomplete); otherwise the terms to prefix.
    #
    # Returns the term as a String padded with spaces, or '' for an unknown
    # type (a warning is printed to $stderr when $VERBOSE is set).
    # Raises DateRangeError if the start date is missing or a date is invalid.
    def Search.restrict(type, *data)
      # Work on a lower-cased copy: the caller's object must not be modified
      # and may well be frozen or a Symbol.
      type = type.to_s.downcase

      case type
      when 'phrase'
        # double-quote each phrase
        data.collect { |x| ' "%s" ' % x }.join
      when 'daterange'
        raise DateRangeError, "missing start date" if data[2].nil?

        start_jd = julian_day(data[0..2])
        # incomplete or absent end date: use today instead
        end_jd = data[5].nil? ? Date.today.jd : julian_day(data[3..5])
        ' daterange:%s-%s ' % [start_jd, end_jd]
      else
        prefix = RESTRICTS[type]
        if prefix
          # put the restrict prefix in front of every whitespace-separated term
          terms = data.join(' ').gsub(/\S+/) { |term| prefix + term }
          return ' %s ' % terms
        end

        if $VERBOSE
          $stderr.puts "Warning: ignoring undefined restrict type '#{type}'."
        end

        # unknown restrict type: return null string
        ''
      end
    end

    # Converts a [year, month, day] triple to a Julian day number.
    # Raises DateRangeError if the triple does not form a valid date.
    def Search.julian_day(civil)
      Date.civil(*civil).jd
    rescue ArgumentError, TypeError
      raise DateRangeError, "invalid date: #{civil.join('-')}"
    end
    private_class_method :julian_day

    # Creates a search object that authenticates with the API key +k+.
    # The SOAP driver is created on first use and kept in a class variable,
    # so later instances reuse it.
    def initialize(k)
      unless defined?(@@soap)
        @@soap = SOAP::WSDLDriverFactory.new(WSDL).create_rpc_driver
      end
      @key = k
      @src_encoding = nil
    end

    # Declares that strings passed to #search and #spell are encoded as
    # +src+ and must be converted to UTF-8 first. Passing nil switches the
    # conversion off. Returns self.
    def utf8(src=nil)
      # Iconv is only needed when String cannot transcode itself.
      require 'iconv' if src && !''.respond_to?(:encode)
      @src_encoding = src
      self
    end

    # Returns +str+ unchanged if its bytes already form valid UTF-8;
    # otherwise returns a copy converted from @src_encoding to UTF-8. This
    # avoids a potential XSD::ValueSpaceError on accented characters, etc.
    def to_utf8(str)
      # Compare raw bytes so that the check does not depend on the encoding
      # the string happens to be tagged with.
      bytes = if str.respond_to?(:force_encoding)
                str.dup.force_encoding('BINARY')
              else
                str
              end
      return str if bytes =~ UTF8_BYTES

      if str.respond_to?(:encode)
        str.encode('UTF-8', @src_encoding)
      else
        Iconv.new('utf-8', @src_encoding).iconv(str)
      end
    end
    private :to_utf8

    # Performs a Google search for +query+ and returns a Response whose
    # directoryCategories and resultElements members hold DirectoryCategory
    # and ResultElement objects. All parameters after +query+ are passed
    # through to doGoogleSearch unchanged.
    #
    # +ie+ and +oe+ are deprecated; a warning is printed to $stderr when
    # $VERBOSE is set and either of them is given a non-empty value.
    def search(query, start=0, max=10, filter=false, restrict='', safe=false,
               lr='', ie='', oe='')

      query = to_utf8(query) if @src_encoding

      # The defaults are empty strings, which are truthy in Ruby, so test
      # for actual content rather than for truthiness.
      deprecated_used = [ie, oe].any? { |enc| enc && !enc.to_s.empty? }
      if $VERBOSE && deprecated_used
        $stderr.puts "Use of 'ie' or 'oe' parameters to Google::Search " +
                     "is deprecated."
      end

      r = @@soap.doGoogleSearch(@key, query, start, max, filter, restrict,
                                safe, lr, ie, oe)
      response = Response.new
      RESPONSE_FIELDS.each { |field| response[field] = r.send(field) }

      response.directoryCategories = r.directoryCategories.collect { |e|
        DirectoryCategory.new(e)
      }

      response.resultElements = r.resultElements.collect { |e|
        ResultElement.new(e)
      }

      response
    end

    # Asks Google for a spelling suggestion for +phrase+. Returns the
    # suggestion if the service answered with a String, otherwise nil.
    def spell(phrase)
      phrase = to_utf8(phrase) if @src_encoding

      r = @@soap.doSpellingSuggestion(@key, phrase)
      r.is_a?(String) ? r : nil
    end

    # Returns whatever doGetCachedPage yields for +url+.
    def cache(url)
      @@soap.doGetCachedPage(@key, url)
    end
  end
  
  # Reopens the ResultElement Struct so that an instance can be built
  # directly from one element of a raw search result.
  class ResultElement
    include Each

    # Copies the fields of +element+ into the members of this struct.
    #
    # element:: raw result element; it is read through ['URL'] and through
    #           the accessor methods named after the members, plus
    #           directoryCategory.
    #
    # url, snippet, title and relatedInformationPresent are always copied.
    # Every other member is copied only when the raw value is a String and
    # is left nil otherwise (presumably the SOAP layer yields a non-String
    # placeholder for empty elements -- TODO confirm).
    def initialize(element)
      self.url = element['URL']
      self.snippet = element.snippet
      self.title = element.title
      self.relatedInformationPresent = element.relatedInformationPresent

      %w(cachedSize directoryTitle summary hostName).each do |member|
        value = element.send(member)
        self[member] = value if value.class == String
      end

      category = element.directoryCategory

      if category.fullViewableName.class == String
        self.fullViewableName = category.fullViewableName
      end

      # Compare the value's class: comparing the value itself with String
      # is never true, which left specialEncoding permanently nil.
      if category.specialEncoding.class == String
        self.specialEncoding = category.specialEncoding
      end
    end
  end
  
  # Reopens the DirectoryCategory Struct so that an instance can be built
  # directly from one raw directory category of a search result.
  class DirectoryCategory
    include Each

    # Copies fullViewableName and specialEncoding from +element+. Each
    # member is copied only when the raw value is a String and is left nil
    # otherwise.
    def initialize(element)
      if element.fullViewableName.class == String
        self.fullViewableName = element.fullViewableName
      end

      # Compare the value's class: comparing the value itself with String
      # is never true, which left specialEncoding permanently nil.
      if element.specialEncoding.class == String
        self.specialEncoding = element.specialEncoding
      end
    end
  end
  
end
