Translations of this page:

Google Search Appliance Keyword Cloud

You can get a list of keywords and their popularity by asking for a report from the Google Search Appliance Administrative API. The report is in xml format. The results have to be parsed and turned into a cloud.

Authenticating with the Google Search Appliance

The following command demonstrates how to authenticate with the Google Search Appliance using the cURL utility. We grep the response for the line containing “Auth=” and store it in a shell variable for later use.

# Log in via ClientLogin and keep the "Auth=..." line of the response in TOKEN.
# --insecure makes curl skip TLS certificate verification.
$ TOKEN=$(curl --insecure -F "Email=myEmail" -F "Passwd=myPasswd" \
  https://myappliance:8443/accounts/ClientLogin 2>/dev/null | grep "Auth=")

Using cURL to Request a Search Report

<?xml version='1.0' encoding='UTF-8'?>
<entry xmlns='http://www.w3.org/2005/Atom' xmlns:gsa='http://schemas.google.com/gsa/2007'>
 <gsa:content name='reportName'>march2010</gsa:content>
 <gsa:content name='collectionName'>default_collection</gsa:content>
 <gsa:content name='reportDate'>month_3_2010</gsa:content>
 <gsa:content name='withResults'>true</gsa:content>
 <gsa:content name='topCount'>25</gsa:content>
</entry>

Note the report name: if a report with this name already exists, the appliance returns an error saying so. We would probably want to regenerate the keyword cloud on a daily or weekly basis. Assuming the XML above has been saved as search.xml, the following command sends the request to the GSA.

# POST the report definition stored in search.xml to the searchReport feed.
# -x sends the request through the proxy at localhost:8888.
curl -x localhost:8888 -H "Content-type:application/atom+xml" \
  -H "Authorization: GoogleLogin $TOKEN" \
  --data "@search.xml" http://YOURHOST:8000/feeds/searchReport/

We have to wait for the report to be generated before we can retrieve the results. This may take some time.

Using cURL to Retrieve a Search Report

Here we use cURL to retrieve a previously created report (named aaa in this example) for the collection default_collection. The report is addressed as reportName@collectionName.


curl -x localhost:8888 -H "Authorization: GoogleLogin $TOKEN" http://YOURHOST:8000/feeds/searchReport/aaa@default_collection

Here is an excerpt from the topQueries section of the results. We need to parse this with an XML parser:

<topQueries>
<topQuery query="whisky">115</topQuery>
 <topQuery query="bbc">83</topQuery>
 <topQuery query="star wars">55</topQuery>
 <topQuery query="video">39</topQuery>
 <topQuery query="videos">22</topQuery>
 <topQuery query="3d">10</topQuery>
 <topQuery query="surveillance">10</topQuery>
 <topQuery query="toyota">6</topQuery>
 <topQuery query="internet payants">2</topQuery>
 <topQuery query="voiture">2</topQuery>
 <topQuery query="cellulite">1</topQuery>
 <topQuery query="pimp my webcam">1</topQuery>
</topQueries>

Font size

Usually the font size of a word in a cloud is determined by its popularity. Given a range of font sizes from 8pt to 16pt the popularity of a word would have to be normalized to this scale. Linear, logarithmic, exponential etc. scales may be used.

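$$\mathrm{STEPPING} = \frac{\mathrm{WORD}_{max} - \mathrm{WORD}_{min}}{\mathrm{FONT}_{max} - \mathrm{FONT}_{min}}$$

$$\mathrm{FONTSIZE} = \mathrm{FONT}_{min} + \frac{\mathrm{COUNT} - \mathrm{WORD}_{min}}{\mathrm{STEPPING}}$$

===== Code =====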
 
<%@ tag language="java" pageEncoding="UTF-8" trimDirectiveWhitespaces="true"%>
 
<%@ tag import="java.io.IOException"%>
<%@ tag import="java.io.ByteArrayInputStream"%>
<%@ tag import="java.io.IOException"%>
<%@ tag import="java.io.InputStream"%>
<%@ tag import="java.net.MalformedURLException"%>
<%@ tag import="java.util.HashMap"%>
 
<%@ tag import="javax.xml.parsers.DocumentBuilder"%>
<%@ tag import="javax.xml.parsers.DocumentBuilderFactory"%>
<%@ tag import="javax.xml.parsers.ParserConfigurationException"%>
<%@ tag import="javax.xml.xpath.XPath"%>
<%@ tag import="javax.xml.xpath.XPathConstants"%>
<%@ tag import="javax.xml.xpath.XPathExpression"%>
<%@ tag import="javax.xml.xpath.XPathExpressionException"%>
<%@ tag import="javax.xml.xpath.XPathFactory"%>
 
<%@ tag import="org.w3c.dom.Document"%>
<%@ tag import="org.w3c.dom.NamedNodeMap"%>
<%@ tag import="org.w3c.dom.Node"%>
<%@ tag import="org.w3c.dom.NodeList"%>
<%@ tag import="org.xml.sax.SAXException"%>
 
<%@ tag import="com.google.enterprise.apis.client.GsaClient"%>
<%@ tag import="com.google.enterprise.apis.client.GsaEntry"%>
<%@ tag import="com.google.gdata.util.AuthenticationException"%>
<%@ tag import="com.google.gdata.util.ServiceException"%>
 
<%@ tag import="java.util.Map"%>
<%@ tag import="java.util.Iterator"%>
<%@ tag import="java.util.Set"%>
<%@ tag import="java.text.SimpleDateFormat"%>
<%@ tag import="java.util.Date"%>
 
 
<%@ attribute name="var" type="java.lang.String" required="false"%>
<%@ attribute name="collection" type="java.lang.String" required="true"%>
<%@ attribute name="period" type="java.lang.String" required="true"%>
<%@ attribute name="update" type="java.lang.Long" required="true"%>
<%@ attribute name="count" type="java.lang.String" required="true"%>
 
<%--
   This tag is a utility to retrieve keywords and search frequencies
   from the Google Search Appliance
   See http://mediasuisse.jira.com/wiki/display/CMS/date+tag
 --%>
 
<%!
// Admin API client; created by gsaLogin() on its first call and reused afterwards.
GsaClient myClient;
// Timestamp (System.currentTimeMillis) the tag body compares against to decide
// when the next refresh is due. Static, so shared by every instance of this tag.
static long lastUpdate = 0;
// Name of the search report on the appliance; the report is addressed as
// reportName + "@" + collection in the update/delete/retrieve calls.
private static String reportName = "tagCloud";
// True while a report has been requested but its result has not been picked up yet.
// NOTE(review): these statics are read and written without synchronization.
static boolean updateInProgress = false;
// Last output produced by the tag body; this is what gets printed or exported.
static String tagList = "";
 
 
/*
 * getDateRange
 *
 * @param       period  time in days for range (minimum period of GSA)
 * @returns     Google date range string for admin api e.g.: range_4_18_2010_4_19_2010
 */
private String getDateRange(String period) {
       long t = Long.parseLong(period);
       t = t * 24 * 60 * 60 * 1000;// conver to millis
       SimpleDateFormat simpleDateFormat = new SimpleDateFormat("M_d_yyyy");
       Date today = new Date();
       String to = simpleDateFormat.format(today);
       String from = simpleDateFormat.format(new Date(today.getTime() - t));
 
       return "range_" + from + "_" + to;
}
 
 
/**
 * Makes sure a GsaClient is available in {@code myClient}.
 * The client is built on the first call only; later calls return immediately
 * and their arguments are ignored.
 *
 * @param protocol  protocol passed to the GsaClient constructor
 * @param hostname  appliance host passed to the GsaClient constructor
 * @param port      admin port passed to the GsaClient constructor
 * @param username  admin user passed to the GsaClient constructor
 * @param password  admin password passed to the GsaClient constructor
 * @throws AuthenticationException
 *             in case of password problem
 */
void gsaLogin(String protocol, String hostname, int port, String username, String password) throws AuthenticationException {
       if (myClient != null) {
               // already connected, keep the existing client
               return;
       }
       myClient = new GsaClient(protocol, hostname, port, username, password);
}
 
 
/**
 * Asks the appliance to generate the keyword report by inserting a new
 * "searchReport" entry named {@code reportName}, with results included.
 *
 * @param       range   - date range report should cover (sent as reportDate)
 * @param       collection - GSA collection (sent as collectionName)
 * @param       count - number of results (sent as topCount)
 * @throws MalformedURLException
 * @throws ServiceException
 * @throws IOException
 */
private void createTagCloud(String range, String collection, String count) throws MalformedURLException, ServiceException, IOException {
       final GsaEntry reportRequest = new GsaEntry();

       // which report, on which collection
       reportRequest.addGsaContent("reportName", reportName);
       reportRequest.addGsaContent("collectionName", collection);

       // what the report should contain
       reportRequest.addGsaContent("reportDate", range);
       reportRequest.addGsaContent("withResults", "true");
       reportRequest.addGsaContent("topCount", count);

       myClient.insertEntry("searchReport", reportRequest);
}
 
 
/**
 * Changes the date range of the existing report {@code reportName@collection}.
 * The report may have been created by createTagCloud or directly in the
 * GSA Admin interface.
 *
 * @param       range   - new date range the report should cover
 * @param       collection - GSA collection the report belongs to
 * @throws MalformedURLException
 * @throws ServiceException
 * @throws IOException
 */
private void updateTagCloud(String range, String collection) throws MalformedURLException, ServiceException, IOException {
       final String reportId = reportName + "@" + collection;

       final GsaEntry changes = new GsaEntry();
       changes.addGsaContent("reportDate", range);

       myClient.updateEntry("searchReport", reportId, changes);
}
 
 
/**
 * Removes the report {@code reportName@collection} from the appliance.
 *
 * @param       collection - GSA collection the report belongs to
 * @throws MalformedURLException
 * @throws ServiceException
 * @throws IOException
 */
private void deleteTagCloud(String collection)
               throws MalformedURLException, ServiceException, IOException {
       final String reportId = reportName + "@" + collection;
       myClient.deleteEntry("searchReport", reportId);
}
 
/*
 * @param       scale   e.g. 0-9 - used for the CSS
 * @see http://code.google.com/intl/fr/apis/searchappliance/documentation/62/gdata/acapi_java.html
*/
private String retrieveTagCloud(int scale, String collection) throws MalformedURLException, ServiceException, IOException, ParserConfigurationException, XPathExpressionException, SAXException {
       StringBuffer tagCloud = new StringBuffer();
       GsaEntry entry = myClient.getEntry("searchReport", reportName + "@" + collection);
       String status = entry.getGsaContent("reportState");
       if ("2".equals(status) || "3".equals(status)) {
               DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
               domFactory.setNamespaceAware(true); // never forget this!
               DocumentBuilder builder = domFactory.newDocumentBuilder();
 
               InputStream is = new ByteArrayInputStream(entry.getGsaContent("reportContent").getBytes("UTF-8"));
               Document doc = builder.parse(is);
 
               XPathFactory factory = XPathFactory.newInstance();
               XPath xpath = factory.newXPath();
               XPathExpression expr = xpath.compile("//topQuery");
 
               Object result = expr.evaluate(doc, XPathConstants.NODESET);
               NodeList nodes = (NodeList) result;
 
               int max = 0;
               int min = 0;
               HashMap<String, Integer> keyWords = new HashMap<String, Integer>();
               for (int i = 0; i < nodes.getLength(); i++) {
                       int numSearches = Integer.parseInt(nodes.item(i).getTextContent());
                       if (numSearches > max) {
                               max = numSearches;
                       }
                       NamedNodeMap attributes = nodes.item(i).getAttributes();
                       Node n = attributes.getNamedItem("query");
                       String keyWord = n.getNodeValue();
                       keyWords.put(keyWord, numSearches);
               }// for
 
               // should really be done in page, but we need the max and min
               Set entries = keyWords.entrySet();
               Iterator it = entries.iterator();
               while (it.hasNext()) {
                       Map.Entry<String, Integer> myEntry = (Map.Entry) it.next();
                       tagCloud.append("<a href=\"/services/recherche/?q=" + myEntry.getKey() + "\" class=\"tag"
                                       + (((myEntry.getValue() - min) * scale) / max)
                                       + "\">" + myEntry.getKey() + "</a> ");
               }
       }
       return tagCloud.toString();
}
 
 
/**
 * Tells whether an optional string was left out.
 *
 * @param value string to inspect, may be null
 * @return true for null and for the zero-length string, false otherwise
 */
private boolean undefined(String value) {
       if (value == null) {
               return true;
       }
       return "".equals(value);
}
 
 
/**
 * Delivers the tag's result: stored as a JSP context attribute when a
 * variable name was given, printed to the tag's output otherwise.
 *
 * @param result value to deliver
 * @param var    attribute name to store the result under; null or "" means print
 * @throws IOException if printing to the JSP writer fails
 */
private void doReturnResult(Object result, String var) throws IOException {
       boolean exportAsAttribute = !undefined(var);
       if (exportAsAttribute) {
               super.getJspContext().setAttribute(var, result);
               return;
       }
       getJspContext().getOut().print(result);
}
%>
 
<%
       // convert to milliseconds
       long updatePeriod = this.update.longValue() * 1000;
       long time = System.currentTimeMillis();
       String range = getDateRange(this.period);
 
   /*
    * add the protocol, GSA hostname, port, username and password
    */
       gsaLogin("http", "xxx.xxx.xxx.xxx", 8000, "xxxxxxxx", "xxxxxxxx");
 
       /*
        * Update tagCloud every this.update seconds
    */
       if (updateInProgress == true) {
       try {
                       tagList = retrieveTagCloud(9, this.collection);
                       lastUpdate = time;
                       updateInProgress = false;
       } catch (ServiceException se) {
                       tagList = "updateInProgress " + time;
               }
       } else if (time > (lastUpdate + updatePeriod)) {
               updateInProgress = true;
               try {
                       deleteTagCloud(this.collection);
       } catch (ServiceException se) {
                }
               createTagCloud(range, this.collection, this.count);
   }
 
       doReturnResult(tagList, this.var);
%>
===== Further Information ===== http://code.google.com/apis/searchappliance/documentation/62/gdata/acapi_protocol.html
tech/search/gsa-tag-cloud.txt · Last modified: 2010/06/17 15:47 by davidof
Recent changes RSS feed