From: <Saved by Microsoft Internet Explorer 5>
Subject: Textual and chemical information processing: different domains but similar algorithms
Date: Fri, 15 Oct 2004 17:11:10 +0100
MIME-Version: 1.0
Content-Type: multipart/related;
	boundary="----=_NextPart_000_0000_01C4B2D9.F7DD9D80";
	type="text/html"
X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2800.1441

This is a multi-part message in MIME format.

------=_NextPart_000_0000_01C4B2D9.F7DD9D80
Content-Type: text/html;
	charset="Windows-1252"
Content-Transfer-Encoding: quoted-printable
Content-Location: http://informationr.net/ir/5-2/paper69.html

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML><HEAD><TITLE>Textual and chemical information processing: =
different domains but similar algorithms</TITLE><LINK=20
href=3D"http://informationr.net/ir/IRstyle.css" rel=3Dstylesheet>
<META http-equiv=3DContent-Type content=3D"text/html; =
charset=3Dwindows-1252">
<META content=3D"MSHTML 6.00.2800.1458" name=3DGENERATOR><LINK =
rev=3Dmade=20
href=3D"mailto:t.d.wilson@shef.ac.uk">
<META=20
content=3D"information retrieval, chemical structures, text retrieval, =
screening systems, weighting, biological activity, textual retrieval, =
text databases"=20
name=3Dkeywords>
<META=20
content=3D"This paper discusses the extent to which algorithms developed =
for the processing of textual databases are also applicable to the =
processing of chemical structure databases, and vice =
versa.  Applications discussed include: an algorithm for =
distribution sorting that has been applied to the design of screening =
systems for rapid chemical substructure searching; the use of measures =
of inter-molecular structural similarity for the analysis of hypertext =
graphs; a genetic algorithm for calculating term weights for relevance =
feedback searching for determining whether a molecule is likely to =
exhibit biological activity; and the use of data fusion to combine the =
results of different chemical similarity searches."=20
name=3Ddescription>
<META content=3DMature name=3Drating>
<META content=3DDocument name=3DVW96.objecttype>
<META content=3DALL name=3DROBOTS>
<META=20
content=3D"Textual and chemical information processing: different =
domains but similar algorithms"=20
name=3DDC.Title>
<META content=3D"Peter Willett" name=3DDC.Creator>
<META=20
content=3D"information retrieval, chemical structures, text retrieval, =
screening systems, weighting, biological activity, textual retrieval, =
text databases"=20
name=3DDC.Subject>
<META=20
content=3D"This paper discusses the extent to which algorithms developed =
for the processing of textual databases are also applicable to the =
processing of chemical structure databases, and vice =
versa.  Applications discussed include: an algorithm for =
distribution sorting that has been applied to the design of screening =
systems for rapid chemical substructure searching; the use of measures =
of inter-molecular structural similarity for the analysis of hypertext =
graphs; a genetic algorithm for calculating term weights for relevance =
feedback searching for determining whether a molecule is likely to =
exhibit biological activity; and the use of data fusion to combine the =
results of different chemical similarity searches."=20
name=3DDC.Description>
<META content=3D"Information Research" name=3DDC.Publisher>
<META content=3DGlobal name=3DDC.Coverage.PlaceName>
<SCRIPT language=3DJavaScript type=3Dtext/JavaScript>
<!--
// Creates an Image object for each argument and assigns the argument
// to its src. Arguments whose text starts with "#" are skipped.
// Does nothing when document.images is not available.
function MM_preloadImages() { //v3.0
  // d.MM_p holds the Image objects; it is created on first use.
  var d=3Ddocument; if(d.images){ if(!d.MM_p) d.MM_p=3Dnew Array();
    // j starts at the current list length, so repeated calls append.
    var i,j=3Dd.MM_p.length,a=3DMM_preloadImages.arguments; for(i=3D0; =
i<a.length; i++)
    if (a[i].indexOf("#")!=3D0){ d.MM_p[j]=3Dnew Image; =
d.MM_p[j++].src=3Da[i];}}
}

// Looks up an object called n and returns it, or a falsy value when
// nothing is found.
//   n - name to look up; may carry a "?frame" suffix (see below)
//   d - document to search; defaults to the current document
function MM_findObj(n, d) { //v4.01
  // A "name?frame" value (with "?" past position 0) switches the
  // search to that frame's document, provided parent has frames.
  var p,i,x;  if(!d) d=3Ddocument; =
if((p=3Dn.indexOf("?"))>0&&parent.frames.length) {
    d=3Dparent.frames[n.substring(p+1)].document; n=3Dn.substring(0,p);}
  // Lookup order: d[n], then d.all[n], then each form in d.forms.
  if(!(x=3Dd[n])&&d.all) x=3Dd.all[n]; for =
(i=3D0;!x&&i<d.forms.length;i++) x=3Dd.forms[i][n];
  // Still not found: recurse into the document of each d.layers entry.
  for(i=3D0;!x&&d.layers&&i<d.layers.length;i++) =
x=3DMM_findObj(n,d.layers[i].document);
  // Last resort: getElementById, when the document provides it.
  if(!x && d.getElementById) x=3Dd.getElementById(n); return x;
}

// Swaps image sources for a group of images in response to an event.
//   event   - one of "init", "over", "out" or "down"
//   grpName - key under which the group array is kept on document
// Further arguments are read from MM_nbGroup.arguments; their layout
// depends on the event (described in each branch). Each image keeps
// its sources in the properties MM_up and MM_dn. Images are located
// by name through MM_findObj.
function MM_nbGroup(event, grpName) { //v6.0
  var i,img,nbArr,args=3DMM_nbGroup.arguments;
  if (event =3D=3D "init" && args.length > 2) {
    // args[2] names the image, args[3] becomes its MM_up source and
    // its current src becomes MM_dn. Runs once per image (MM_init).
    if ((img =3D MM_findObj(args[2])) !=3D null && !img.MM_init) {
      img.MM_init =3D true; img.MM_up =3D args[3]; img.MM_dn =3D =
img.src;
      // Create the group array on document when it does not exist yet.
      if ((nbArr =3D document[grpName]) =3D=3D null) nbArr =3D =
document[grpName] =3D new Array();
      nbArr[nbArr.length] =3D img;
      // Remaining arguments from index 4 are (name, source) pairs;
      // each source is applied as both src and MM_dn.
      for (i=3D4; i < args.length-1; i+=3D2) if ((img =3D =
MM_findObj(args[i])) !=3D null) {
        if (!img.MM_up) img.MM_up =3D img.src;
        img.src =3D img.MM_dn =3D args[i+1];
        nbArr[nbArr.length] =3D img;
    } }
  } else if (event =3D=3D "over") {
    // Arguments from index 1 are triples: name, source, and a second
    // source used only when the image has MM_dn set. Falls back to
    // MM_up when the chosen source is empty. The changed images are
    // remembered in document.MM_nbOver for the "out" event.
    document.MM_nbOver =3D nbArr =3D new Array();
    for (i=3D1; i < args.length-1; i+=3D3) if ((img =3D =
MM_findObj(args[i])) !=3D null) {
      if (!img.MM_up) img.MM_up =3D img.src;
      img.src =3D (img.MM_dn && args[i+2]) ? args[i+2] : ((args[i+1])? =
args[i+1] : img.MM_up);
      nbArr[nbArr.length] =3D img;
    }
  } else if (event =3D=3D "out" ) {
    // Restores every image in document.MM_nbOver to MM_dn when set,
    // otherwise to MM_up.
    // NOTE(review): document.MM_nbOver is read without an existence
    // check; looks like "out" before any "over" would throw - confirm.
    for (i=3D0; i < document.MM_nbOver.length; i++) {
      img =3D document.MM_nbOver[i]; img.src =3D (img.MM_dn) ? img.MM_dn =
: img.MM_up; }
  } else if (event =3D=3D "down") {
    // Reset the images of the existing group to MM_up and clear MM_dn.
    nbArr =3D document[grpName];
    if (nbArr)
      for (i=3D0; i < nbArr.length; i++) { img=3DnbArr[i]; img.src =3D =
img.MM_up; img.MM_dn =3D 0; }
    // Start a fresh group. Arguments from index 2 are (name, source)
    // pairs; an empty source falls back to the image's MM_up.
    document[grpName] =3D nbArr =3D new Array();
    for (i=3D2; i < args.length-1; i+=3D2) if ((img =3D =
MM_findObj(args[i])) !=3D null) {
      if (!img.MM_up) img.MM_up =3D img.src;
      img.src =3D img.MM_dn =3D (args[i+1])? args[i+1] : img.MM_up;
      nbArr[nbArr.length] =3D img;
  } }
}
//-->
</SCRIPT>
</HEAD>
<BODY bgColor=3D#ffffff=20
onload=3D"MM_preloadImages('../figs/iauthori1.gif','../figs/isubji1.gif',=
'../figs/isearch1.gif','../figs/ihome1.gif','../figs/contents1.gif')">
<TABLE cellSpacing=3D0 cellPadding=3D0 align=3Dcenter border=3D0>
  <TBODY>
  <TR>
    <TD align=3Dmiddle colSpan=3D5 height=3D30>
      <H4>Information Research, Vol. 5 No. 2, January =
2000</H4></TD></TR>
  <TR>
    <TD><A=20
      =
onmouseover=3D"MM_nbGroup('over','contents','../figs/contents1.gif','',1)=
"=20
      onclick=3D"MM_nbGroup('down','group1','contents','',1)"=20
      onmouseout=3D"MM_nbGroup('out')"=20
      href=3D"http://informationr.net/ir/5-2/infres52.html" =
target=3D_top><IMG=20
      alt=3D"" src=3D"http://informationr.net/ir/figs/contents.gif" =
onload=3D""=20
      border=3D0 name=3Dcontents></A></TD>
    <TD><A=20
      =
onmouseover=3D"MM_nbGroup('over','authorindex','../figs/iauthori1.gif',''=
,1)"=20
      onclick=3D"MM_nbGroup('down','group1','authorindex','',1)"=20
      onmouseout=3D"MM_nbGroup('out')"=20
      href=3D"http://informationr.net/ir/iraindex.html" =
target=3D_top><IMG height=3D20=20
      alt=3D"" src=3D"http://informationr.net/ir/figs/iauthori.gif" =
width=3D120=20
      onload=3D"" border=3D0 name=3Dauthorindex></A></TD>
    <TD><A=20
      =
onmouseover=3D"MM_nbGroup('over','subjindex','../figs/isubji1.gif','',1)"=
=20
      onclick=3D"MM_nbGroup('down','group1','subjindex','',1)"=20
      onmouseout=3D"MM_nbGroup('out')"=20
      href=3D"http://informationr.net/ir/irsindex.html" =
target=3D_top><IMG height=3D20=20
      alt=3D"" src=3D"http://informationr.net/ir/figs/isubji.gif" =
width=3D120=20
      onload=3D"" border=3D0 name=3Dsubjindex></A></TD>
    <TD><A=20
      =
onmouseover=3D"MM_nbGroup('over','search','../figs/isearch1.gif','',1)"=20
      onclick=3D"MM_nbGroup('down','group1','search','',1)"=20
      onmouseout=3D"MM_nbGroup('out')"=20
      href=3D"http://informationr.net/ir/search.html" target=3D_top><IMG =
height=3D20=20
      alt=3D"" src=3D"http://informationr.net/ir/figs/isearch.gif" =
width=3D120=20
      onload=3D"" border=3D0 name=3Dsearch></A></TD>
    <TD><A =
onmouseover=3D"MM_nbGroup('over','home','../figs/ihome1.gif','',1)"=20
      onclick=3D"MM_nbGroup('down','group1','home','',1)"=20
      onmouseout=3D"MM_nbGroup('out')" =
href=3D"http://informationr.net/ir/"=20
      target=3D_top><IMG alt=3D"" =
src=3D"http://informationr.net/ir/figs/ihome.gif"=20
      onload=3D"" border=3D0 name=3Dhome></A></TD></TR></TBODY></TABLE>
<HR color=3D#ff00ff SIZE=3D3>

<H1>Textual and chemical information processing: different domains but =
similar=20
algorithms</H1><BR>
<H4><A href=3D"mailto:p.willett@sheffield.ac.uk">Peter =
Willett</A><BR>Department=20
of Information Studies and <BR>Krebs Institute for Biomolecular=20
Research<BR>University of Sheffield<BR>Sheffield S10 2TN, =
UK</H4><BR><BR>
<DIV>Abstract</DIV>
<BLOCKQUOTE>This paper discusses the extent to which algorithms =
developed for=20
  the processing of textual databases are also applicable to the =
processing of=20
  chemical structure databases, and <I>vice</I> <I>versa</I>. =
Applications=20
  discussed include: an algorithm for distribution sorting that has been =
applied=20
  to the design of screening systems for rapid chemical substructure =
searching;=20
  the use of measures of inter-molecular structural similarity for the =
analysis=20
  of hypertext graphs; a genetic algorithm for calculating term weights =
for=20
  relevance feedback searching for determining whether a molecule is =
likely to=20
  exhibit biological activity; and the use of data fusion to combine the =
results=20
  of different chemical similarity searches.</BLOCKQUOTE><BR><BR>
<H2>Introduction</H2>
<P>Over the years, researchers in information retrieval (IR) have =
developed many=20
different techniques for the processing of databases of textual =
information.=20
Established examples of such techniques include document clustering, =
relevance=20
feedback, stemming algorithms and text compression, and there are many =
other=20
emerging applications, such as categorisation, event detection and =
information=20
filtering (Sparck Jones and Willett, 1997). One such application is =
multimedia=20
retrieval, where there is much current interest in extending algorithms =
and data=20
structures developed for processing textual databases, for the storage =
and=20
retrieval of speech, image and video data (Maybury, 1997). This paper =
argues=20
that at least some of the algorithms that are used in textual =
information=20
retrieval can also be applied to another type of data, <I>viz</I> the=20
two-dimensional (2D) and three-dimensional (3D) chemical structure data =
that=20
forms one of the principal components of chemical information systems =
(Ash <I>et=20
al</I>., 1991). These systems were first developed principally for =
archival=20
purposes, but now play an important role in research programmes to =
discover=20
novel bioactive molecules for the pharmaceutical and agrochemical =
industries=20
(Martin and Willett, 1998). </P>
<P>The University of Sheffield has been involved in research on both =
chemical=20
and textual information retrieval for many years (Lynch and Willett, =
1986).=20
These studies have led us to believe that both areas have much to offer, =
with=20
research in one providing a fertile source of ideas for research in the =
other.=20
In some cases, the relationship is obvious, with algorithms and data =
structures=20
being transferable with little or no change from one application to =
another;=20
while in other cases, the relationship is less direct, involving more a =
general=20
view of the sorts of information processing techniques that are =
required, rather=20
than a direct transfer of technology. Here, we consider the former, more =
direct,=20
type of relationship.</P>
<P>There are clear similarities in the ways that chemical and textual =
database=20
records are characterised. The documents in a text database are each =
typically=20
indexed by some small number of keywords, in just the same way as the 2D =
or 3D=20
molecular representations in a chemical database are each characterised =
by some=20
small number of substructural features chosen from a much larger number =
of=20
potential attributes (as discussed further in the next section of this =
paper).=20
Moreover, both types of attribute follow a well-marked Zipfian =
distribution,=20
with the skewed distributions that characterise the frequencies of =
occurrence of=20
characters, character substrings and words in text databases being =
mirrored by=20
the comparable distributions for the frequencies of chemical moieties. =
Thus, the=20
overwhelming majority of all of the many millions of molecules that have =
ever=20
been made contain the element carbon but even the tenth most frequent =
element,=20
iodine, occurs only about one thousandth as frequently, with the great =
majority=20
of the elements having vanishingly small frequencies of occurrence; =
similar=20
distributions are observed for other types of chemical substructure =
(Lynch,=20
1977). These shared characteristics mean that the two types of database =
are=20
amenable to efficient processing using the same type of file structure. =
Thus one=20
of the first examples of what would now be referred to as text signature =

searching (Barton <I>et al</I>., 1974) arose from previous studies of =
chemical=20
bit-string processing (Adamson <I>et al</I>., 1973), and similar =
comments apply=20
to the use of an inverted file for rapid database clustering (Willett, =
1981,=20
1982). Finally, in just the same way as a document either is, or is not, =

relevant to some particular user query, so a molecule is active, or is =
not=20
active, in some particular biological test, thus allowing comparable =
performance=20
measures to be used to assess search effectiveness in the two types of =
retrieval=20
system (Edgar <I>et al</I>., 1999). </P>
<P>These similarities mean that it is often possible to apply similar =
algorithms=20
to the two different sorts of database, as we describe in some detail =
below.=20
That said, there are obvious differences, most obviously in the =
semantics of the=20
representations that are used. A 2D chemical structure diagram bears a =
much=20
closer relationship to the molecule it describes than does the set of =
words=20
comprising a textual document, and this relationship is still stronger =
when, as=20
is increasingly the case, a 3D structure with XYZ atomic co-ordinate =
data is=20
available for a molecule (Martin and Willett, 1998). Both the structure =
diagram=20
and the co-ordinates can be regarded as direct manifestations of the =
underlying=20
wave equations that describe a molecule, and it has thus proved possible =
to=20
develop powerful simulation techniques to predict the activities and =
properties=20
of molecules from a knowledge of their 2D or 3D structure. Many of these =

molecular modelling tools have no direct textual equivalent, as the use =
of=20
natural language raises a host of linguistic problems that do not arise =
in the=20
chemical context (although it should be noted that linguistic parsing =
and=20
recognition techniques can be used to represent and search the generic =
chemical=20
structures that occur in chemical patents (Barnard <I>et al</I>., 1984; =
Welford=20
<I>et al</I>., 1981)).</P>
<P>The remainder of this paper discusses several applications to support =
the=20
belief that there may be at least some degree of overlap between the =
techniques=20
used to process chemical and textual databases. The first application, =
which is=20
taken from a previous paper discussing a potential relationship between =
these=20
two domains (Willett, 1997), is that of chemical substructure searching; =
this=20
section also introduces the basic components of chemical information =
systems,=20
thus providing some of the necessary background for the more recent =
applications=20
discussed in the following sections.</P>
<H2>Design of screening systems for chemical substructure searching</H2>
<P>Chemical information systems have historically represented molecules =
by means=20
of their 2D chemical structure diagrams: these are encoded as labelled =
graphs in=20
which the nodes and edges of a graph encode the atoms and bonds of a =
molecule=20
(Ash <I>et al</I>., 1991). Searches for substances that contain a =
specific=20
partial structure, such as a cephalosporin ring, can then be effected by =

graph-matching techniques that permit the identification of all =
occurrences of=20
the user's query substructure in each database structure (Barnard, =
1993). The=20
time-consuming nature of these<I> subgraph isomorphism</I> searches =
means that=20
an initial <I>screening</I> stage is required that can eliminate the =
great bulk=20
of the database from subsequent processing (in just the same way as a =
text=20
signature search is used to reduce the amount of pattern matching that =
is=20
required in serial searches of text databases). The question then arises =
as to=20
what sorts of substructural characteristics should be used as indexing =
keys in=20
the screening search. </P>
<P>The Zipfian distribution of the occurrences of the elements has been=20
mentioned previously, this implying a vast divergence in the =
discriminatory=20
powers of searches of chemical databases that are based on elemental =
type. Work=20
in Sheffield by Lynch and his co-workers in the early Seventies (see,=20
<I>e.g</I>., Adamson <I>et al</I>., 1973) showed that improved =
discriminatory=20
power could be obtained by the use of more sophisticated indexing keys =
that were=20
based on variably-sized chemical <I>fragment substructures</I>, =
<I>i.e</I>.,=20
groups of atoms and bonds. The fragments were chosen so as to occur with =

approximately equal frequencies of occurrence in the database that was =
to be=20
searched, an idea that now forms the basis for many current systems for =
2D=20
substructure searching (Barnard, 1993). </P>
<P>It was soon realised that the concept of equifrequency was of great=20
generality, and this led to an extended investigation of the application =
of=20
equifrequency ideas to the searching and processing of text databases =
(Lynch,=20
1977): here, we consider the studies that were carried out on the =
approach to=20
sorting normally referred to as <I>distribution sorting</I> (Cooper =
<I>et=20
al</I>., 1980; Cooper and Lynch, 1984). The suggested approach involves =
sorting=20
a very large file in two stages: first, an initial, approximate sorting =
stage=20
that sub-divides the file into an ordered series of sub-files; and then =
an exact=20
sort of each of the sub-files. To those of us familiar with the old days =
when=20
catalogue cards had to be sorted by hand, the initial stage is analogous =
to=20
dividing the cards into the sub-files A to D, E to K, L to R and S to Z. =
In the=20
present context, each of these rough pigeonholes represents a separate =
sub-file=20
on disk and each of the ranges is chosen to ensure that approximately =
equal=20
numbers of records are allocated to each such sub-file, with the aim of=20
maximising the efficiency of the final, in-core sorts of these =
sub-files. </P>
<P>The 3D structure of a molecule plays a vital role in determining its=20
biological activity. The "lock-and-key" theory suggests that a molecule =
may be=20
able to act as a drug if it can fit into the active site of a protein in =
much=20
the same way as a key fits into a lock (Martin and Willett, 1998), and =
there is=20
thus great interest in being able to identify molecules that are =
appropriately=20
"key-shaped". This is done by searching for <I>pharmacophores</I>, =
<I>i.e</I>.,=20
the patterns of atoms in 3D space that are thought to be responsible for =
a=20
molecule binding to an active site in a protein molecule. When our =
studies of=20
pharmacophore searching started in the mid-Eighties, it was soon =
realised that=20
the graph-theoretic methods developed for 2D substructure searching were =
also=20
applicable here, with a 3D molecule being described by a graph in which =
the=20
nodes were the atoms and the edges were the inter-atomic distances =
(Willett,=20
1995). However, the implementation of subgraph isomorphism matching on =
such 3D=20
chemical graphs is still more demanding of computational resources than =
is 2D=20
substructure searching, with a consequent need for the development of =
efficient=20
methods of screening. </P>
<P>The graphs representing 2D chemical molecules are composed of atoms =
and=20
bonds, and this is reflected in the compositions of the screens that are =
used=20
for 2D substructure searching. It thus seemed appropriate to investigate =
screens=20
for 3D searching based on the information contained in 3D chemical =
graphs,=20
<I>i.e</I>., atoms and inter-atomic distances, and we decided to =
evaluate=20
screens consisting of a pair of atoms and an inter-atomic distance =
range. Thus,=20
a screen might represent the presence within a molecule of an oxygen =
atom and a=20
nitrogen atom separated by a distance range, <I>e.g</I>., between 5 and =
7=20
=C5ngstroms. Initial experiments demonstrated the vastly skewed =
distributions of=20
distances that characterise databases of typical 3D molecules and there =
was thus=20
a need to identify such inter-atomic screens so that they occurred with=20
approximately equal frequencies of occurrence (Jakes and Willett, 1986). =
</P>
<P>The approach adopted involved varying the width of the distance range =

associated with each pair of atoms, so that highly populated parts of =
the=20
distance distribution were associated with very narrow ranges, while =
less=20
populated parts of this distribution, or very infrequently occurring =
pairs of=20
atoms, were associated with more extended ranges. The algorithm that was =
finally=20
developed was a simple modification of the distribution sorting =
algorithm=20
described above with, in essence, the distance ranges here corresponding =
to the=20
character ranges that underlie the text-sorting application (Cringean =
<I>et=20
al</I>., 1990). This proved to be both effective and efficient, and =
screens=20
based on the distances between pairs of atoms now form the basis for =
nearly all=20
existing systems for 3D substructure searching; indeed, the same basic=20
methodology can be used to characterise the valence or dihedral angular=20
relationships that exist between sets of three or four atoms, =
respectively, thus=20
allowing database searches to be carried out that involve the =
specification of=20
both distance and angular information. More recently, we have =
demonstrated that=20
analogous procedures can be used to search databases where account is =
taken of=20
the fact that most 3D molecules are not completely rigid, but are =
actually in a=20
state of constant flux (so that the "keys" referred to previously might =
be=20
considered as being more akin to a jelly than to a piece of rigid metal) =

(Willett, 1995). This further application demonstrates clearly the =
generality of=20
equifrequency-based approaches for database processing. </P>
<H2>Manipulation of hypertext graphs using measures of inter-molecular=20
structural similarity.</H2>
<P>Text retrieval systems were initially based upon the Boolean =
retrieval model,=20
but the systems were soon extended to permit best match searching (in =
which the=20
documents are ranked in decreasing order of similarity to the query) =
(Sparck=20
Jones and Willett, 1997). A similar progression has occurred in chemical =

information systems, with the substructure searching systems described =
above=20
increasingly being complemented with facilities for what is referred to =
as=20
<I>similarity searching</I>. This generally involves the specification =
of an=20
entire query molecule, the <I>target structure</I>, rather than the =
partial=20
structure that is required for substructure searching. The target is=20
characterised by one or more structural descriptors that are compared =
with the=20
corresponding sets of descriptors for each of the molecules in the =
database to=20
find those <I>nearest neighbour</I> molecules that are most similar to =
the=20
target structure. Two near-contemporaneous studies in the mid-Eighties=20
demonstrated that counts of the numbers of fragment substructures common =
to a=20
pair of molecules provided a computationally efficient, and surprisingly =

effective, basis for quantifying the degree of structural resemblance =
between=20
the two molecules under consideration (Carhart <I>et al</I>., 1985; =
Willett <I>et=20
al</I>., 1986). Specifically, the use of a simple association =
coefficient=20
(usually the Tanimoto coefficient) in conjunction with the lists of =
screens=20
associated with the target structure and each of the database structures =

provided a simple way of investigating inter-molecular structural =
similarities.=20
Such fragment-based methods for similarity searching are now widely used =
as a=20
complement to the established routines for substructure searching =
(Willett=20
<I>et</I> <I>al</I>., 1998) and we have since demonstrated that these =
measures=20
are also applicable to the comparison of textual, rather than chemical, =
graphs.=20
Specifically, as described below, we have used these measures to =
determine the=20
degree of consistency with which hypertext documents are created; other=20
relationships between textual and chemical similarity measures are =
discussed by=20
Willett (1997).</P>
<P>The creation of the intra-document links between the individual =
components of=20
a hypertext document is a difficult, and time-consuming, task, but one =
in which=20
human intervention has commonly been thought necessary if the semantic=20
relationships that exist between the components of the document are to =
be made=20
explicit. A similar view has prevailed for many years with regard to the =

indexing of documents in IR systems, where the existence of =
well-established=20
systems for automatic indexing has not prevented the widespread use of =
trained=20
library and information specialists for indexing and classifying =
documents prior=20
to their incorporation in an online database. The importance of the =
indexing=20
task in IR has led to many studies of <I>inter-indexer consistency</I>,=20
<I>i.e</I>., of the extent to which agreement exists among different =
indexers on=20
the sets of index terms to be assigned to individual documents. These =
studies=20
(as reviewed, <I>e.g</I>., by Markey (1984)) have consistently concluded =
that=20
recorded levels of consistency vary markedly, and that high levels of=20
consistency are rarely achieved; similar examples of manual =
inconsistency are=20
provided by related tasks, such as query formulation and the assessment =
of=20
document relevance (Salton, 1989). The insertion of links in hypertext =
documents=20
may be viewed as being analogous to the assignment of index terms to =
such=20
documents, and we hence undertook a study to determine the extent to =
which=20
different people produce similar link structures for the same hypertext=20
documents (Ellis <I>et</I> <I>al</I>., 1994, 1996). </P>
<P>The hypertext documents were generated from five printed full-text =
documents,=20
each a thesis, journal article or book written by a member of the =
Department of=20
Information Studies at the University of Sheffield. Five copies were =
made of=20
each of the chosen documents, and each of the twenty-five resulting =
copies was=20
allocated to a different student volunteer from the Department. The =
volunteers=20
were instructed in the use of an interactive system that allowed them to =
create=20
explicit representations of links between paragraphs whose contents they =
decided=20
were related, and thus to create hypertext versions of the source =
documents.=20
These hypertexts were stored as graphs in which the nodes represented =
portions=20
of a text (specifically paragraphs in our work but section-based or=20
sentence-based portions could also have been used), with an edge linking =
a pair=20
of nodes if the human linker had created a link between the =
corresponding=20
paragraphs. Pairs of these graphs (describing the same textual document =
but=20
processed by different volunteers) were then compared using a range of=20
similarity measures, most of which were based on those used for chemical =

similarity searching. For example, a commonly used type of chemical =
fragment is=20
the <I>augmented</I> <I>atom</I>, which consists of an atom and the =
atoms that=20
are bonded to it, and it is possible to generate a comparable hypertext =
fragment=20
consisting of a paragraph and those paragraphs that are linked to it. A =
measure=20
of the similarity between two hypertext graphs can then be calculated in =
terms=20
of the number of such fragments that the hypertexts have in common, and =
the=20
resulting similarity is taken as a measure of the degree of inter-linker =

consistency.</P>
<P>Five hypertext versions were produced for each source document, =
giving a=20
total of ten possible pairs of sets of hypertext links for that document =

(<I>i.e</I>., fifty possible pairs for the entire dataset). Similarity =
values=20
were calculated using many different combinations of graph =
representation,=20
fragment descriptor and similarity coefficient. When this was done, it =
was=20
possible to draw a simple, unequivocal conclusion: that levels of =
inter-linker=20
consistency are generally low, but can also be quite variable. It would =
hence=20
seem that different people tend to have very different views of the =
semantic=20
relationships that exist among the components of a full-text document, =
with the=20
possibility that different people will tend to impose different =
hypertext link=20
structures on the same source documents. This lack of agreement leads =
one to=20
question the effectiveness of manually-assigned links in supporting the =
browsing=20
strategies of subsequent hypertext readers, since if inter-linker =
consistency is=20
found to be low, then linker/reader consistency (<I>i.e</I>., the =
level=20
of agreement as to the semantic relationships that exist between the =
components=20
of the text, and that therefore should explicitly be represented by =
links that=20
the reader then has the opportunity to follow) can also be expected to =
be low.=20
Indeed, our results provide some justification for the numerous and =
continuing=20
efforts to develop techniques for the automatic generation of hypertext =
links=20
(see, <I>e.g</I>., Crestani and Melucci, 1998) where the set of links =
created by=20
a particular technique will be produced in a consistent manner, to which =
users=20
might become accustomed as they browse and which might well be no worse =
than=20
those created by human indexers.</P>
<H2>A genetic algorithm for calculating relevance feedback and =
substructural=20
analysis weights.</H2>
<P>The calculation of weights that describe the importance of document =
and query=20
terms is an important component of best-match text IR systems. The most=20
effective results are obtained from using relevance feedback information =

(Robertson and Sparck Jones, 1976; Salton and Buckley, 1990) but efforts =

continue to develop new weighting schemes that might further increase =
search=20
effectiveness. This striving for improved performance has raised the =
question of=20
what is the <I>upperbound</I>, <I>i.e</I>., the maximum possible level, =
that can=20
be achieved by a particular retrieval strategy. We have hence developed =
a=20
genetic algorithm (hereafter a GA) to investigate the upperbound that =
can be=20
achieved by ranked-output retrieval systems that employ weighted query =
terms.=20
Indeed, the GA was developed not only for this purpose but also for =
calculating=20
weights reflecting the extent to which particular types of molecular =
feature=20
contribute to the biological activity of molecules (using the analogies =
between=20
indexing terms and structural features, and between query relevance and=20
biological activity that we have noted in the introduction to this =
paper). We=20
will illustrate the operation of this GA by discussing the textual =
application=20
(Robertson and Willett, 1996) before proceeding to an account of the =
results=20
obtained in our chemical experiments (Gillet <I>et al</I>., 1998). </P>
<P>A GA operates on <I>chromosomes</I>, which are linearly-encoded=20
representations of possible solutions to the problem under =
investigation, with=20
each element of a chromosome describing some particular component of the =
encoded=20
solution. A <I>fitness</I> <I>function</I> is used to calculate a =
numeric score=20
that measures how "good" a solution is represented by each chromosome. =
The set=20
of chromosomes at any particular stage of the processing is referred to =
as the=20
current <I>population</I>, and the chromosomes in this generation are =
processed=20
by <I>genetic</I> <I>operators</I> to yield the population that =
corresponds to=20
the subsequent generation. The two principal operators are =
<I>crossover</I>,=20
which combines parts of the fittest chromosomes, and <I>mutation</I>, =
which=20
introduces new information at random. The chromosomes in the new =
generation are=20
evaluated by means of the fitness function and the genetic operators =
invoked=20
again, this process continuing until convergence is achieved, =
<I>i.e</I>., until=20
there is no increase in the average fitness of each successive =
generation.</P>
<P>Each chromosome in our text GA contains a set of elements, with one =
element=20
for each of the terms comprising the query and with each such element =
containing=20
a weight for one of these terms. The weights for those query terms that =
occur in=20
a particular document are summed, the documents are ranked in decreasing =
order=20
of these sums of weights, and a cut-off is then applied to the ranking =
to=20
retrieve some fixed number of the top-ranked documents. Our experiments =
have=20
used standard document test collections for which full relevance data is =

available, and it is hence possible to determine the recall of the =
search by=20
noting the number of relevant documents that have been retrieved above =
the=20
cut-off. These recall values are used as the fitness function for the =
GA. Once=20
termination has occurred, <I>i.e</I>., the recall values have ceased to=20
increase, the term weights are noted for the chromosome that has the =
largest=20
fitness. Thus, if the GA has successfully explored the full range of =
possible=20
weights, then the final weights provide an upperbound to the retrieval=20
performance obtainable for that query using relevance weighting of =
single=20
terms.</P>
<P>The experiments involved the following seven document test =
collections: Keen=20
(800 documents and 63 queries), Cranfield (1400 documents and 225 =
queries),=20
Harding (2472 documents and 65 queries), Evans (2542 documents and 39 =
queries),=20
LISA (6004 documents and 35 queries), SMART (12694 documents and 77 =
queries) and=20
UKCIS (27361 documents and 182 queries). The GA was run for each query =
in each=20
collection, and the final set of weights used to calculate recall values =
for the=20
top-10, top-20 and top-50 documents. The F4 retrospective relevance =
weights of=20
Robertson and Sparck Jones (1976) were used for comparison with the GA =
weights=20
since the former have been found to provide a consistently high level of =

performance in previous studies of relevance feedback searching.</P>
<P>Table 1 (below) summarises the results obtained when the top-20 =
documents=20
were retrieved from a ranking; many other results are presented and =
discussed by=20
Robertson and Willett (1996). An inspection of this table shows that (as =
would=20
be expected) the GA weights generally perform better than the F4 =
weights, in=20
terms of both average recall and numbers of queries where one was =
superior to=20
the other. However, there are generally many searches where the two =
approaches=20
give the same level of performance (to two decimal places in mean =
recall), and=20
often at least a few queries (and a fair number in the case of the =
Harding=20
collection) where the F4 search is better. Where there are differences, =
in=20
either direction, between the two weights, these are overwhelmingly due =
to one=20
of the weights finding a single additional relevant document above the =
cut-off=20
position in the ranking: thus, the two approaches give very similar =
levels of=20
performance. A study of those few queries where the F4 weights were=20
(unexpectedly) superior suggested that these queries were ones in which =
at least=20
some of the F4 weights were negative (this corresponding to terms that =
occur=20
frequently in a collection but only infrequently in the relevant =
documents). The=20
initial version of the GA did not allow for the possibility of negative =
weights:=20
when it was modified to allow for such occurrences, there was a =
substantial=20
reduction in the (already small) number of queries where the F4 weights =
were=20
more effective (Robertson and Willett, 1996). In general then, the F4 =
weights=20
gave a level of performance that was only marginally inferior to those =
provided=20
by the GA weights, which had been obtained by a detailed exploration of =
the=20
term-weight space defined by the query terms. This being so, it seems =
reasonable=20
to conclude that the F4 weights give a practicable upperbound to the =
performance=20
that is achievable in relevance feedback searches of text databases.</P>
<TABLE cellSpacing=3D1 cellPadding=3D7 width=3D498 align=3Dcenter =
bgColor=3D#ffffc4=20
border=3D1>
  <TBODY>
  <TR>
    <TH vAlign=3Dtop width=3D"16%">Document</TH>=20
    <TH vAlign=3Dtop width=3D"33%" colSpan=3D2>Mean Recall</TH>=20
    <TH vAlign=3Dtop width=3D"51%" colSpan=3D3>Number Of Queries</TH> =
</TR>
  <TR>
    <TH vAlign=3Dtop width=3D"16%">Collection</TH>=20
    <TH vAlign=3Dtop align=3Dmiddle width=3D"15%">F4</TH>=20
    <TH vAlign=3Dtop align=3Dmiddle width=3D"17%">GA</TH>=20
    <TH vAlign=3Dtop align=3Dmiddle width=3D"17%">Same</TH>=20
    <TH vAlign=3Dtop align=3Dmiddle width=3D"17%">F4 Better</TH>=20
    <TH vAlign=3Dtop align=3Dmiddle width=3D"17%">GA Better</TH> </TR>
  <TR>
    <TD vAlign=3Dtop width=3D"16%">Keen</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"15%">0.55</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">0.60</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">31</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">2</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">30</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"16%">Cranfield</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"15%">0.58</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">0.65</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">117</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">6</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">102</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"16%">Harding</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"15%">0.34</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">0.33</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">33</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">16</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">16</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"16%">Evans</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"15%">0.38</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">0.42</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">11</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">7</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">21</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"16%">LISA</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"15%">0.61</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">0.64</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">13</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">5</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">17</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"16%">SMART</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"15%">0.30</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">0.34</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">23</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">10</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">44</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"16%">UKCIS</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"15%">0.18</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">0.21</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">105</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"17%">7</TD>
    <TD vAlign=3Dtop align=3Dmiddle =
width=3D"17%">70</TD></TR></TBODY></TABLE>
<P></P>
<DIV align=3Dcenter><B><I>Table 1</I></B>. Retrieval effectiveness in =
relevance=20
feedback searches using F4 and GA weights.<BR><FONT size=3D-1>[The Mean =
Recall=20
portion of the table is the mean recall when averaged over all of the =
queries=20
for a particular test collection using the two types of weight, while =
the=20
right-hand portion shows the number of queries where the GA and F4 =
weights gave=20
the same recall value (Same), and where either the F4 weights or the GA =
weights=20
were better.]</FONT></DIV>
<P>The GA that was used for these relevance feedback experiments was =
also=20
designed to calculate weights for <I>substructural analysis</I> (Cramer =
<I>et=20
al</I>., 1974). Here, weights are calculated that relate the presence of =
a=20
molecular feature to the probability that that molecule is active in =
some=20
biological test system (<I>cf</I>. relating the presence of a specific =
index term=20
in a document to the probability that that document is relevant to a =
particular=20
query). Given some training set of compounds for which the biological =
activities=20
are available, the aim of substructural analysis is to develop weights =
that can=20
then be used to select new compounds for biological testing. =
Specifically the=20
sum of weights is calculated for the fragment substructures present in a =

molecule, and then the compounds are ranked in order of decreasing=20
sums-of-weights, so that chemical synthesis and biological assays can be =
focused=20
on those compounds that have a high <I>a priori</I> probability of =
activity.=20
</P>
<P>The use of substructural analysis methods with fragment substructures =
(such=20
as those discussed in the second section of this paper) is well =
established,=20
with many different types of weighting scheme having been described in =
the=20
literature (Ormerod <I>et al</I>., 1989). The project to be described =
here used=20
a rather different level of molecular description, specifically =
high-level=20
molecular characteristics suggested by medicinal chemists at =
GlaxoWellcome=20
Research and Development (our collaborators in this project) as =
affecting the=20
drug-like behaviour of molecules. The features are the distribution of =
property=20
values for the molecular weight, the =B2&kappa;&alpha; shape index =
(Kier, 1987), and the numbers of =
aromatic=20
rings, rotatable bonds, hydrogen-bond donor atoms and hydrogen-bond =
acceptor=20
atoms, in the molecules comprising a database. Each property was =
allocated a=20
total of 20 bins; for example, the first bin for the hydrogen-bond donor =
feature=20
describes those molecules in a dataset that have no donor atoms, the =
second bin=20
those that have one donor atom, and so on until the 20th and last, which =

represents those that have 19 or more donor atoms. The molecular weight =
and=20
shape index bins contained ranges of values, rather than specific =
integer=20
counts, <I>e.g</I>., the first two bins for molecular weight described =
the=20
molecules with weights in the range 0-74.99 and 75-149.99, <I>etc</I>. =
</P>
<P>The GA described previously was used to calculate weights for each =
bin of=20
each feature, with each such weight reflecting the extent to which =
possession of=20
that feature-value combination resulted in a molecule having some =
specific=20
activity. The fitness function was then based on the occurrences of =
these=20
feature-value pairs in sets of molecules for which activity data are =
available.=20
For this purpose, we used sets of structures extracted from the <I>World =
Drugs=20
Index</I> (or WDI, available from Derwent Information at URL=20
http://www.derwent.co.uk) and <I>SPRESI</I> (available from Daylight =
Chemical=20
Information Systems Inc. at http://www.daylight.com) databases: the =
former=20
contains molecules for which biological activities have been established =
while=20
the latter contains a large number of molecules that are assumed to be =
inactive.=20
As with the text application, the chromosome in the GA encodes possible =
weights=20
(in this case for each of the feature-value bins), and then the score =
for a=20
particular molecule is the sum of the weights for those feature values =
that are=20
associated with it. The fitness function of the GA is simply the number =
of=20
top-ranked active molecules once the molecules have been ranked in =
decreasing=20
order of these sums-of-weights. </P>
<P>Sets of 1000 molecules with some particular activity were chosen from =
the WDI=20
and then combined with a set of 16,807 SPRESI structures. The GA was =
used to=20
calculate weights for this combined dataset, and a note made of the =
number of=20
actives in the top 1000 positions once convergence had occurred. The=20
effectiveness of the weights is measured by the degree of <I>initial</I> =

<I>enhancement</I>, <I>i.e</I>., the ratio of the observed number of =
top-1000=20
actives to the number of actives that would be obtained by selecting =
1000=20
compounds at random (Gillet <I>et al</I>., 1998). These results are =
summarised=20
in Table 2 and it will be seen that the GA weights give consistently =
better=20
results than merely selecting compounds at random for testing, although =
the=20
extent of the improvement is clearly dependent upon the specific =
activity under=20
investigation. Gillet <I>et al</I>. (1998) report comparable results =
from=20
extended experiments using various forms of these weights on a range of=20
datasets; these involved not only retrospective studies (as here) but =
also=20
experiments where the weights were used in a predictive manner. It was =
concluded=20
that the GA provided a simple and effective way of ranking sets of =
compounds in=20
order of decreasing probability of activity, thus enabling the =
prioritisation of=20
compounds for synthesis and biological testing.</P>
<TABLE style=3D"WIDTH: 353px; HEIGHT: 370px" cellSpacing=3D1 =
cellPadding=3D7 width=3D328=20
align=3Dcenter bgColor=3D#ffffc4 border=3D1>
  <TBODY>
  <TR>
    <TH vAlign=3Dtop width=3D"54%">Drug Activity</TH>=20
    <TH vAlign=3Dtop width=3D"46%">Initial Enhancement</TH> </TR>
  <TR>
    <TD vAlign=3Dtop width=3D"54%">Hormones</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"46%">8.3</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"54%">Anti-cancers</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"46%">7.6</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"54%">Anti-microbials</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"46%">7.6</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"54%">Anaesthetics</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"46%">4.1</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"54%">Blood</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"46%">3.6</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"54%">Central nervous system</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"46%">3.9</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"54%">Psychotropics</TD>
    <TD vAlign=3Dtop align=3Dmiddle =
width=3D"46%">2.5</TD></TR></TBODY></TABLE>
<P></P>
<DIV align=3Dcenter>Table 2. Effectiveness of ranking active compounds =
using the=20
GA-based substructural analysis weights.<BR><FONT size=3D-1>[The Drug =
Activity=20
listed is that contained in the keyword activity field for each of the =
compounds=20
in the WDI database.]</FONT></DIV>
<H2>Combination of chemical similarity measures using data fusion</H2>
<P>The many types of similarity measures that are available for the =
measurement=20
of molecular similarity have led to comparative studies in which =
researchers try=20
to identify a single, "best" measure, using some quantitative =
performance=20
criterion. For example, Willett <I>et al</I>. (1986) discuss the merits =
of=20
different association coefficients for chemical similarity searching and =
conclude=20
by recommending the Tanimoto coefficient for this purpose. Such =
comparisons, of=20
which there are many in the chemical information literature, are limited =
in that=20
they assume, normally implicitly, that there is some specific type of =
structural=20
feature (similarity coefficient, weighting scheme or whatever it is that =
is=20
being investigated) that is uniquely well suited to describing the =
type(s) of=20
biological activity that are being sought for in a similarity search. =
The=20
assumption cannot be expected to be generally valid, given the =
multi-faceted=20
nature of biological activities, and this has led us to consider =
chemical=20
applications of <I>data</I> <I>fusion</I> (Hall, 1992). </P>
<P>Data fusion was developed to combine inputs from different sensors, =
with the=20
expectation that using multiple information sources enables more =
effective=20
decisions to be made than if just a single sensor was to be employed. =
The=20
methods are used in a wide range of military, surveillance, medical and=20
production engineering applications (see, <I>e.g</I>., Arabnia and Zhu =
(1998)):=20
our interest was aroused by a paper by Belkin <I>et al</I>. (1995), in =
which=20
data fusion was used to combine the results of different searches of a =
text=20
database, conducted in response to a single query but employing =
different=20
indexing and searching strategies. A query was processed using different =

strategies, each of which was used to rank the database in order of=20
decreasing similarity with the query. The ranks for each of the =
documents were=20
then combined using one of several different fusion rules, the output of =
the=20
fusion rule was taken as the document's new similarity score and the =
fused=20
lists were then re-ranked in descending order of similarity. </P>
<P>Ginn <I>et al</I>. (1999) have described the application of these =
ideas in=20
the context of matching a target structure against a database, using =
several=20
different measures of chemical similarity as summarised in Figure 1.</P>
<DIV style=3D"BACKGROUND-COLOR: aqua">
<OL>
  <LI>Execute a similarity search of a chemical database for some =
particular=20
  target structure using two, or more, different measures of =
inter-molecular=20
  similarity.=20
  <P></P>
  <LI>Note the rank position, <I>r<SUB>i</SUB></I>, of each database =
structure=20
  in the ranking resulting from use of the <I>i</I>-th similarity =
measure.=20
  <P></P>
  <LI>Combine the various rankings using a fusion rule to give a new =
combined=20
  score for each database structure=20
  <P></P>
  <LI>Rank the resulting combined scores, and then use this ranking to =
calculate=20
  a quantitative measure of the effectiveness of the search for the =
chosen=20
  target structure.</LI></OL></DIV>
<DIV align=3Dcenter>Figure 1. Combination of similarity rankings using =
data=20
fusion</DIV>
<P>The fusion rules used were those identified by Belkin <I>et al</I>. =
(1995)=20
and shown in Figure 2, and the combined scores output by the fusion rule =
are=20
then used to re-order the database structures to give the final ranked =
output.=20
It will be seen from Figure 2 that the MIN and MAX rules represent the=20
assignment of extreme ranks to database structures and it is thus hardly =

surprising that both can be highly sensitive to the presence of a single =
"poor"=20
similarity measure amongst those that are being combined. The SUM rule, =
where=20
each database structure is assigned the sum of all the rank positions at =
which=20
it occurs in the input lists, is expected to be more stable against the =
presence=20
of a single poor or noisy input ranking, and this was generally found to =
be the=20
rule of choice in our experiments (Ginn <I>et al</I>., 1999).</P>
<TABLE cellSpacing=3D1 cellPadding=3D7 width=3D261 align=3Dcenter =
bgColor=3D#ffffc4=20
border=3D1>
  <TBODY>
  <TR>
    <TD vAlign=3Dtop width=3D"25%">
      <P align=3Dcenter>Name</P></TD>
    <TD vAlign=3Dtop width=3D"75%">
      <P align=3Dcenter>Fusion<B> </B>Rule</P></TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"25%">
      <P align=3Dcenter>MIN</P></TD>
    <TD vAlign=3Dtop width=3D"75%">
      <P>minimum (<I>r</I><SUB>1</SUB>, <I>r</I><SUB>2</SUB>, =
&hellip;, <I>r</I><SUB>i</SUB>, &hellip;, =
<I>r</I><SUB>n</SUB>)</P></TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"25%">
      <P align=3Dcenter>MAX</P></TD>
    <TD vAlign=3Dtop width=3D"75%">
      <P>maximum (<I>r</I><SUB>1</SUB>, <I>r</I><SUB>2</SUB>, =
&hellip;, <I>r</I><SUB>i</SUB>, &hellip;, =
<I>r</I><SUB>n</SUB>)</P></TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"25%">
      <P align=3Dcenter>SUM</P></TD>
    <TD vAlign=3Dtop width=3D"75%">
      <P align=3Dcenter><IMG height=3D30=20
      src=3D"http://informationr.net/ir/5-2/p69fig1.gif"=20
  width=3D45></P></TD></TR></TBODY></TABLE>
<P></P>
<DIV align=3Dcenter>Figure 2: Fusion rules for combining <I>n</I> ranked =
lists,=20
where <I>r<SUB>i</SUB></I> denotes the rank position of a specific =
database=20
structure in the <I>i</I>-th (1 &le; <I>i</I> &le; <I>n</I>) ranked =
list.</DIV>
<P>Our experiments involved combining searches of several different =
datasets=20
using several different similarity measures in each case. The dataset =
considered=20
here contained 75 compounds selected by Kahn (1998) in a discussion of =
various=20
types of structural descriptor, with each of the compounds belonging to =
one of=20
14 well-defined biological activity classes (such as =
angiotensin-converting=20
enzyme inhibitors and HIV-1 protease inhibitors). Six different types of =

similarity measures were used in the experiments, as detailed by Ginn =
<I>et=20
al</I>. (1999); for the present, we note merely that they encoded =
information=20
about the steric, electrostatic and hydrophobic characteristics of =
molecules=20
(similarity measures denoted by the symbols "F" and "J"), about the 3D=20
arrangement of pharmacophore points in molecules (denoted by the symbols =
"3" or=20
"T") and about the occurrences of chains of up to 7 non-hydrogen atoms =
(denoted=20
by the symbols "2" or "N"). The SUM rule was used to generate all =
possible=20
combinations of rankings from these six similarity measures. Each of the =
members=20
of the dataset was used as the target structure for a similarity search =
and=20
Table 3 details the mean numbers of actives (<I>i.e</I>., molecules with =
the=20
same activity as the target structure) found in the top-10 nearest =
neighbours=20
when averaged over all 75 searches. The values of <I>c</I> at the top of =
the=20
table denote the number of similarity measures that were fused (so that, =

<I>e.g</I>., <I>c</I>=3D1 represents the original measures and =
<I>c</I>=3D2=20
represents the fusion of a pair of the original measures) and a shaded =
element=20
indicates a fused combination that is better than the best original =
individual=20
measure (which was one of the 3D pharmacophore measures).</P>
<TABLE cellSpacing=3D1 cellPadding=3D7 width=3D612 align=3Dcenter =
bgColor=3D#ffffc4=20
border=3D1>
  <TBODY>
  <TR>
    <TH vAlign=3Dtop align=3Dmiddle width=3D"14%" =
colSpan=3D2><I>c</I>=3D1</TH>=20
    <TH vAlign=3Dtop align=3Dmiddle width=3D"16%" =
colSpan=3D2><I>c</I>=3D2</TH>=20
    <TH vAlign=3Dtop align=3Dmiddle width=3D"16%" =
colSpan=3D2><I>c</I>=3D3</TH>=20
    <TH vAlign=3Dtop align=3Dmiddle width=3D"17%" =
colSpan=3D2><I>c</I>=3D4</TH>=20
    <TH vAlign=3Dtop align=3Dmiddle width=3D"18%" =
colSpan=3D2><I>c</I>=3D5</TH>=20
    <TH vAlign=3Dtop align=3Dmiddle width=3D"20%" =
colSpan=3D2><I>c</I>=3D6</TH> </TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">2</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">0.80</TD>
    <TD vAlign=3Dtop width=3D"8%">23</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">1.10</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>23F</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.28</TD>
    <TD vAlign=3Dtop width=3D"9%" bgColor=3D#d1ffa4>23FJ</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.52</TD>
    <TD vAlign=3Dtop width=3D"11%" bgColor=3D#d1ffa4>23FJN</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.45</TD>
    <TD vAlign=3Dtop width=3D"12%" bgColor=3D#d1ffa4>23FJNT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.43</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">3</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">1.12</TD>
    <TD vAlign=3Dtop width=3D"8%">2F</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">1.04</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>23J</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.39</TD>
    <TD vAlign=3Dtop width=3D"9%" bgColor=3D#d1ffa4>23FN</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.23</TD>
    <TD vAlign=3Dtop width=3D"11%" bgColor=3D#d1ffa4>23FJT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.69</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">F</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">0.89</TD>
    <TD vAlign=3Dtop width=3D"8%">2J</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">1.01</TD>
    <TD vAlign=3Dtop width=3D"8%">23N</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">1.04</TD>
    <TD vAlign=3Dtop width=3D"9%" bgColor=3D#d1ffa4>23FT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.43</TD>
    <TD vAlign=3Dtop width=3D"11%" bgColor=3D#d1ffa4>23FNT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.36</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">J</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">1.08</TD>
    <TD vAlign=3Dtop width=3D"8%">2N</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">0.68</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>23T</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.24</TD>
    <TD vAlign=3Dtop width=3D"9%" bgColor=3D#d1ffa4>23JN</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.31</TD>
    <TD vAlign=3Dtop width=3D"11%" bgColor=3D#d1ffa4>23JNT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.43</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">N</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">0.63</TD>
    <TD vAlign=3Dtop width=3D"8%">2T</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">0.95</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>2FJ</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.35</TD>
    <TD vAlign=3Dtop width=3D"9%" bgColor=3D#d1ffa4>23JT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.45</TD>
    <TD vAlign=3Dtop width=3D"11%" bgColor=3D#d1ffa4>2FJNT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.43</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">T</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">0.69</TD>
    <TD vAlign=3Dtop width=3D"8%">3F</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">1.09</TD>
    <TD vAlign=3Dtop width=3D"8%">2FN</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">1.08</TD>
    <TD vAlign=3Dtop width=3D"9%" bgColor=3D#d1ffa4>23NT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.25</TD>
    <TD vAlign=3Dtop width=3D"11%" bgColor=3D#d1ffa4>3FJNT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.51</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>3J</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.25</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>2FT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.28</TD>
    <TD vAlign=3Dtop width=3D"9%" bgColor=3D#d1ffa4>2FJN</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.28</TD>
    <TD vAlign=3Dtop width=3D"11%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">3N</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">1.00</TD>
    <TD vAlign=3Dtop width=3D"8%">2JN</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">1.03</TD>
    <TD vAlign=3Dtop width=3D"9%" bgColor=3D#d1ffa4>2FJT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.53</TD>
    <TD vAlign=3Dtop width=3D"11%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>3T</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.32</TD>
    <TD vAlign=3Dtop width=3D"8%">2JT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">1.10</TD>
    <TD vAlign=3Dtop width=3D"9%" bgColor=3D#d1ffa4>2FNT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.28</TD>
    <TD vAlign=3Dtop width=3D"11%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>FJ</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.20</TD>
    <TD vAlign=3Dtop width=3D"8%">2NT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">0.95</TD>
    <TD vAlign=3Dtop width=3D"9%" bgColor=3D#d1ffa4>2JNT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.17</TD>
    <TD vAlign=3Dtop width=3D"11%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">FN</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">0.91</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>3FJ</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.40</TD>
    <TD vAlign=3Dtop width=3D"9%" bgColor=3D#d1ffa4>3FJN</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.35</TD>
    <TD vAlign=3Dtop width=3D"11%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">FT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">1.11</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>3FN</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.19</TD>
    <TD vAlign=3Dtop width=3D"9%" bgColor=3D#d1ffa4>3FJT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.55</TD>
    <TD vAlign=3Dtop width=3D"11%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">JN</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">0.89</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>3FT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.33</TD>
    <TD vAlign=3Dtop width=3D"9%" bgColor=3D#d1ffa4>3FNT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.41</TD>
    <TD vAlign=3Dtop width=3D"11%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">JT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">0.93</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>3JN</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.25</TD>
    <TD vAlign=3Dtop width=3D"9%" bgColor=3D#d1ffa4>3JNT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.36</TD>
    <TD vAlign=3Dtop width=3D"11%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">NT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">0.85</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>3JT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.45</TD>
    <TD vAlign=3Dtop width=3D"9%" bgColor=3D#d1ffa4>FJNT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.32</TD>
    <TD vAlign=3Dtop width=3D"11%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>3NT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.20</TD>
    <TD vAlign=3Dtop width=3D"9%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"11%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">FJN</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">1.11</TD>
    <TD vAlign=3Dtop width=3D"9%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"11%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>FJT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.21</TD>
    <TD vAlign=3Dtop width=3D"9%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"11%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">FNT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%">1.11</TD>
    <TD vAlign=3Dtop width=3D"9%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"11%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR>
  <TR>
    <TD vAlign=3Dtop width=3D"6%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%" bgColor=3D#d1ffa4>JNT</TD>
    <TD vAlign=3Dtop align=3Dmiddle width=3D"8%" =
bgColor=3D#d1ffa4>1.12</TD>
    <TD vAlign=3Dtop width=3D"9%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"11%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"12%">&nbsp;</TD>
    <TD vAlign=3Dtop width=3D"8%">&nbsp;</TD></TR></TBODY></TABLE>
<P></P>
<DIV align=3Dcenter><B>Table 3</B>. Mean number of actives found in the =
top-10=20
positions for chemical similarity searches when combining various =
numbers,=20
<I>c</I>, of different similarity measures.<BR><FONT size=3D-1>[The =
pale-green=20
shading indicates a fused result at least as good as the best original=20
similarity measure.]</FONT></DIV>
<P>It will be seen that very many of the fused combinations in Table 3 =
are=20
shaded, thus supporting the idea that improvements in effectiveness can =
be=20
achieved by using more than just a single similarity measure. The table =
also=20
shows that the fraction of the combinations that are shaded increases in =
line=20
with <I>c</I>, so that all combinations with <I>c</I> &ge; 4 perform =
at least as well as the best of the individual similarity=20
measures. That said, the best result overall was obtained with 23FJT =
(rather=20
than with 23FJNT, the combination involving all of the individual =
measures):=20
thus, while simply fusing as many individual measures as are available =
appears=20
to work very well, superior results may be obtained (for this dataset at =
least)=20
from fusing a subset of the individual measures.</P>
<P>Other datasets studied by Ginn <I>et al</I>. (1999) were: 8178 =
molecules from=20
the Starlist file (available from BioByte Corp. at URL =
http://clogp.pomona.edu)=20
for which experimental values of the octanol/water partition =
coefficient, an=20
important parameter in statistical methods for the prediction of =
biological=20
activity, were available; several sets of 3,500 molecules from the WDI =
database=20
for which biological activity data were available; and 136 biological =
dyes used=20
to stain cells so as to visualise various organelles. In all cases, it =
was found=20
that use of a fusion rule such as SUM generally resulted in a level of=20
performance (however quantified) that was at least as good as (and often =

noticeably better than) the best individual measure. The best individual =
measure=20
often varies from one target structure to another in an unpredictable =
manner,=20
and the use of a fusion rule will thus generally provide a more =
consistent level=20
of searching performance than will a single measure of chemical =
similarity. With=20
the increasing number of such measures available that can be implemented =
with a=20
reasonable degree of efficiency (Willett <I>et al</I>., 1998), it would =
seem=20
appropriate to consider the use of more than one of them for similarity=20
searching of chemical structure databases.</P>
<H2>Conclusions</H2>
<P>This paper has presented several examples of algorithms and data =
structures=20
that are applicable to the processing of both textual and chemical =
databases.=20
These examples suggest that each area has much to offer to the other, =
with the=20
transfer of methodology occurring from chemical to textual applications, =
and=20
<I>vice versa</I>.</P>
<P>The substructure searching methods discussed in the second section =
provide an=20
excellent example of this transfer of ideas. Here, an approach that was =
first=20
developed for chemical retrieval (specifically in the context of 2D =
substructure=20
searching) was soon shown to be applicable to many applications in the =
general=20
area of textual databases. One such application, distribution sorting, =
resulted=20
in an algorithm that was then adapted for use in a rather different area =
of=20
chemical retrieval (3D substructure searching). In fact, although not =
discussed=20
here, the idea of using distance and angular information for database =
searching=20
has now been extended to the computationally demanding task of searching =
for=20
patterns in the 3D structures of proteins, where our search methods have =
led to=20
the discovery of many previously unknown structural relationships =
between=20
proteins (see, <I>e.g</I>., Artymiuk <I>et al</I>., 1996, 1997). In the =
case of=20
the hypertext-comparison project, the similarity measures discussed =
above were=20
originally developed for calculating the similarities between pairs of =
2D=20
molecules, while the GA for the calculation of weights was designed from =
the=20
start for use in both relevance feedback and substructural analysis. =
Finally,=20
our use of data fusion for combining chemical similarity measures was a =
simple=20
application of work done by others in the textual domain. Given the =
range of=20
applications we have already been able to identify, we trust that other=20
researchers will be encouraged to investigate the many similarities that =
exist=20
between chemical and textual database processing. </P>
<P><B>Acknowledgements</B>. I thank all of my colleagues, past and =
present, for=20
their contributions to the research that has been summarised here, and =
the many=20
organisations that have provided financial support for my research over =
the=20
years. The Krebs Institute for Biomolecular Research is a designated=20
Biomolecular Sciences Centre of the Biotechnology and Biological =
Sciences=20
Research Council.</P>
<H2>References</H2>
<UL>
  <LI>Adamson, G.W., Cowell, J., Lynch, M.F., McLure, A.H.W., Town, W.G. =
and=20
  Yapp, A.M. (1973) "Strategic considerations in the design of screening =
systems=20
  for substructure searches of chemical structure files." <I>Journal of =
Chemical=20
  Documentation</I>, <B>13</B>, 153-157.=20
  <LI>Arabnia, H.R. and Zhu, D., editors (1998). <I>Proceedings of the=20
  International Conference on Multisource-Multisensor Information =
Fusion,=20
  Fusion '98</I>. CSREA Press.=20
  <LI>Artymiuk, P.J., Poirrette, A.R., Rice, D.W. and Willett, P. (1996) =
"Biotin=20
  carboxylase comes into the fold." <I>Nature Structural Biology</I>, =
<B>3</B>,=20
  128-132.=20
  <LI>Artymiuk, P.J., Poirrette, A.R., Rice, D.W. and Willett, P. (1997) =
"A=20
  polymerase I palm in adenylyl cyclase?" <I>Nature</I>, <B>388</B>, =
1997,=20
  33-34.=20
  <LI>Ash, J.E., Warr, W.A. and Willett, P., editors (1991) <I>Chemical=20
  Structure Systems</I>. Chichester: Ellis Horwood.=20
  <LI>Barnard, J.M. (1993) "Substructure searching methods: old and =
new."=20
  <I>Journal of Chemical Information and Computer Sciences</I>, =
<B>33</B>, 532-538.=20
  <LI>Barnard, J.M., Lynch, M.F. and Welford, S.M. (1984) "Computer =
storage and=20
  retrieval of generic chemical structures in patents. Part 6. An =
interpreter=20
  program for the generic structure language GENSAL." <I>Journal of =
Chemical=20
  Information and Computer Sciences</I>, <B>24</B>, 66-70.=20
  <LI>Barton, I.J., Creasey, S.E., Lynch, M.F. and Snell, M.J. (1974) =
"An=20
  information-theoretic approach to text-searching in direct-access =
systems."=20
  <I>Communications of the Association for Computing Machinery</I>, =
<B>17</B>,=20
  345-350.=20
  <LI>Belkin, N.J., Kantor, P., Fox, E.A. and Shaw, J.B. (1995) =
"Combining the=20
  evidence of multiple query representations for information =
retrieval."=20
  <I>Information Processing and Management</I>, <B>31</B>, 431-448.=20
  <LI>Carhart, R.E., Smith, D.H., Venkataraghavan, R. (1985) "Atom pairs =
as=20
  molecular features in structure-activity studies: definition and=20
  applications." <I>Journal of Chemical Information and Computer =
Sciences</I>,=20
  <B>25</B>, 64-73.=20
  <LI>Cooper, D., Dicker, M.E. and Lynch, M.F. (1980) "Sorting of =
textual=20
  databases: a variety generation approach to distribution sorting."=20
  <I>Information Processing and Management</I>, <B>16</B>, 49-56.=20
  <LI>Cooper, D. and Lynch, M.F. (1984) "The use of binary search trees =
in=20
  external distribution sorting." <I>Information Processing and =
Management</I>,=20
  <B>20</B>, 547-557.=20
  <LI>Cramer, R.D., Redl, G. and Berkoff, C.E. (1974) "Substructural =
analysis. A=20
  novel approach to the problem of drug design." <I>Journal of Medicinal =

  Chemistry</I>, <B>17</B>, 533-535.=20
  <LI>Crestani, F. and Melucci, M. (1998) "A case study of automatic =
authoring:=20
  From a textbook to a hyper-textbook." <I>Data and Knowledge =
Engineering</I>,=20
  <B>27</B>, 1-30.=20
  <LI>Cringean, J.K., Pepperrell, C.A., Poirrette, A.R. and Willett, P. =
(1990)=20
  "Selection of screens for three-dimensional substructure searching."=20
  <I>Tetrahedron Computer Methodology</I>, <B>3</B>, 37-46.=20
  <LI>Edgar, S.J., Holliday, J.D. and Willett, P. (1999) "Effectiveness =
of=20
  retrieval in similarity searches of chemical databases: a review of=20
  performance measures." In preparation.=20
  <LI>Ellis, D., Furner-Hines, J. and Willett, P. (1994) "On the =
creation of=20
  hypertext links in full-text documents: Measurement of inter-linker=20
  consistency." <I>Journal of Documentation</I>, <B>50</B>, 67-98.=20
  <LI>Ellis, D., Furner-Hines, J. and Willett, P. (1996) "On the =
creation of=20
  hypertext links in full-text documents: Measurement of retrieval=20
  effectiveness." <I>Journal of the American Society for Information=20
  Science</I>, <B>47</B>, 287-300.=20
  <LI>Gillet, V.J., Willett, P. &amp; Bradshaw, J. (1998) =
"Identification of=20
  biological activity profiles using substructural analysis and genetic=20
  algorithms." <I>Journal of Chemical Information and Computer =
Sciences</I>,=20
  <B>38</B>, 165-179.=20
  <LI>Ginn, C.M.R., Willett, P. and Bradshaw, J. (1999) "Combination of=20
  molecular similarity measures using data fusion." <I>Perspectives in =
Drug=20
  Discovery and Design</I>, submitted for publication.=20
  <LI>Hall, D.L. (1992) <I>Mathematical Techniques in Multisensor Data=20
  Fusion</I>. Northwood, MA: Artech House.=20
  <LI>Jakes, S.E. and Willett, P. (1986) "Pharmacophoric pattern =
matching in=20
  files of 3-D chemical structures: selection of inter-atomic distance =
screens."=20
  <I>Journal of Molecular Graphics</I>, <B>4</B>, 12-20.=20
  <LI>Kahn, S.D. (1998) "Combinatorial libraries: structure activity =
analysis",=20
  in: Schleyer, P.v.R., Allinger, N.L., Clark, T., Gasteiger, J., =
Kollman, P.A.,=20
  Schaefer III, H.F. and Schreiner, P.R., editors, <I>Encyclopedia of=20
  Computational Chemistry</I>. Chichester: John Wiley. Vol 1, pp. =
417-425.=20
  <LI>Kier, L.B. (1987) "Indexes of molecular shape from chemical =
graphs."=20
  <I>Medicinal Research Reviews</I>, <B>7</B>, 417-440.=20
  <LI>Lynch, M.F. (1977) "Variety generation - a re-interpretation of =
Shannon's=20
  mathematical theory of communication and its implications for =
information=20
  science." <I>Journal of the American Society for Information=20
  Science</I>, <B>28</B>, 19-25.=20
  <LI>Lynch, M.F. and Willett, P. (1987) "Information retrieval research =
in the=20
  Department of Information Studies, University of Sheffield." =
<I>Journal of=20
  Information Science</I>, <B>13</B>, 221-234.=20
  <LI>Markey, K. (1984) "Inter-indexer consistency tests: A literature =
review=20
  and report of a test of consistency in indexing visual materials." =
<I>Library=20
  and Information Science Research</I>, <B>6</B>, 155-177.=20
  <LI>Martin, Y.C. and Willett, P., editors (1998) <I>Designing =
Bioactive=20
  Molecules: Three-Dimensional Techniques and Applications</I>. =
Washington:=20
  American Chemical Society.=20
  <LI>Maybury, M.T., editor (1997) <I>Intelligent Multimedia Information =

  Retrieval</I>. Cambridge MA: MIT Press.=20
  <LI>Ormerod, A., Willett, P. and Bawden, D. (1989) "Comparison of =
fragment=20
  weighting schemes for substructural analysis." <I>Quantitative=20
  Structure-Activity Relationships</I>, <B>8</B>, 115-129.=20
  <LI>Robertson, A.M. and Willett, P. (1996) "An upperbound to the =
performance=20
  of ranked-output searching: optimal weighting of query terms using a =
genetic=20
  algorithm." <I>Journal of Documentation</I>, <B>52</B>, 1996, 405-420. =

  <LI>Robertson, S.E. and Sparck Jones, K. (1976) "Relevance weighting =
of search=20
  terms." <I>Journal of the American Society for Information =
Science</I>,=20
  <B>27</B>, 129-146.=20
  <LI>Salton, G. (1989) <I>Automatic Text Processing</I>. Reading, MA:=20
  Addison-Wesley.=20
  <LI>Salton, G. and Buckley, C. (1990) "Improving retrieval performance =
by=20
  relevance feedback." <I>Journal of the American Society for =
Information=20
  Science</I>, <B>41</B>, 288-297.=20
  <LI>Sparck Jones, K. and Willett, P., editors (1997) <I>Readings in=20
  Information Retrieval</I>. San Francisco: Morgan Kaufmann.=20
  <LI>Welford, S.M., Lynch, M.F. and Barnard, J.M. (1981) "Computer =
storage and=20
  retrieval of generic chemical structures in patents. Part 3. Chemical=20
  structure grammars and their role in the manipulation of chemical =
structures."=20
  <I>Journal of Chemical Information and Computer Sciences</I>, =
<B>21</B>,=20
  161-168.=20
  <LI>Willett, P. (1981) "A fast procedure for the calculation of =
similarity=20
  coefficients in automatic classification." <I>Information Processing =
and=20
  Management</I>, <B>17</B>, 53-60.=20
  <LI>Willett, P. (1982) "The calculation of inter-molecular similarity=20
  coefficients using an inverted file algorithm." <I>Analytica Chimica =
Acta</I>,=20
  <B>138</B>, 339-342.=20
  <LI>Willett, P. (1995) "Searching for pharmacophoric patterns in =
databases of=20
  three-dimensional chemical structures." <I>Journal of Molecular=20
  Recognition</I>, <B>8</B>, 290-303.=20
  <LI>Willett, P. (1997) "Information retrieval research in the =
University of=20
  Sheffield", <I>ACM SIGIR Forum</I>, <B>31</B>(2), 7-13.=20
  <LI>Willett, P., Barnard, J.M. and Downs, G.M. (1998) "Chemical =
similarity=20
  searching." <I>Journal of Chemical Information and Computer =
Sciences</I>,=20
  <B>38</B>, 983-996.=20
  <LI>Willett, P., Winterman, V., Bawden, D. (1986) "Implementation of =
nearest=20
  neighbour searching in an online chemical structure search system."=20
  <I>Journal of Chemical Information and Computer Sciences</I>, =
<B>26</B>,=20
  36-41. </LI></UL>
<HR color=3D#ff00ff SIZE=3D3>

<P style=3D"FONT-WEIGHT: bold; COLOR: red; TEXT-ALIGN: center">How to =
cite this=20
paper:</P>
<P style=3D"COLOR: black; TEXT-ALIGN: left">Willett, Peter (2000)&nbsp; =
"Textual=20
and chemical information processing: different domains but similar=20
algorithms"&nbsp;&nbsp;<EM>Information Research</EM>, =
<STRONG>5</STRONG>(2)=20
Available at: http://informationr.net/ir/5-2/paper69.html</P>
<P style=3D"TEXT-ALIGN: center">=A9 the author, 2000. &nbsp; Last =
updated: 5th=20
January 2000 </P>
<HR color=3D#ff00ff SIZE=3D3>

<TABLE cellSpacing=3D0 cellPadding=3D15 align=3Dcenter border=3D0>
  <TBODY>
  <TR>
    <TD><A href=3D"http://informationr.net/ir/5-2/infres52.html">
      <H4>Contents</H4></A></TD>
    <TD vAlign=3Dtop align=3Dmiddle>
      <H5 align=3Dcenter><IMG height=3D20 hspace=3D4=20
      src=3D"http://counter.digits.com/wc/-d/-z/6/-b/FF0033/paper69" =
width=3D60=20
      align=3Dmiddle vspace=3D2 border=3D0><BR><A =
href=3D"http://www.digits.com/">Web=20
      Counter</A></H5></TD>
    <TD><A href=3D"http://informationr.net/ir/">
      <H4>Home</H4></A></TD></TR></TBODY></TABLE>
<HR color=3D#ff00ff SIZE=3D3>
</BODY></HTML>

------=_NextPart_000_0000_01C4B2D9.F7DD9D80
Content-Type: image/gif
Content-Transfer-Encoding: base64
Content-Location: http://informationr.net/ir/figs/contents.gif

R0lGODlheAAUAPcAAAAAAAAAMwAAZgAAmQAAzAAA/zMAADMAMzMAZjMAmTMAzDMA/2YAAGYAM2YA
ZmYAmWYAzGYA/5kAAJkAM5kAZpkAmZkAzJkA/8wAAMwAM8wAZswAmcwAzMwA//8AAP8AM/8AZv8A
mf8AzP8A/wAzAAAzMwAzZgAzmQAzzAAz/zMzADMzMzMzZjMzmTMzzDMz/2YzAGYzM2YzZmYzmWYz
zGYz/5kzAJkzM5kzZpkzmZkzzJkz/8wzAMwzM8wzZswzmcwzzMwz//8zAP8zM/8zZv8zmf8zzP8z
/wBmAABmMwBmZgBmmQBmzABm/zNmADNmMzNmZjNmmTNmzDNm/2ZmAGZmM2ZmZmZmmWZmzGZm/5lm
AJlmM5lmZplmmZlmzJlm/8xmAMxmM8xmZsxmmcxmzMxm//9mAP9mM/9mZv9mmf9mzP9m/wCZAACZ
MwCZZgCZmQCZzACZ/zOZADOZMzOZZjOZmTOZzDOZ/2aZAGaZM2aZZmaZmWaZzGaZ/5mZAJmZM5mZ
ZpmZmZmZzJmZ/8yZAMyZM8yZZsyZmcyZzMyZ//+ZAP+ZM/+ZZv+Zmf+ZzP+Z/wDMAADMMwDMZgDM
mQDMzADM/zPMADPMMzPMZjPMmTPMzDPM/2bMAGbMM2bMZmbMmWbMzGbM/5nMAJnMM5nMZpnMmZnM
zJnM/8zMAMzMM8zMZszMmczMzMzM///MAP/MM//MZv/Mmf/MzP/M/wD/AAD/MwD/ZgD/mQD/zAD/
/zP/ADP/MzP/ZjP/mTP/zDP//2b/AGb/M2b/Zmb/mWb/zGb//5n/AJn/M5n/Zpn/mZn/zJn//8z/
AMz/M8z/Zsz/mcz/zMz/////AP//M///Zv//mf//zP///wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACwAAAAAeAAUAAAI/gB//YpD
sKDBgwgTKlzIsKHDhxAfCoxIsaLFixgjotjIsaPHjyBDihxJsqTJkyYJolzJsqXLlynjwJxJs6ZN
jipvWtu586bPnx5z0uTZ0drMnkONAg0qsyZSnUqPRl26USjMpyF5Tu2pdaPWrUQ5cg37VWzYmlZf
Yv2I9OnZtlNRwDUbda5ZoGldri0K1mtfuX/9Cr4LmC/eplJFYrU7ePFfx4P9xqWZt+Vewo0fg/2q
FHJhvpddVmYZ+nNmw6ZToz7NtvTK0aQnR/7s2XNr1KFdo4S98qxcwm41e7xceyvmmbxj+wZs27Rv
zrOD110OMznV69hJWs/OvTuK7d7DAy8NCAA7

------=_NextPart_000_0000_01C4B2D9.F7DD9D80
Content-Type: image/gif
Content-Transfer-Encoding: base64
Content-Location: http://informationr.net/ir/figs/iauthori.gif

R0lGODlheAAUAPcAAAAAAAAAMwAAZgAAmQAAzAAA/zMAADMAMzMAZjMAmTMAzDMA/2YAAGYAM2YA
ZmYAmWYAzGYA/5kAAJkAM5kAZpkAmZkAzJkA/8wAAMwAM8wAZswAmcwAzMwA//8AAP8AM/8AZv8A
mf8AzP8A/wAzAAAzMwAzZgAzmQAzzAAz/zMzADMzMzMzZjMzmTMzzDMz/2YzAGYzM2YzZmYzmWYz
zGYz/5kzAJkzM5kzZpkzmZkzzJkz/8wzAMwzM8wzZswzmcwzzMwz//8zAP8zM/8zZv8zmf8zzP8z
/wBmAABmMwBmZgBmmQBmzABm/zNmADNmMzNmZjNmmTNmzDNm/2ZmAGZmM2ZmZmZmmWZmzGZm/5lm
AJlmM5lmZplmmZlmzJlm/8xmAMxmM8xmZsxmmcxmzMxm//9mAP9mM/9mZv9mmf9mzP9m/wCZAACZ
MwCZZgCZmQCZzACZ/zOZADOZMzOZZjOZmTOZzDOZ/2aZAGaZM2aZZmaZmWaZzGaZ/5mZAJmZM5mZ
ZpmZmZmZzJmZ/8yZAMyZM8yZZsyZmcyZzMyZ//+ZAP+ZM/+ZZv+Zmf+ZzP+Z/wDMAADMMwDMZgDM
mQDMzADM/zPMADPMMzPMZjPMmTPMzDPM/2bMAGbMM2bMZmbMmWbMzGbM/5nMAJnMM5nMZpnMmZnM
zJnM/8zMAMzMM8zMZszMmczMzMzM///MAP/MM//MZv/Mmf/MzP/M/wD/AAD/MwD/ZgD/mQD/zAD/
/zP/ADP/MzP/ZjP/mTP/zDP//2b/AGb/M2b/Zmb/mWb/zGb//5n/AJn/M5n/Zpn/mZn/zJn//8z/
AMz/M8z/Zsz/mcz/zMz/////AP//M///Zv//mf//zP///wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACwAAAAAeAAUAAAI/gB//YpD
sKDBgwgTKlzIsKHDhxAfCoxIsaLFixgjotjIsaPHjyBDihxJsqTJkyYJolzJsqXLlynjwERpzdrM
mxtr4gSp0qROlz93ogh6kqhQFD1LGhVJdOlMpyOh3kwadahNpVdzZt0pNWRXmFSZWu0YVGfNs1qt
blW71ezasWPRknU7t2zTkmG9wk3Lt+lPu3zVer0qF7DgvlnLxlTK0TBcv40Jt5XMlDLiuYEhvw2Z
92Piz5ED7/07WbRH0qH3Pj4rV+tmzjKjsi4MWrVhyKpPW16NObfu1x87k9Wdefdo46gHFxedHKTN
rxyFNya+Gnlt3qZ7OyZ9fXh2nrH1KnqW/Pzzbr+4nZ+fXL7u37p4wx+dT78+Uvn28+tvKX2///8d
9QfggPoFBAA7

------=_NextPart_000_0000_01C4B2D9.F7DD9D80
Content-Type: image/gif
Content-Transfer-Encoding: base64
Content-Location: http://informationr.net/ir/figs/isubji.gif

R0lGODlheAAUAPcAAAAAAAAAMwAAZgAAmQAAzAAA/zMAADMAMzMAZjMAmTMAzDMA/2YAAGYAM2YA
ZmYAmWYAzGYA/5kAAJkAM5kAZpkAmZkAzJkA/8wAAMwAM8wAZswAmcwAzMwA//8AAP8AM/8AZv8A
mf8AzP8A/wAzAAAzMwAzZgAzmQAzzAAz/zMzADMzMzMzZjMzmTMzzDMz/2YzAGYzM2YzZmYzmWYz
zGYz/5kzAJkzM5kzZpkzmZkzzJkz/8wzAMwzM8wzZswzmcwzzMwz//8zAP8zM/8zZv8zmf8zzP8z
/wBmAABmMwBmZgBmmQBmzABm/zNmADNmMzNmZjNmmTNmzDNm/2ZmAGZmM2ZmZmZmmWZmzGZm/5lm
AJlmM5lmZplmmZlmzJlm/8xmAMxmM8xmZsxmmcxmzMxm//9mAP9mM/9mZv9mmf9mzP9m/wCZAACZ
MwCZZgCZmQCZzACZ/zOZADOZMzOZZjOZmTOZzDOZ/2aZAGaZM2aZZmaZmWaZzGaZ/5mZAJmZM5mZ
ZpmZmZmZzJmZ/8yZAMyZM8yZZsyZmcyZzMyZ//+ZAP+ZM/+ZZv+Zmf+ZzP+Z/wDMAADMMwDMZgDM
mQDMzADM/zPMADPMMzPMZjPMmTPMzDPM/2bMAGbMM2bMZmbMmWbMzGbM/5nMAJnMM5nMZpnMmZnM
zJnM/8zMAMzMM8zMZszMmczMzMzM///MAP/MM//MZv/Mmf/MzP/M/wD/AAD/MwD/ZgD/mQD/zAD/
/zP/ADP/MzP/ZjP/mTP/zDP//2b/AGb/M2b/Zmb/mWb/zGb//5n/AJn/M5n/Zpn/mZn/zJn//8z/
AMz/M8z/Zsz/mcz/zMz/////AP//M///Zv//mf//zP///wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACwAAAAAeAAUAAAI/gB//YpD
sKDBgwgTKlzIsKHDhxAfCoxIsaLFixgjotjIsaPHjyBDihxJsqTJkyYJolzJsqXLlynjhLRG05pJ
miJxwhypc2XPnShUfvxZkihQjkY7JiW5tKVQpTZ9Rj3qsSkKqzOnAn2KVGtXqBtx1qyqtabXq2bT
Zg1r0yzZtmXHdi37kSvbsz91ys07VW9csjyjykX7FW1fwX/DgrR7NzHbx4UJS4Zs1KpfsJMl8wV7
diPjuZE1O75cuHLn0ok3Z+a79DPo1Ygxi04bG3DO2pRT0/ba1DVqzKpFr7U9XLXx041D+s4NfDRu
4qGHPifNPGfmjowPR95LffJSopanOWs3HB1p+aAypYMfe3kw5Mas/Z7uzhq226tQzy6nep2//537
UYXVfwSWFCBM9xWooFPpLejggywFBAA7

------=_NextPart_000_0000_01C4B2D9.F7DD9D80
Content-Type: image/gif
Content-Transfer-Encoding: base64
Content-Location: http://informationr.net/ir/figs/isearch.gif

R0lGODlheAAUAPcAAAAAAAAAMwAAZgAAmQAAzAAA/zMAADMAMzMAZjMAmTMAzDMA/2YAAGYAM2YA
ZmYAmWYAzGYA/5kAAJkAM5kAZpkAmZkAzJkA/8wAAMwAM8wAZswAmcwAzMwA//8AAP8AM/8AZv8A
mf8AzP8A/wAzAAAzMwAzZgAzmQAzzAAz/zMzADMzMzMzZjMzmTMzzDMz/2YzAGYzM2YzZmYzmWYz
zGYz/5kzAJkzM5kzZpkzmZkzzJkz/8wzAMwzM8wzZswzmcwzzMwz//8zAP8zM/8zZv8zmf8zzP8z
/wBmAABmMwBmZgBmmQBmzABm/zNmADNmMzNmZjNmmTNmzDNm/2ZmAGZmM2ZmZmZmmWZmzGZm/5lm
AJlmM5lmZplmmZlmzJlm/8xmAMxmM8xmZsxmmcxmzMxm//9mAP9mM/9mZv9mmf9mzP9m/wCZAACZ
MwCZZgCZmQCZzACZ/zOZADOZMzOZZjOZmTOZzDOZ/2aZAGaZM2aZZmaZmWaZzGaZ/5mZAJmZM5mZ
ZpmZmZmZzJmZ/8yZAMyZM8yZZsyZmcyZzMyZ//+ZAP+ZM/+ZZv+Zmf+ZzP+Z/wDMAADMMwDMZgDM
mQDMzADM/zPMADPMMzPMZjPMmTPMzDPM/2bMAGbMM2bMZmbMmWbMzGbM/5nMAJnMM5nMZpnMmZnM
zJnM/8zMAMzMM8zMZszMmczMzMzM///MAP/MM//MZv/Mmf/MzP/M/wD/AAD/MwD/ZgD/mQD/zAD/
/zP/ADP/MzP/ZjP/mTP/zDP//2b/AGb/M2b/Zmb/mWb/zGb//5n/AJn/M5n/Zpn/mZn/zJn//8z/
AMz/M8z/Zsz/mcz/zMz/////AP//M///Zv//mf//zP///wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACwAAAAAeAAUAAAI6AB//YpD
sKDBgwgTKlzIsKHDhxAfCoxIsaLFixgjotjIsaPHjyBDihxJsqTJkyYJolzJsqXLlynjwCxpraa1
mStr4mypcidInT5RAg0ak2jHoUZJIk0asmfSpUdtfrR5k6POoVSrXoXKFIVTo1Knaq26kSxWpFDD
ck36FWzYsmRRcD0b9afZuF29yswrl65VvHD/1hUrmO/GtkzVZkVLtXBgwo8NI048ViRQv5EH9zV8
eK/hy4AjY94MmbRkz2Adm9Y8erXo0GxRu02b9e/WzHAVq847mbPv3xx7Ax/OVzjx40aNI1+OMyAA
Ow==

------=_NextPart_000_0000_01C4B2D9.F7DD9D80
Content-Type: image/gif
Content-Transfer-Encoding: base64
Content-Location: http://informationr.net/ir/figs/ihome.gif

R0lGODlheAAUAPcAAAAAAAAAMwAAZgAAmQAAzAAA/zMAADMAMzMAZjMAmTMAzDMA/2YAAGYAM2YA
ZmYAmWYAzGYA/5kAAJkAM5kAZpkAmZkAzJkA/8wAAMwAM8wAZswAmcwAzMwA//8AAP8AM/8AZv8A
mf8AzP8A/wAzAAAzMwAzZgAzmQAzzAAz/zMzADMzMzMzZjMzmTMzzDMz/2YzAGYzM2YzZmYzmWYz
zGYz/5kzAJkzM5kzZpkzmZkzzJkz/8wzAMwzM8wzZswzmcwzzMwz//8zAP8zM/8zZv8zmf8zzP8z
/wBmAABmMwBmZgBmmQBmzABm/zNmADNmMzNmZjNmmTNmzDNm/2ZmAGZmM2ZmZmZmmWZmzGZm/5lm
AJlmM5lmZplmmZlmzJlm/8xmAMxmM8xmZsxmmcxmzMxm//9mAP9mM/9mZv9mmf9mzP9m/wCZAACZ
MwCZZgCZmQCZzACZ/zOZADOZMzOZZjOZmTOZzDOZ/2aZAGaZM2aZZmaZmWaZzGaZ/5mZAJmZM5mZ
ZpmZmZmZzJmZ/8yZAMyZM8yZZsyZmcyZzMyZ//+ZAP+ZM/+ZZv+Zmf+ZzP+Z/wDMAADMMwDMZgDM
mQDMzADM/zPMADPMMzPMZjPMmTPMzDPM/2bMAGbMM2bMZmbMmWbMzGbM/5nMAJnMM5nMZpnMmZnM
zJnM/8zMAMzMM8zMZszMmczMzMzM///MAP/MM//MZv/Mmf/MzP/M/wD/AAD/MwD/ZgD/mQD/zAD/
/zP/ADP/MzP/ZjP/mTP/zDP//2b/AGb/M2b/Zmb/mWb/zGb//5n/AJn/M5n/Zpn/mZn/zJn//8z/
AMz/M8z/Zsz/mcz/zMz/////AP//M///Zv//mf//zP///wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACwAAAAAeAAUAAAI0wB//YpD
sKDBgwgTKlzIsKHDhxAfCoxIsaLFixgjotjIsaPHjyBDihxJsqTJkyYJolzJsqXLlynjwJxJs6ZN
jipvlrRmzSNPnUBR5gwa8mdHo0STghyq9GhPp02j4pQplSPSjVd5Zu2pFetVFFqfVmUq9StSo2fR
qrX69KtSslHDyvUKFazYtHTzSoXb1GxbsXb14rU7dyzVqoHrbhV89y9ij3yV+tWbeHBlwI8jJ52c
17JnzIYfJ2brdDHj0m6TahbNmujq1rBtvo5N++Xs2rhRBgQAOw==

------=_NextPart_000_0000_01C4B2D9.F7DD9D80
Content-Type: image/gif
Content-Transfer-Encoding: base64
Content-Location: http://informationr.net/ir/5-2/p69fig1.gif

R0lGODdhLQAeAPcAAAAAAP//zv//////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////ywAAAAALQAeAAAIlgADCBxI
sKDBgwgTKlzIsKHDhxAjSpxIMSKAixgzaswYAEBFghgXXvwX8uPAkgkBkCRpEuRFkR1bFkR50ONL
mSdv4nxIcyfDnj4VAuVp0yZRjxRfIoW4VKJOnT8/No3ZcKrTmVVNTrVaE6bBpzO5gqwoNqtIsWW9
nkWYNmVVpRypMt1It6RGgW2/1t3LdWjQv4ADCw4cEAA7

------=_NextPart_000_0000_01C4B2D9.F7DD9D80
Content-Type: application/octet-stream
Content-Transfer-Encoding: base64
Content-Location: http://counter.digits.com/wc/-d/-z/6/-b/FF0033/paper69

R0lGODlhWgAUAIQAAL6+vv8AMwL///+Qpv94k//4+f/AzP9AZv/g5v+In/9gf/+ouf+Amf/w8//Q
2f+4xv+YrP+gs//o7P/Y3//I0/+wv0BAQDtP/zYgIzJiIzVvIzJlIyNzI0dwI2VjI2FlIyH5BAEA
AAAALAAAAABaABQAQAX+YCCOZGmeaKqubCs2RTIaRWXCskjbZlRQgUKBsUr4JC1f4TRoOBrQxYrY
orqu2OyJIBQ2UtzudwWLpbrdVNl7RhcMJlpkNBnGC3NR3TqiuIVSN2MiEkgnDggIWQUTWo6PkFo+
iiJCJ5MjligKeEI5JpoBCaEkQgSZSyacgW2fNHAlnjNvJn6opyxPDioRUEAouAFHJ0IPDHWNxAXG
yCykJhPR0ijS1ckmCGjXkdzd3t/g4SwMA1XlLQQUFMEnD0K7KhDq5ygD9vfsqBQD2ZSgP/wK+Dux
oMunEu4eiMgGr4QDgSIeDhxxQOCABBhP+GGV7aCIjSM6ojDQwI8ZFj5wGpKgAKtSqhKj+ChzRSvW
yQCvejByKfPfzhVh8iCEYoBkAwg3bu5ISpPHygILonbpeQJKCgOFhDrbJmLUtWddfwZ52YoagkAV
EKgcESEaCwTXBhRYi0qIIRRo7qoAW6LtBK0lrE0zW1Kc4cOIExsOAQA7

------=_NextPart_000_0000_01C4B2D9.F7DD9D80
Content-Type: text/css;
	charset="iso-8859-1"
Content-Transfer-Encoding: quoted-printable
Content-Location: http://informationr.net/ir/IRstyle.css

/* Page gutters: 15% on the left, 10% on the right. */
body {
	margin-right: 10%;
	margin-left: 15%;
}
/* Generic block text: small, bold, centred Arial. The generic sans-serif
   family is listed last so the text stays sans-serif where Arial is not
   installed. */
div {
	font-family: Arial, sans-serif;
	font-size: small;
	font-weight: bold;
	text-align: center;
}
/* Top-level heading: centred, extra-large, bold maroon serif. */
h1 {
	color: maroon;
	font-family: serif;
	font-size: x-large;
	font-style: normal;
	font-weight: bold;
	text-align: center;
}
/* Block quotation: justified italic serif on a pale yellow panel,
   inset 50px on both sides. */
blockquote {
	margin-right: 50px;
	margin-left: 50px;
	color: black;
	background-color: #ffffce;
	font-family: serif;
	font-size: medium;
	font-style: italic;
	font-weight: 500;
	text-align: justify;
}
/* Section heading: bold navy sans-serif, left-aligned and pulled 8% to
   the left by a negative margin. */
h2 {
	margin-left: -8%;
	color: navy;
	font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;
	font-size: medium;
	font-style: normal;
	font-weight: bold;
	text-align: left;
}
/* Sub-section heading: bold teal sans-serif, left-aligned and pulled 5%
   to the left by a negative margin. */
h3 {
	margin-left: -5%;
	color: teal;
	font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;
	font-size: medium;
	font-style: normal;
	font-weight: bold;
	text-align: left;
}
/* Fourth-level heading: centred, large, bold italic maroon Times. */
h4 {
	color: maroon;
	font-family: Times, Times Roman, serif;
	font-size: large;
	font-style: italic;
	font-weight: bold;
	text-align: center;
}
/* Body paragraphs: large, justified, upright black serif text. */
p {
	color: black;
	font-family: serif;
	font-size: large;
	font-style: normal;
	font-weight: normal;
	text-align: justify;
}
/* Lists and list items share one text style: medium, upright black
   serif at normal weight. */
li,
ol,
ul {
	color: black;
	font-family: serif;
	font-size: medium;
	font-style: normal;
	font-weight: normal;
}

------=_NextPart_000_0000_01C4B2D9.F7DD9D80--
