@inproceedings{conf/nlprs/Aizawa01,
  author = {Aizawa, Akiko N.},
  title = {Linguistic Techniques to Improve the Performance of
    Automatic Text Categorization},
  booktitle = {NLPRS},
  year = {2001},
  pages = {307--314},
  url = {http://www.afnlp.org/nlprs2001/pdf/0079-01.pdf},
  crossref = {conf/nlprs/2001},
  bibdate = {2004-03-05},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/nlprs/nlprs2001.html#Aizawa01},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\0079-01.pdf},
}
@inproceedings{conf/icdm/AntonieZ02,
  author = {Antonie, Maria-Luiza and Za{\"i}ane, Osmar R.},
  title = {Text Document Categorization by Term Association},
  booktitle = {ICDM},
  year = {2002},
  pages = {19--26},
  url = {http://computer.org/proceedings/icdm/1754/17540019abs.htm},
  crossref = {conf/icdm/2002},
  abstract = {A good text classifier is a classifier that efficiently categorizes
    large sets of text documents in a reasonable time frame and with
    an acceptable accuracy, and that provides classification rules that
    are human readable for possible fine-tuning. If the training of
    the classifier is also quick, this could become in some application
    domains a good asset for the classifier. Many techniques and algorithms
    for automatic text categorization have been devised. According to
    published literature, some are more accurate than others, and some
    provide more interpretable classification models than others. However,
    none can combine all the beneficial properties enumerated above.
    In this paper, we present a novel approach for automatic text categorization
    that borrows from market basket analysis techniques using association
    rule mining in the data-mining field. We focus on two major problems:
    (1) finding the best term association rules
    in a textual database by generating and pruning; and (2) using the
    rules to build a text classifier. Our text categorization method
    proves to be efficient and effective, and experiments
    on well-known collections show that the classifier performs well.
    In addition, training as well as classification are both fast and
    the generated rules are human readable.},
  bibdate = {2003-03-04},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/icdm/icdm2002.html#AntonieZ02},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\text-document-categorization-by.pdf},
}
@inproceedings{conf/pakm/BenjaminsFG98,
  author = {Benjamins, V. Richard and Fensel, Dieter and G{\'o}mez-P{\'e}rez, Asunci{\'o}n},
  title = {Knowledge Management through Ontologies},
  booktitle = {PAKM},
  year = {1998},
  url = {http://SunSITE.Informatik.RWTH-Aachen.DE/Publications/CEUR-WS/Vol-13/paper5.ps},
  crossref = {conf/pakm/1998},
  bibdate = {2003-04-02},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/pakm/pakm1998.html#BenjaminsFG98},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\benjamins98knowledge.pdf},
}
@inproceedings{conf/ercimdl/CaraccioloHR04,
  author = {Caracciolo, Caterina and van Hage, Willem Robert and de Rijke, Maarten},
  title = {Towards Topic Driven Access to Full Text Documents},
  booktitle = {ECDL},
  year = {2004},
  pages = {495--500},
  url = {http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=3232&spage=495},
  crossref = {conf/ercimdl/2004},
  abstract = {We address the issue of providing topic driven access to full text
    documents. The methodology we propose is a combination of topic
    segmentation and information retrieval techniques. By segmenting
    the text into topic driven segments, we obtain small and coherent
    documents that can be used in two ways: as a basis for automatically
    generating hypertext links, and as a visualization aid for the reader
    who is presented with a small set of focused and restricted text
    snippets. In the presence of a concept hierarchy, or ontology, information
    retrieval techniques can be used to connect the segments obtained
    to concepts in the ontology. In this paper we concentrate on the
    text segmentation phase: we describe our approach to segmentation,
    discuss issues related to evaluation, and report on preliminary
    results.},
  bibdate = {2005-01-21},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/ercimdl/ecdl2004.html#CaraccioloHR04},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\caracciolo04towards.pdf},
}
@inproceedings{conf/sac/DeboleS03,
  author = {Debole, Franca and Sebastiani, Fabrizio},
  title = {Supervised Term Weighting for Automated Text Categorization},
  booktitle = {SAC},
  year = {2003},
  pages = {784--788},
  crossref = {conf/sac/2003},
  abstract = {The construction of a text classifier usually involves (i) a phase
    of term selection, in which the most relevant terms for the classification
    task are identified, (ii) a phase of term weighting, in which document
    weights for the selected terms are computed, and (iii) a phase of
    classifier learning, in which a classifier is generated from the
    weighted representations of the training documents. This process
    involves an activity of supervised learning, in which information
    on the membership of training documents in categories is used. Traditionally,
    supervised learning enters only phases (i) and (iii). In this paper
    we propose instead that learning from the training data should also
    affect phase (ii), i.e. that information on the membership of training
    documents to categories be used to determine term weights. We call
    this idea supervised term weighting (STW). As an example of STW,
    we propose a number of ``supervised variants'' of tfidf weighting,
    obtained by replacing the idf function with the function that has
    been used in phase (i) for term selection. The use of STW allows
    the terms that are distributed most differently in the positive
    and negative examples of the categories of interest to be weighted
    highest. We present experimental results obtained on the standard
    Reuters-21578 benchmark with three classifier learning methods (Rocchio,
    k-NN, and support vector machines), three term selection functions
    (information gain, chi-square, and gain ratio), and both local and
    global term selection and weighting.},
  bibdate = {2003-06-18},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/sac/sac2003.html#DeboleS03},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\debole02supervised.pdf},
}
@inproceedings{conf/ijcai/GabrilovichM05,
  author = {Gabrilovich, Evgeniy and Markovitch, Shaul},
  title = {Feature Generation for Text Categorization Using World Knowledge},
  booktitle = {IJCAI},
  year = {2005},
  pages = {1048--1053},
  url = {http://www.ijcai.org/papers/0971.pdf},
  crossref = {conf/ijcai/2005},
  abstract = {We enhance machine learning algorithms for text categorization with
    generated features based on domain-specific and common-sense knowledge.
    This knowledge is represented using publicly available ontologies
    that contain hundreds of thousands of concepts, such as the Open
    Directory; these ontologies are further enriched by several orders
    of magnitude through controlled Web crawling. Prior to text categorization,
    a feature generator analyzes the documents and maps them onto appropriate
    ontology concepts, which in turn induce a set of generated features
    that augment the standard bag of words. Feature generation is accomplished
    through contextual analysis of document text, implicitly performing
    word sense disambiguation. Coupled with the ability to generalize
    concepts using the ontology, this approach addresses the two main
    problems of natural language processing---synonymy and polysemy. Categorizing
    documents with the aid of knowledge-based features leverages information
    that cannot be deduced from the documents alone. Experimental results
    confirm improved performance, breaking through the plateau previously
    reached in the field.},
  bibdate = {2005-12-09},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/ijcai/ijcai2005.html#GabrilovichM05},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\url10.pdf},
}
@inproceedings{conf/cikm/HanK00,
  author = {Han, Eui-Hong and Karypis, George},
  title = {Fast Supervised Dimensionality Reduction Algorithm with Applications
    to Document Categorization \& Retrieval},
  booktitle = {CIKM},
  year = {2000},
  pages = {12--19},
  doi = {10.1145/354756.354772},
  url = {http://doi.acm.org/10.1145/354756.354772},
  crossref = {conf/cikm/2000},
  bibdate = {2002-12-05},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/cikm/cikm2000.html#HanK00},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\karypis00fast.pdf},
}
@inproceedings{conf/pakdd/HanKK01,
  author = {Han, Eui-Hong and Karypis, George and Kumar, Vipin},
  title = {Text Categorization Using Weight Adjusted k-Nearest Neighbor Classification},
  booktitle = {PAKDD},
  year = {2001},
  pages = {53--65},
  url = {http://link.springer.de/link/service/series/0558/bibs/2035/20350053.htm},
  crossref = {conf/pakdd/2001},
  bibdate = {2002-01-03},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/pakdd/pakdd2001.html#HanKK01},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\han99text.pdf},
}
% NOTE(review): the crossref target below contains a space; classic BibTeX
% entry keys cannot contain whitespace, so confirm the parent proceedings key.
% This entry has no booktitle or year of its own, so it presumably relies on
% the parent entry to supply them -- TODO confirm.
@inproceedings{SIGIR'99*50,
  author = {Hofmann, Thomas},
  title = {Probabilistic Latent Semantic Indexing},
  pages = {50--57},
  crossref = {SIGIR '99},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\hofmann99probabilistic.pdf},
}
@inproceedings{conf/pkdd/HothoSS03,
  author = {Hotho, Andreas and Staab, Steffen and Stumme, Gerd},
  title = {Explaining Text Clustering Results Using Semantic Structures},
  booktitle = {PKDD},
  year = {2003},
  pages = {217--228},
  url = {http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2838&spage=217},
  crossref = {conf/pkdd/2003},
  abstract = {Common text clustering techniques offer rather poor capabilities for
    explaining to their users why a particular result has been achieved.
    They have the disadvantage that they do not relate semantically
    nearby terms and that they cannot explain how resulting clusters
    are related to each other. In this paper, we discuss a way of integrating
    a large thesaurus and the computation of lattices of resulting clusters
    into common text clustering in order to overcome these two problems.
    As its major result, our approach achieves an explanation using
    an appropriate level of granularity at the concept level as well
    as an appropriate size and complexity of the explaining lattice
    of resulting clusters.},
  bibdate = {2003-11-24},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/pkdd/pkdd2003.html#HothoSS03},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\hotho03explaining.pdf},
}
@inproceedings{conf/sigir/LavelliMS02,
  author = {Lavelli, Alberto and Magnini, Bernardo and Sebastiani, Fabrizio},
  title = {Building thematic lexical resources by term categorization},
  booktitle = {SIGIR},
  year = {2002},
  pages = {415--416},
  doi = {10.1145/564376.564471},
  url = {http://doi.acm.org/10.1145/564376.564471},
  crossref = {conf/sigir/2002},
  abstract = {We discuss work in progress in the semi-automatic generation of thematic
    lexicons by means of term categorization, a novel task employing
    techniques from information retrieval (IR) and machine learning
    (ML). Specifically, we view the generation of such lexicons as an
    iterative process of learning previously unknown associations between
    terms and themes (i.e. disciplines, or fields of activity). The
    process is iterative, in that it generates, for each $c_i$ in a set
    of themes, a sequence of lexicons, bootstrapping from an initial
    lexicon $L_i^0$ and a set of text corpora given as input. The method
    is inspired by text categorization, the discipline concerned with
    labelling natural language texts with labels from a predefined set
    of themes, or categories. However, while text categorization deals
    with documents represented as vectors in a space of terms, we formulate
    the task of term categorization as one in which terms are (dually)
    represented as vectors in a space of documents, and in which terms
    (instead of documents) are labelled with themes. As a learning device,
    we adopt boosting, since (a) it has demonstrated state-of-the-art
    effectiveness in a variety of text categorization applications,
    and (b) it naturally allows for a form of ``data cleaning'', thereby
    making the process of generating a thematic lexicon an iteration
    of generate-and-test steps.},
  bibdate = {2002-12-06},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/sigir/sigir2002.html#LavelliMS02},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\lavelli02building.pdf},
}
@inproceedings{conf/icdm/LiuDLLY03,
  author = {Liu, Bing and Dai, Yang and Li, Xiaoli and Lee, Wee Sun and Yu, Philip S.},
  title = {Building Text Classifiers Using Positive and Unlabeled Examples},
  booktitle = {ICDM},
  year = {2003},
  pages = {179--188},
  url = {http://csdl.computer.org/comp/proceedings/icdm/2003/1978/00/19780179abs.htm},
  crossref = {conf/icdm/2003},
  abstract = {This paper studies the problem of building text classifiers using
    positive and unlabeled examples. The key feature of this problem
    is that there is no negative example for learning. Recently, a few
    techniques for solving this problem were proposed in the literature.
    These techniques are based on the same idea, which builds a classifier
    in two steps. Each existing technique uses a different method for
    each step. In this paper, we first introduce some new methods for
    the two steps, and perform a comprehensive evaluation of all possible
    combinations of methods of the two steps. We then propose a more
    principled approach to solving the problem based on a biased formulation
    of SVM, and show experimentally that it is more accurate than the
    existing techniques.},
  bibdate = {2004-01-28},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/icdm/icdm2003.html#LiuDLLY03},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\liu03building.pdf},
}
@inproceedings{conf/icdm/LiuCZMW04,
  author = {Liu, Tao and Chen, Zheng and Zhang, Benyu and Ma, Wei-Ying and Wu, Gongyi},
  title = {Improving Text Classification using Local Latent Semantic Indexing},
  booktitle = {ICDM},
  year = {2004},
  pages = {162--169},
  url = {http://csdl.computer.org/comp/proceedings/icdm/2004/2142/00/21420162abs.htm},
  crossref = {conf/icdm/2004},
  abstract = {Latent Semantic Indexing (LSI) has been shown to be extremely useful
    in information retrieval, but it is not an optimal representation
    for text classification. It always drops the text classification
    performance when being applied to the whole training set (global
    LSI) because this completely unsupervised method ignores class discrimination
    while only concentrating on representation. Some local LSI methods
    have been proposed to improve the classification by utilizing class
    discrimination information. However, their performance improvements
    over original term vectors are still very limited. In this paper,
    we propose a new local LSI method called ``Local Relevancy Weighted
    LSI'' to improve text classification by performing a separate Single
    Value Decomposition (SVD) on the transformed local region of each
    class. Experimental results show that our method is much better
    than global LSI and traditional local LSI methods on classification
    within a much smaller LSI dimension.},
  bibdate = {2004-12-13},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/icdm/icdm2004.html#LiuCZMW04},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\ICDM2004-LLSI.pdf},
}
@inproceedings{conf/ecir/MoschittiB04,
  author = {Moschitti, Alessandro and Basili, Roberto},
  title = {Complex Linguistic Features for Text Classification: {A} Comprehensive
    Study},
  booktitle = {ECIR},
  year = {2004},
  pages = {181--196},
  url = {http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2997&spage=181},
  crossref = {conf/ecir/2004},
  abstract = {Previous researches on advanced representations for document retrieval
    have shown that statistical state-of-the-art models are not improved
    by a variety of different linguistic representations. Phrases, word
    senses and syntactic relations derived by Natural Language Processing
    (NLP) techniques were observed ineffective to increase retrieval
    accuracy. For Text Categorization (TC) are available fewer and less
    definitive studies on the use of advanced document representations
    as it is a relatively new research area (compared to document retrieval).
    In this paper, advanced document representations have been investigated.
    Extensive experimentation on representative classifiers, Rocchio
    and SVM, as well as a careful analysis of the literature have been
    carried out to study how some NLP techniques used for indexing impact
    TC. Cross validation over 4 different corpora in two languages allowed
    us to gather an overwhelming evidence that complex nominals, proper
    nouns and word senses are not adequate to improve TC accuracy.},
  bibdate = {2004-03-02},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/ecir/ecir2004.html#MoschittiB04},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\url9.pdf},
}
@inproceedings{DBLP:conf/gecco/OdaW03,
  author = {Oda, Terri and White, Tony},
  title = {Developing an Immunity to Spam},
  booktitle = {GECCO},
  year = {2003},
  pages = {231--242},
  url = {http://link.springer.de/link/service/series/0558/bibs/2723/27230231.htm},
  ee = {http://link.springer.de/link/service/series/0558/bibs/2723/27230231.htm},
  crossref = {DBLP:conf/gecco/2003-1},
  abstract = {Immune systems protect animals from pathogens, so why not apply a
    similar model to protect computers? Several researchers have investigated
    the use of an artificial immune system to protect computers from
    viruses and others have looked at using such a system to detect
    unauthorized computer intrusions. This paper describes the use of
    an artificial immune system for another kind of protection: protection
    from unsolicited email, or spam.},
  comment = {Tiene unos conceptos muy interesantes para la implementaci{\'o}n del sistema
    inmune artificial},
  bibsource = {DBLP, http://dblp.uni-trier.de},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\i__GECCO 2003_papers_2723_27230231.pdf},
}
@inproceedings{conf/icml/ScottM99,
  author = {Scott, Sam and Matwin, Stan},
  title = {Feature Engineering for Text Classification},
  booktitle = {ICML},
  year = {1999},
  pages = {379--388},
  crossref = {conf/icml/1999},
  abstract = {Most research in text classification to date has used a ``bag of words''
    representation in which each feature corresponds to a single word.
    This paper examines some alternative ways to represent text based
    on syntactic and semantic relationships between words (phrases,
    synonyms and hypernyms). We describe the new representations and
    try to justify our hypothesis that they could improve the performance
    of a rule-based learner. The representations are evaluated using
    the RIPPER learning algorithm on the Reuters-21578 and DigiTrad
    test corpora. On their own the new representations are not found
    to produce significant performance improvements. We also try combining
    classifiers based on different representations using a majority
    voting technique, and this improves performance on both test collections.
    In our opinion, more sophisticated Natural Language Processing techniques
    need to be developed before better text representations can be produced
    for classification.},
  bibdate = {2002-12-03},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/icml/icml1999.html#ScottM99},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\scott99feature.pdf},
}
@inproceedings{conf/iccs/Stumme02,
  author = {Stumme, Gerd},
  title = {Formal Concept Analysis on Its Way from Mathematics to Computer Science},
  booktitle = {ICCS},
  year = {2002},
  pages = {2--19},
  url = {http://link.springer.de/link/service/series/0558/bibs/2393/23930002.htm},
  crossref = {conf/iccs/2002},
  bibdate = {2002-07-09},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/iccs/iccs2002.html#Stumme02},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\formal-concept-analysis-on.pdf},
}
@inproceedings{conf/icdm/SunL01,
  author = {Sun, Aixin and Lim, Ee-Peng},
  title = {Hierarchical Text Classification and Evaluation},
  booktitle = {ICDM},
  year = {2001},
  pages = {521--528},
  crossref = {conf/icdm/2001},
  abstract = {Hierarchical Classification refers to assigning of one or more suitable
    categories from a hierarchical category space to a document. While
    previous work in hierarchical classification focused on virtual
    category trees where documents are assigned only to the leaf categories,
    we propose a topdown level-based classification method that can
    classify documents to both leaf and internal categories. As the
    standard performance measures assume independence between categories,
    they have not considered the documents incorrectly classified into
    categories that are similar or not far from the correct ones in
    the category tree. We therefore propose the Category-Similarity
    Measures and Distance-Based Measures to consider the degree of
    misclassification in measuring the classification performance.
    An experiment has been
    carried out to measure the performance of our proposed hierarchical
    classification method. The results showed that our method performs
    well for Reuters text collection when enough training documents
    are given and the new measures have indeed considered the contributions
    of misclassified documents.},
  bibdate = {2002-02-13},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/icdm/icdm2001.html#SunL01},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\sun01hierarchical.pdf},
}
@inproceedings{conf/adc/ZaianeA02,
  author = {Za{\"i}ane, Osmar R. and Antonie, Maria-Luiza},
  title = {Classifying Text Documents by Associating Terms With Text Categories},
  booktitle = {Australasian Database Conference},
  year = {2002},
  url = {http://crpit.com/confpapers/CRPITV5Zaiane.pdf},
  crossref = {conf/adc/2002},
  bibdate = {2004-10-08},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/adc/adc2002.html#ZaianeA02},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\classifying-text-documents-by.pdf},
}
@misc{oai:CiteSeerPSU:274056,
  author = {Aas, Kjersti and Eikvil, Line},
  title = {Text Categorisation: {A} Survey},
  year = {1999},
  url = {http://citeseer.ist.psu.edu/274056.html},
  url-fulltext = {http://www.nr.no/research/samba/tm_survey.ps},
  abstract = {In this report we give a survey of the state-of-the-art in text categorisation.
    To be able to measure progress in this field, it is important to
    use a standardised collection of documents for analysis and testing.
    One such data set is the Reuters-21578 collection of newswires for
    the year 1987, and our survey will focus on the work on text categorisation
    that have used this collection for testing.},
  bibsource = {OAI-PMH server at cs1.ist.psu.edu},
  citeseer-references = {oai:CiteSeerPSU:19079; oai:CiteSeerPSU:25286;
    oai:CiteSeerPSU:37681; oai:CiteSeerPSU:140776;
    oai:CiteSeerPSU:189631; oai:CiteSeerPSU:123646;
    oai:CiteSeerPSU:129727; oai:CiteSeerPSU:553162;
    oai:CiteSeerPSU:107422; oai:CiteSeerPSU:553162;
    oai:CiteSeerPSU:100508; oai:CiteSeerPSU:84047;
    oai:CiteSeerPSU:93679},
  language = {en},
  oai = {oai:CiteSeerPSU:274056},
  rights = {unrestricted},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\aas99text.pdf},
}
@inproceedings{conf/sigir/Ando00,
  author = {Ando, Rie Kubota},
  title = {Latent semantic-space: iterative scaling improves precision of inter-document
    similarity measurement},
  booktitle = {SIGIR},
  year = {2000},
  pages = {216--223},
  doi = {10.1145/345508.345579},
  url = {http://doi.acm.org/10.1145/345508.345579},
  abstract = {We present a novel algorithm that creates document vectors with reduced
    dimensionality. This work was motivated by an application characterizing
    relationships among documents in a collection. Our algorithm yielded
    inter-document similarities with an average precision up to 17.8\%
    higher than that of singular value decomposition (SVD) used for
    Latent Semantic Indexing. The best performance was achieved with
    dimensional reduction rates that were 43\% higher than SVD on average.
    Our algorithm creates basis vectors for a reduced space by iteratively
    ``scaling'' vectors and computing eigenvectors. Unlike SVD, it breaks
    the symmetry of documents and terms to capture information more
    evenly across documents. We also discuss correlation with a probabilistic
    model and evaluate a method for selecting the dimensionality using
    log-likelihood estimation.},
  bibdate = {2002-12-06},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/sigir/sigir2000.html#Ando00},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\ando00latent.pdf},
}
@book{books/wi/BaldiFS03,
  author = {Baldi, Pierre and Frasconi, Paolo and Smyth, Padhraic},
  title = {Modeling the {Internet} and the {Web}: Probabilistic Methods and Algorithms},
  publisher = {John Wiley},
  year = {2003},
  isbn = {0-470-84906-1},
  url = {http://ibook.ics.uci.edu/},
  bibdate = {2003-08-22},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\Modeling_the_Internet_and_the_Web_Probabilistic_Methods_and_Algorithms_(Wiley-2003).pdf},
}
@inproceedings{conf/hicss/BasuWS03,
  author = {Basu, A. and Watters, Carolyn R. and Shepherd, Michael A.},
  title = {Support Vector Machines for Text Categorization},
  booktitle = {HICSS},
  year = {2003},
  pages = {103},
  url = {http://computer.org/proceedings/hicss/1874/track4/187440103cabs.htm},
  abstract = {Text categorization is the process of sorting text documents into
    one or more predefined categories or classes of similar documents.
    Differences in the results of such categorization arise from the
    feature set chosen to base the association of a given document with
    a given category. Advocates of text categorization recognize that
    the sorting of text documents into categories of like documents
    reduces the overhead required for fast retrieval of such documents
    and provides smaller domains in which the users may explore similar
    documents. In this paper we are interested in examining whether
    automatic classification of news texts can be improved by a prefiltering
    the vocabulary to reduce the feature set used in the computations.
    First we compare artificial neural network and support vector machine
    algorithms for use as text classifiers of news items. Secondly,
    we identify a reduction in feature set that provides improved results.},
  bibdate = {2003-06-17},
  bibsource = {DBLP, http://dblp.uni-trier.de/db/conf/hicss/hicss2003-4.html#BasuWS03},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\support_vectors.pdf},
}
@article{betw-dwcvwtc-03,
  author = {Bekkerman, Ron and El-Yaniv, Ran and Tishby, Naftali and Winter, Yoad},
  title = {Distributional Word Clusters vs. Words for Text Categorization},
  journal = {Journal of Machine Learning Research},
  volume = {3},
  year = {2003},
  pages = {1183--1208},
  abstract = {We study an approach to text categorization that combines distributional
    clustering of words and a Support Vector Machine (SVM) classifier.
    This word-cluster representation is computed using the recently
    introduced Information Bottleneck method, which generates a compact
    and efficient representation of documents. When combined with the
    classification power of the SVM, this method yields high performance
    in text categorization. This novel combination of SVM with word-cluster
    representation is compared with SVM-based categorization using the
    simpler bag-of-words (BOW) representation. The comparison is performed
    over three known datasets. On one of these datasets (the 20 Newsgroups)
    the method based on word clusters significantly outperforms the
    word-based representation in terms of categorization accuracy or
    representation efficiency. On the two other sets (Reuters-21578
    and WebKB) the word-based representation slightly outperforms the
    word-cluster representation. We investigate the potential reasons
    for this behavior and relate it to structural differences between
    the datasets.},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\bekkerman03distributional.pdf},
}
@book{chakrabarti02mining,
  author = {Chakrabarti, Soumen},
  title = {Mining the {Web}: Discovering Knowledge from Hypertext Data},
  publisher = {Morgan Kaufmann Publishers},
  year = {2002},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\Mining_the_Web_-_Discovering_Knowledge_from_Hypertext_Data_(Morgan_Kaufmann-2003).pdf},
}
@inproceedings{Dalessio00,
  author = {D'Alessio, Stephen and Murray, Keitha and Schiaffino, Robert and Kershenbaum, Aaron},
  title = {The effect of using Hierarchical classifiers in Text Categorization},
  booktitle = {Proceedings of RIAO-00, 6th International Conference ``Recherche d'Information
    Assist{\'e}e par Ordinateur''},
  year = {2000},
  pages = {302--313},
  address = {Paris, FR},
  url = {http://www.iona.edu/cs/FacultyPublications/riao2000New.pdf},
  abstract = {Given a set of categories, with or without a preexisting hierarchy
    among them, we consider the problem of assigning documents to one
    or more of these categories from the point of view of a hierarchy
    with more or less depth. We can choose to make use of none, part
    or all of the hierarchical structure to improve the categorization
    effectiveness and efficiency. It is possible to create additional
    hierarchy among the categories. We describe a procedure for generating
    a hierarchy of classifiers that model the hierarchy structure. We
    report on computational experience using this procedure. We show
    that judicious use of a hierarchy can significantly improve both
    the speed and effectiveness of the categorization process. Using
    the Reuters-21578 corpus, we obtain an improvement in running time
    of over a factor of three and a 5\% improvement in F-measure.},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\the-effect-of-using.pdf},
}
@misc{oai:CiteSeerPSU:613766,
  author = {Drori, Offer},
  title = {Identifying the Subject of Documents in Digital Libraries Automatically
    Using Frequently-Occurring Words - Study and Findings},
  month = may # {~23},
  year = {2003},
  url = {http://citeseer.ist.psu.edu/613766.html},
  url-fulltext = {http://shum.huji.ac.il/~offerd/papers/drori042003-a.pdf},
  abstract = {Contemporary information databases contain millions of electronic
    documents. The immense number of documents makes it difficult to
    conduct efficient searches on the Internet. Several studies have
    found that associating documents with a subject or list of topics
    can make them easier to locate online [5] [6] [7]. Effective cataloging
    of information is performed manually, requiring extensive resources.
    Consequently, at present most information is not cataloged.},
  annote = {Offer Drori (The Hebrew University of Jerusalem ,
    Jerusalem; ISRAEL);},
  bibsource = {OAI-PMH server at cs1.ist.psu.edu},
  citeseer-references = {oai:CiteSeerPSU:388078; oai:CiteSeerPSU:445314;
    oai:CiteSeerPSU:306092; oai:CiteSeerPSU:354930;
    oai:CiteSeerPSU:46582; oai:CiteSeerPSU:3011;
    oai:CiteSeerPSU:110715},
  language = {en},
  oai = {oai:CiteSeerPSU:613766},
  rights = {unrestricted},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\drori02identifying.pdf},
}
@misc{oai:CiteSeerPSU:529607,
  author = {Elliman, Dave},
  title = {Automatic Derivation of On-line Document Ontologies},
  month = jul # {~26},
  year = {2001},
  url = {http://citeseer.ist.psu.edu/529607.html},
  url-fulltext = {http://www.cs.nott.ac.uk/~jrp/./pub/merit01.pdf},
  abstract = {This paper describes a method for constructing an ontology which will
    represent the set of web pages on a specified site. We are developing
    a technique that will extract knowledge from digital sources, create
    ontologies containing reusable knowledge to be shared with software
    agents, and present a view of this knowledge to users. This method
    will provide a solution to the problem of classifying information
    and supporting mechanisms that explore its structure, as well as
    allowing knowledge to be extracted and shared with other software
    agents.},
  annote = {Dave Elliman (J.Rafael G.Pulido; Image Processing and
    Interpretation Research Group,; Computer Science and
    Information Technology School,; University of;
    Nottingham , United Kingdom);},
  bibsource = {OAI-PMH server at cs1.ist.psu.edu},
  citeseer-references = {oai:CiteSeerPSU:431849; oai:CiteSeerPSU:366691;
    oai:CiteSeerPSU:100962; oai:CiteSeerPSU:444390;
    oai:CiteSeerPSU:341007; oai:CiteSeerPSU:258861;
    oai:CiteSeerPSU:316519},
  language = {en},
  oai = {oai:CiteSeerPSU:529607},
  rights = {unrestricted},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\elliman01automatic.pdf},
}
@INPROCEEDINGS{conf/iv/EllimanP02,
AUTHOR = {Dave Elliman and J. R. G. Pulido},
TITLE = {Visualizing Ontology Components through Self-Organizing Maps},
BOOKTITLE = {IV},
YEAR = {2002},
PAGES = {434},
ABSTRACT = {This paper describes a method for identifying Ontology components
by using Self-Organizing Maps. Our system represents the knowledge
contained in a particular digital archive by assembling and displaying
the ontologies components. This novel approach provides an alternative
solution to the problem of classifying on-line information and retrieval,
support mechanisms that explore domains, and allows knowledge to
be displayed in a browsable manner.},
BIBDATE = {2005-05-02},
BIBSOURCE = {DBLP,
http://dblp.uni-trier.de/db/conf/iv/iv2002.html#EllimanP02},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\visualizing-ontology-components-through.pdf},
URL = {http://csdl.computer.org/comp/proceedings/iv/2002/1656/00/16560434abs.htm}
}
@INPROCEEDINGS{Florian:416,
AUTHOR = {Florian, Radu and Yarowsky, David},
TITLE = {Dynamic Nonlocal Language Modeling via Hierarchical
Topic-Based Adaptation},
BOOKTITLE = {37th Annual Meeting of the Association for
Computational Linguistics},
PAGES = {167--174},
YEAR = {1999},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\florian99dynamic.pdf}
}
@MISC{oai:CiteSeerPSU:566368,
AUTHOR = {Hichem Frigui and Olfa Nasraoui},
TITLE = {Simultaneous Categorization of Text Documents And Identification
of Cluster-dependent Keywords},
MONTH = APR # {~07},
YEAR = {2002},
ABSTRACT = {In this paper, we propose a new approach to unsupervised text document
categorization based on a coupled process of clustering and cluster-dependent
keyword weighting. The proposed algorithm is based on the K-Means
clustering algorithm. Hence it is computationally and implementationally
simple. Moreover, it learns a different set of keyword weights for
each cluster. This means that, as a by-product of the clustering
process, each document cluster will be characterized by a possibly
different set of keywords. The cluster dependent keyword weights
have two advantages. First, they help in partitioning the document
collection into more meaningful categories.},
ANNOTE = {Hichem Frigui (Department of Electrical and Computer
Engineering; University of Memphis; Campus Box 526574 ,
Memphis , TN 38152); Olfa Nasraoui (Department of
Electrical and Computer Engineering; University of
Memphis; Campus Box 526574 , Memphis , TN 38152);},
BIBSOURCE = {OAI-PMH server at cs1.ist.psu.edu},
CITESEER-REFERENCES = {oai:CiteSeerPSU:322454; oai:CiteSeerPSU:15856;
oai:CiteSeerPSU:112012; oai:CiteSeerPSU:196975;
oai:CiteSeerPSU:164433; oai:CiteSeerPSU:3156;
oai:CiteSeerPSU:98891; oai:CiteSeerPSU:349805;
oai:CiteSeerPSU:214617; oai:CiteSeerPSU:332454;
oai:CiteSeerPSU:69940},
CITESEER-URL = {http://citeseer.ist.psu.edu/566368.html},
LANGUAGE = {en},
OAI = {oai:CiteSeerPSU:566368},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\frigui02simultaneous.pdf},
RIGHTS = {unrestricted},
URL = {http://www.ee.memphis.edu/people/faculty/nasraoui/publications/SIAM02_TXT_WKSHP.pdf}
}
@INPROCEEDINGS{Guo04,
AUTHOR = {Gongde Guo and Hui Wang and David A. Bell and Yaxin Bi and Kieran
Greer},
TITLE = {An k{NN} Model-Based Approach and Its Application in Text Categorization},
BOOKTITLE = {Proceedings of CICLING-04, 5th International Conference on Computational
Linguistics and Intelligent Text Processing},
YEAR = {2004},
EDITOR = {Alexander F. Gelbukh},
VOLUME = {2945},
SERIES = {Lecture Notes in Computer Science},
PAGES = {559--570},
ADDRESS = {Seoul, KO},
PUBLISHER = {Springer Verlag, Heidelberg, DE},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\unknown.pdf}
}
@MISC{oai:CiteSeerPSU:584708,
AUTHOR = {Andreas Hotho and Gerd Stumme},
TITLE = {Conceptual Clustering of Text Clusters},
MONTH = MAY # {~23},
YEAR = {2002},
ABSTRACT = {Common clustering techniques have the disadvantage that they do not
provide intensional descriptions of the clusters obtained. Conceptual
Clustering techniques, on the other hand, provide such descriptions,
but are known to be rather slow. In this paper, we discuss a way
of combining both techniques. We first cluster the documents by
a variant of K-Means, using a thesaurus as background knowledge.
This clustering reduces the large number of documents to a relatively
small number of clusters, which can then be clustered conceptually
in the second step.},
ANNOTE = {Andreas Hotho (Institute of Applied Informatics and
Formal Description Methods AIFB , University of
Karlsruhe; D--76128 Karlsruhe); Gerd Stumme (Institute
of Applied Informatics and Formal Description Methods
AIFB , University of Karlsruhe; D--76128 Karlsruhe);},
BIBSOURCE = {OAI-PMH server at cs1.ist.psu.edu},
CITESEER-REFERENCES = {oai:CiteSeerPSU:582032; oai:CiteSeerPSU:70718;
oai:CiteSeerPSU:380343; oai:CiteSeerPSU:246639;
oai:CiteSeerPSU:573923; oai:CiteSeerPSU:301272;
oai:CiteSeerPSU:274006},
CITESEER-URL = {http://citeseer.ist.psu.edu/584708.html},
LANGUAGE = {en},
OAI = {oai:CiteSeerPSU:584708},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\hotho02conceptual.pdf},
RIGHTS = {unrestricted},
URL = {http://www.aifb.uni-karlsruhe.de/WBS/aho/pub/tc_fca_2002_submit.pdf}
}
@INPROCEEDINGS{JM02b,
AUTHOR = {M. Jarrar and R. Meersman},
TITLE = {Scalability and Knowledge Reusability in Ontology Modeling},
BOOKTITLE = {Proceedings of the International Conference on Infrastructure for
e-Business, e-Education, e-Science, and e-Medicine},
YEAR = {2002},
EDITOR = {Veljko Milutinovic},
VOLUME = {SSGRR2002s},
ADDRESS = {Rome, Italy},
PUBLISHER = {SSGRR education center},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\jarrar02scalability.pdf},
SOURCE = {http://www.jarrar.info/publications/}
}
@TECHREPORT{KarypisHan,
AUTHOR = {George Karypis and Eui-Hong Han},
TITLE = {Concept indexing: {A} fast dimensionality reduction algorithm with
applications to document retrieval and categorization},
INSTITUTION = {Computer Science Department, University of Minnesota},
YEAR = {2000},
TYPE = {Technical Report},
NUMBER = {TR-00-0016},
ABSTRACT = {In recent years, we have seen a tremendous growth in the volume of
text documents available on the Internet, digital libraries, news
sources, and company-wide intranets. This has led to an increased
interest in developing methods that can efficiently categorize and
retrieve relevant information. Retrieval techniques based on dimensionality
reduction, such as Latent Semantic Indexing (LSI), have been shown
to improve the quality of the information being retrieved by capturing
the latent meaning of the words present in the documents. Unfortunately,
the high computational requirements of LSI and its inability to
compute an effective dimensionality reduction in a supervised setting
limits its applicability. In this paper we present a fast dimensionality
reduction algorithm, called concept indexing (CI) that is equally
effective for unsupervised and supervised dimensionality reduction.
CI computes a k-dimensional representation of a collection of documents
by first clustering the documents into k groups, and then using
the centroid vectors of the clusters to derive the axes of the reduced
k-dimensional space. Experimental results show that the dimensionality
reduction computed by CI achieves comparable retrieval performance
to that obtained using LSI, while requiring an order of magnitude
less time. Moreover, when CI is used to compute the dimensionality
reduction in a supervised setting, it greatly improves the performance
of traditional classification algorithms such as C4.5 and kNN.},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\karypis00concept.pdf}
}
@INPROCEEDINGS{Kudo04,
AUTHOR = {Taku Kudo and Yuji Matsumoto},
TITLE = {A Boosting Algorithm for Classification of
Semi-Structured Text},
BOOKTITLE = {Proceedings of EMNLP-04, 9th Conference on Empirical
Methods in Natural Language Processing},
YEAR = {2004},
ADDRESS = {Barcelona, ES},
ABSTRACT = {The focus of research in text classification has
expanded from simple topic identification to more
challenging tasks such as opinion/modality
identification. Unfortunately, the latter goals exceed
the ability of the traditional bag-of-word
representation approach, and a richer, more structural
representation is required. Accordingly, learning
algorithms must be created that can handle the
structures observed in texts. In this paper, we propose
a Boosting algorithm that captures sub-structures
embedded in texts. The proposal consists of i) decision
stumps that use subtrees as features and ii) the
Boosting algorithm which employs the subtree-based
decision stumps as weak learners. We also discuss the
relation between our algorithm and SVMs with tree
kernel. Two experiments on opinion/modality
classification confirm that subtree features are
important.},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\emnlp2004-1.pdf},
URL = {http://chasen.org/~taku/publications/emnlp2004-1.pdf}
}
@MISC{oai:CiteSeerPSU:458733,
AUTHOR = {Dawn Lawrie and W. Bruce Croft},
TITLE = {Discovering and Comparing Topic Hierarchies},
MONTH = OCT # {~13},
YEAR = {2000},
ABSTRACT = {Hierarchies have been used for organization, summarization, and access
to information, yet a lingering issue is how best to construct them.
In this paper, our goal is to automatically create domain specific
hierarchies that can be used for browsing a document set and locating
relevant documents. We examine methods of automatically generating
hierarchies and evaluating them. To this end, we compare and contrast
two methods of generating topic hierarchies from the text of documents:
one, subsumption hierarchies, uses subsumption relations found within
document sets, and the other, lexical hierarchies, utilizes frequently
used words within phrases. Our evaluation shows that subsumption
hierarchies divide documents into smaller groups, allowing one to
find all relevant documents without looking at as many non-relevant
documents. However, such hierarchies are more likely to contain
no path to a relevant document.},
ANNOTE = {Dawn Lawrie (Department of Computer Science;
University of Massachusetts; Amherst , MA 01003 USA);
W. Bruce Croft (Department of Computer Science;
University of Massachusetts; Amherst , MA 01003 USA);},
BIBSOURCE = {OAI-PMH server at cs1.ist.psu.edu},
CITESEER-REFERENCES = {oai:CiteSeerPSU:26307; oai:CiteSeerPSU:15856;
oai:CiteSeerPSU:211368; oai:CiteSeerPSU:187156;
oai:CiteSeerPSU:455178; oai:CiteSeerPSU:209382},
CITESEER-URL = {http://citeseer.ist.psu.edu/458733.html},
LANGUAGE = {en},
OAI = {oai:CiteSeerPSU:458733},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\lawrie00discovering.pdf},
RIGHTS = {unrestricted},
URL = {http://ciir.cs.umass.edu/pubfiles/ir-183.pdf}
}
@INPROCEEDINGS{IMM2004-02894,
AUTHOR = {R. E. Madsen and J. Larsen and L. K. Hansen},
TITLE = {Part-of-Speech Enhanced Context Recognition},
BOOKTITLE = {Proceedings of {IEEE} Workshop on Machine Learning for Signal Processing
{XIV}},
YEAR = {2004},
EDITOR = {A. K. Barros and J. Principe and J. Larsen and T. Adali and S. Douglas},
PAGES = {635--644},
ADDRESS = {Piscataway, New Jersey},
MONTH = SEP,
PUBLISHER = {{IEEE} Press},
ABSTRACT = {Language independent `bag-of-words' representations are surprisingly
effective for text classification. In this communication our aim
is to elucidate the synergy between language independent features
and simple language model features. We consider term tag features
estimated by a so-called part-of-speech tagger. The feature sets
are combined in an early binding design with an optimized binding
coefficient that allows weighting of the relative variance contributions
of the participating feature sets. With the combined features documents
are classified using a latent semantic indexing representation and
a probabilistic neural network classifier. Three medium size data-sets
are analyzed and we find consistent synergy between the term and
natural language features in all three sets for a range of training
set sizes. The most significant enhancement is found for small
text databases where high recognition rates are possible.},
ISBN_ISSN = {0-7803-8609-4},
KEYWORDS = {text mining, latent space, context recognition},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\part-of-speech-enhanced.pdf},
URL = {http://www2.imm.dtu.dk/pubdb/p.php?2894}
}
@ARTICLE{Park04,
AUTHOR = {Park, Seong-Bae and Zhang, Byoung-Tak},
TITLE = {Co-trained support vector machines for large scale
unstructured document classification using unlabeled data
and syntactic information},
JOURNAL = {Information Processing and Management},
VOLUME = {40},
NUMBER = {3},
PAGES = {421--439},
YEAR = {2004},
ABSTRACT = {Most document classification systems consider only the
distribution of content words of the documents, ignoring the
syntactic information underlying the documents though it is also
an important factor. In this paper, we present an approach for
classifying large scale unstructured documents by incorporating
both lexical and syntactic information of documents. For this
purpose, we use the co-training algorithm, a partially supervised
learning algorithm, in which two separated views for the training
data are employed and the small number of labeled data are
augmented by a large number of unlabeled data. Since both lexical
and syntactic information can play roles of separated views for
the unstructured documents, the co-training algorithm enhances
the performance of document classification using both of them and
a large number of unlabeled documents. The experimental results
on Reuters-21578 corpus and TREC-7 filtering documents show the
effectiveness of unlabeled documents and the use of both lexical
and syntactic information.},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\LNAI2637_Park.pdf}
}
@TECHREPORT{oai:CiteSeerPSU:542295,
AUTHOR = {April Kontostathis and William M. Pottenger},
TITLE = {Detecting Patterns in the {LSI} Term-Term Matrix},
INSTITUTION = {Lehigh University},
NUMBER = {LU-CSE-02-010},
YEAR = {2002},
MONTH = SEP # {~25},
ABSTRACT = {applications use techniques that explicitly or implicitly employ a
limited degree of transitivity in the co-occurrence relation. In
this work we show use of higher orders of co-occurrence in the Singular
Value Decomposition (SVD) algorithm and, by inference, on the systems
that rely on SVD, such as LSI. Our empirical and mathematical studies
prove that term cooccurrence plays a crucial role in LSI.},
ANNOTE = {William M. Pottenger (CSE Department; 19 Memorial
Drive West); Ph. D (CSE Department; 19 Memorial Drive
West);},
BIBSOURCE = {OAI-PMH server at cs1.ist.psu.edu},
CITESEER-REFERENCES = {oai:CiteSeerPSU:189631; oai:CiteSeerPSU:441438;
oai:CiteSeerPSU:144832; oai:CiteSeerPSU:576970;
oai:CiteSeerPSU:23424; oai:CiteSeerPSU:8085},
CITESEER-URL = {http://citeseer.ist.psu.edu/542295.html},
LANGUAGE = {en},
OAI = {oai:CiteSeerPSU:542295},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\kontostathis02detecting.pdf},
RIGHTS = {unrestricted},
URL = {http://www.cse.lehigh.edu/techreports/2002/LU-CSE-02-010.pdf}
}
@MISC{oai:CiteSeerPSU:555050,
AUTHOR = {David Ramamonjisoa},
TITLE = {Towards Automated Research Topics Discovery on Scientific Domain
by Agents System},
MONTH = JAN # {~02},
YEAR = {2003},
ABSTRACT = {In our project on multiagent for web mining, we developed KAROKA (Keywords
Association Rules Optimizer Knobots Advisers) as a model of discovery
in text database used in WWW. In this paper, we explain our model
and its application to discover new research topics in scientific
domain on the web. This tool aims to support researchers for their
bibliographical investigation and help to avoid information overload.
The WWW sources are converted into a highly structured collection
of text. Then, KAROKA tries to extract topics, association rules,
regularities, exception and useful information in the collection
of text.},
ANNOTE = {David Ramamonjisoa (Faculty of Software and
Information Science; Iwate Prefectural University;
Takizawa , Iwate , Japan 020--0273);},
BIBSOURCE = {OAI-PMH server at cs1.ist.psu.edu},
CITESEER-REFERENCES = {oai:CiteSeerPSU:19801; oai:CiteSeerPSU:458698;
oai:CiteSeerPSU:348324; oai:CiteSeerPSU:179712;
oai:CiteSeerPSU:68861; oai:CiteSeerPSU:55671;
oai:CiteSeerPSU:563035; oai:CiteSeerPSU:454529;
oai:CiteSeerPSU:456928; oai:CiteSeerPSU:438592},
CITESEER-URL = {http://citeseer.ist.psu.edu/555050.html},
LANGUAGE = {en},
OAI = {oai:CiteSeerPSU:555050},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\ramamonjisoa03towards.pdf},
RIGHTS = {unrestricted},
URL = {http://www.ssgrr.it/en/ssgrr2003w/papers/157.pdf}
}
@BOOK{Scime2005,
TITLE = {Web Mining: Applications and Techniques},
PUBLISHER = {Idea Group},
YEAR = {2005},
EDITOR = {Anthony Scime},
OWNER = {Andr�s},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\Idea.Group.Publishing.Web.Mining.Applications.and.Techniques.Aug.2004.eBook-DDU.pdf},
TIMESTAMP = {2006.04.10}
}
@INPROCEEDINGS{Sinka:2004:EDFfWDCAFS,
AUTHOR = {Mark Sinka and David Corne},
TITLE = {Evolving Document Features for Web Document
Clustering: {A} Feasibility Study},
BOOKTITLE = {Proceedings of the 2004 IEEE Congress on Evolutionary
Computation},
YEAR = {2004},
PAGES = {891--897},
ADDRESS = {Portland, Oregon},
MONTH = {20--23~} # JUN,
PUBLISHER = {IEEE Press},
ABSTRACT = {Document analysis research underpins the envisaged
'semantic web'. A key issue is how to encode a document
without losing salient information. Current research
almost always uses fixed-length vectors based on word
(term) frequency (TF) and/or variants thereof. We
explore alternative encodings using an evolutionary
algorithm (EA). These alternatives use a variety of
other features that can be extracted from a document,
and the EA explores the space of weighted combinations
of these. Tests are able to find encodings which
outperform previous results. Among several tentative
findings it seems clear that the ideal encoding is
highly task-dependent, and we can recommend certain
features as useful for specific types of document
clustering tasks.},
ISBN = {0-7803-8515-2},
KEYWORDS = {Other, Real-world applications},
NOTES = {CEC 2004 - A joint meeting of the IEEE, the EPS, and
the IEE.},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\S020P003.pdf}
}
@INPROCEEDINGS{Slonim01,
AUTHOR = {Slonim, Noam and Tishby, Naftali},
TITLE = {The Power of Word Clusters for Text Classification},
BOOKTITLE = {Proceedings of ECIR-01, 23rd European Colloquium on
Information Retrieval Research},
ADDRESS = {Darmstadt, DE},
YEAR = {2001},
ABSTRACT = {The recently introduced Information Bottleneck method
provides an information theoretic framework, for extracting
features of one variable, that are relevant for the values of
another variable. Several previous works already suggested
applying this method for document clustering, gene expression
data analysis, spectral analysis and more. In this work we
present a novel implementation of this method for supervised text
classification. Specifically, we apply the information bottleneck
method to find word-clusters that preserve the information about
document categories and use these clusters as features for
classification. Previous work used a similar clustering procedure
to show that word-clusters can significantly reduce the feature
space dimensionality, with only a minor change in classification
accuracy. In this work we reproduce these results and go further
to show that when the training sample is small word clusters can
yield significant improvement in classification accuracy (up to
18\%) over the performance using the words directly.},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\slonim01power.pdf},
URL = {http://www.cs.huji.ac.il/labs/learning/Papers/irsg3.eps.gz}
}
@INPROCEEDINGS{Strehl:2000:ISM,
AUTHOR = {Strehl, Alexander and Ghosh, Joydeep and Mooney, Raymond},
TITLE = {Impact of Similarity Measures on Web-page Clustering},
BOOKTITLE = {Proceedings of the 17th National Conference on Artificial
Intelligence: Workshop of Artificial Intelligence for Web Search
(AAAI 2000), 30--31 July 2000, Austin, Texas, USA},
PUBLISHER = {AAAI},
PAGES = {58--64},
MONTH = JUL,
YEAR = {2000},
BIBDATE = {Sat Apr 20 15:28:13 2002},
BIBSOURCE = {ftp://ftp.math.utah.edu/pub/bibnet/authors/s/strehl-alexander.bib},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\strehl00impact.pdf}
}
@MISC{oai:CiteSeerPSU:563891,
AUTHOR = {Domonkos Tikk and Jae Dong Yang and Sun Lee Bang},
TITLE = {Hierarchical Text Categorization Using Fuzzy Relational Thesaurus},
MONTH = APR # {~22},
YEAR = {2003},
ABSTRACT = {Text categorization is the classification to assign a text document
to an appropriate category in a predefined set of categories. We
present a new approach for the text categorization by means of Fuzzy
Relational Thesaurus (FRT). FRT is a multilevel category system
that stores and maintains adaptive local dictionary for each category.
The goal of our approach is twofold; to develop a reliable text
categorization method on a certain subject domain, and to expand
the initial FRT by automatically added terms, thereby obtaining
an incrementally defined knowledge base of the domain. We implemented
the categorization algorithm and compared it with some other hierarchical
classifiers. Experimental results have been shown that our algorithm
outperforms its rivals on all document corpora investigated.},
ANNOTE = {Jae Dong Yang (Dept . of Computer Science , Chonbuk
National University; Chonju 561--756 , Korea); Sun Lee
Bang (Intelligent Integrated Systems
Japanese--Hungarian Laboratory; 1111 Budapest ,
Muegyetem rakpart 3. , Hungary);},
BIBSOURCE = {OAI-PMH server at cs1.ist.psu.edu},
CITESEER-REFERENCES = {oai:CiteSeerPSU:274056; oai:CiteSeerPSU:253930;
oai:CiteSeerPSU:13159; oai:CiteSeerPSU:382331;
oai:CiteSeerPSU:552405; oai:CiteSeerPSU:553162;
oai:CiteSeerPSU:211368; oai:CiteSeerPSU:582940;
oai:CiteSeerPSU:10425; oai:CiteSeerPSU:147931;
oai:CiteSeerPSU:332789; oai:CiteSeerPSU:322454;
oai:CiteSeerPSU:84047; oai:CiteSeerPSU:93679},
CITESEER-URL = {http://citeseer.ist.psu.edu/563891.html},
LANGUAGE = {en},
OAI = {oai:CiteSeerPSU:563891},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\hierarchical-text-categorization-using.pdf},
RIGHTS = {unrestricted},
URL = {http://www.mft.hu/publications/tikk/Kybernetika.pdf}
}
@MISC{oai:CiteSeerPSU:541094,
AUTHOR = {J. J. Verbeek},
TITLE = {Supervised Feature Extraction for Text Categorization},
MONTH = FEB # {~14},
YEAR = {2002},
ABSTRACT = {This paper concerns finding the `optimal' number of word groups for
text classification. We present a method to select which words to
cluster into word groups and how many such word groups to use on
the basis of a set of pre-classified texts. The method involves
a `greedy' search through the space of possible word groups. The
words are grouped according to the `Jensen-Shannon divergence' between
the corresponding distributions over the classes. The criterion
to decide which number of word groups to use is based on Rissanen's
MDL Principle. We present empirical results that indicate that the
proposed method performs well. Furthermore, the proposed method
outperforms cross-validation in the sense that far fewer word groups
are selected while prediction accuracy is just slightly worse. For
the experimentation we used a subset of the `20 Newsgroup' dataset
[10].},
BIBSOURCE = {OAI-PMH server at cs1.ist.psu.edu},
CITESEER-REFERENCES = {oai:CiteSeerPSU:433337; oai:CiteSeerPSU:485452;
oai:CiteSeerPSU:93401; oai:CiteSeerPSU:553162;
oai:CiteSeerPSU:93679},
CITESEER-URL = {http://citeseer.ist.psu.edu/541094.html},
LANGUAGE = {en},
OAI = {oai:CiteSeerPSU:541094},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\supervised-feature-extraction-for.pdf},
RIGHTS = {unrestricted},
URL = {http://carol.wins.uva.nl/~jverbeek/publications/../pub/benelearn.ps.gz}
}
@ARTICLE{journals/corr/cs-DL-9902007,
AUTHOR = {Ian H. Witten and Gordon W. Paynter and Eibe Frank and Carl Gutwin
and Craig G. Nevill-Manning},
TITLE = {{KEA}: Practical Automatic Keyphrase Extraction},
JOURNAL = {CoRR},
YEAR = {1999},
VOLUME = {cs.DL/9902007},
ABSTRACT = {Keyphrases provide semantic metadata that summarize and characterize
documents. This paper describes Kea, an algorithm for automatically
extracting keyphrases from text. Kea identifies candidate keyphrases
using lexical methods, calculates feature values for each candidate,
and uses a machine-learning algorithm to predict which candidates
are good keyphrases. The machine learning scheme first builds a
prediction model using training documents with known keyphrases,
and then uses the model to find keyphrases in new documents. We
use a large test corpus to evaluate Kea's effectiveness in terms
of how many author-assigned keyphrases are correctly identified.
The system is simple, robust, and available under the GNU General
Public License; the paper gives instructions for use.},
BIBDATE = {2004-05-19},
BIBSOURCE = {DBLP,
http://dblp.uni-trier.de/db/journals/corr/corr9902.html#cs-DL-9902007},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\ian99kea.pdf},
URL = {http://arxiv.org/abs/cs.DL/9902007}
}
@MISC{oai:CiteSeerPSU:328087,
AUTHOR = {Wai-chiu Wong and Ada Wai-chee Fu},
TITLE = {Incremental Document Clustering for Web Page Classification},
MONTH = AUG # {~31},
YEAR = {2000},
ABSTRACT = {Motivated by the benefits in organizing the documents in Web search
engines, we consider the problem of automatic Web page classification.
We employ the clustering techniques. Each document is represented
by a feature vector. By analyzing the clusters formed by these vectors,
we can assign the documents within the same cluster to the same
class automatically. Our contributions are the following: (1) We
propose a feature extraction mechanism which is more suitable to
Web page classification. (2) We introduce a tree structure called
the DC-tree to make the clustering process incremental and less
sensitive to the document insertion order. (3) We show with experiments
on a set of Internet documents from Yahoo! that the proposed clustering
algorithm can classify Web pages effectively.},
ANNOTE = {Wai-chiu Wong (Department of Computer Science and
Engineering; The Chinese University of Hong Kong;
Shatin , Hong Kong); Ada Wai-chee Fu (Department of
Computer Science and Engineering; The Chinese
University of Hong Kong; Shatin , Hong Kong);},
BIBSOURCE = {OAI-PMH server at cs1.ist.psu.edu},
CITESEER-REFERENCES = {oai:CiteSeerPSU:484762; oai:CiteSeerPSU:13982;
oai:CiteSeerPSU:388427; oai:CiteSeerPSU:45755;
oai:CiteSeerPSU:100508; oai:CiteSeerPSU:33829;
oai:CiteSeerPSU:571734; oai:CiteSeerPSU:322454;
oai:CiteSeerPSU:3011; oai:CiteSeerPSU:514099},
CITESEER-URL = {http://citeseer.ist.psu.edu/328087.html},
KEYWORDS = {Incremental update, Tree, Document, Clustering, Web, Classification},
LANGUAGE = {en},
OAI = {oai:CiteSeerPSU:328087},
PDF = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\wong00incremental.pdf},
RIGHTS = {unrestricted},
URL = {http://www.cs.cuhk.hk/~adafu/Pub/IS2000.ps}
}
This file has been generated by bibtex2html 1.79