filtered_references.bib

@inproceedings{conf/nlprs/Aizawa01,
  author = {Akiko N. Aizawa},
  title = {Linguistic Techniques to Improve the Performance of Automatic
    Text Categorization},
  booktitle = {NLPRS},
  year = {2001},
  pages = {307--314},
  bibdate = {2004-03-05},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/nlprs/nlprs2001.html#Aizawa01},
  crossref = {conf/nlprs/2001},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\0079-01.pdf},
  url = {http://www.afnlp.org/nlprs2001/pdf/0079-01.pdf}
}

@inproceedings{conf/icdm/AntonieZ02,
  author = {Maria-Luiza Antonie and Osmar R. Za{\"i}ane},
  title = {Text Document Categorization by Term Association},
  booktitle = {ICDM},
  year = {2002},
  pages = {19--26},
  abstract = {A good text classifier is a classifier that efficiently categorizes
    large sets of text documents in a reasonable time frame and with
    an acceptable accuracy, and that provides classification rules that
    are human readable for possible fine-tuning. If the training of
    the classifier is also quick, this could become in some application
    domains a good asset for the classifier. Many techniques and algorithms
    for automatic text categorization have been devised. According to
    published literature, some are more accurate than others, and some
    provide more interpretable classification models than others. However,
    none can combine all the beneficial properties enumerated above.
    In this paper, we present a novel approach for automatic text categorization
    that borrows from market basket analysis techniques using association
    rule mining in the data-mining field. We focus on two major problems:
    (1) finding the best term association rules
    in a textual database by generating and pruning; and (2) using the
    rules to build a text classifier. Our text categorization method
    proves to be efficient and effective, and experiments
    on well-known collections show that the classifier performs well.
    In addition, training as well as classification are both fast and
    the generated rules are human readable.},
  bibdate = {2003-03-04},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/icdm/icdm2002.html#AntonieZ02},
  crossref = {conf/icdm/2002},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\text-document-categorization-by.pdf},
  url = {http://computer.org/proceedings/icdm/1754/17540019abs.htm}
}

@inproceedings{conf/pakm/BenjaminsFG98,
  author = {V. Richard Benjamins and Dieter Fensel and Asunci{\'o}n G{\'o}mez-P{\'e}rez},
  title = {Knowledge Management through Ontologies},
  booktitle = {PAKM},
  year = {1998},
  bibdate = {2003-04-02},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/pakm/pakm1998.html#BenjaminsFG98},
  crossref = {conf/pakm/1998},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\benjamins98knowledge.pdf},
  url = {http://SunSITE.Informatik.RWTH-Aachen.DE/Publications/CEUR-WS/Vol-13/paper5.ps}
}

@inproceedings{conf/ercimdl/CaraccioloHR04,
  author = {Caterina Caracciolo and Willem Robert van Hage and Maarten de Rijke},
  title = {Towards Topic Driven Access to Full Text Documents},
  booktitle = {ECDL},
  year = {2004},
  pages = {495--500},
  abstract = {We address the issue of providing topic driven access to full text
    documents. The methodology we propose is a combination of topic
    segmentation and information retrieval techniques. By segmenting
    the text into topic driven segments, we obtain small and coherent
    documents that can be used in two ways: as a basis for automatically
    generating hypertext links, and as a visualization aid for the reader
    who is presented with a small set of focused and restricted text
    snippets. In the presence of a concept hierarchy, or ontology, information
    retrieval techniques can be used to connect the segments obtained
    to concepts in the ontology. In this paper we concentrate on the
    text segmentation phase: we describe our approach to segmentation,
    discuss issues related to evaluation, and report on preliminary
    results.},
  bibdate = {2005-01-21},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/ercimdl/ecdl2004.html#CaraccioloHR04},
  crossref = {conf/ercimdl/2004},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\caracciolo04towards.pdf},
  url = {http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=3232&spage=495}
}

@inproceedings{conf/sac/DeboleS03,
  author = {Franca Debole and Fabrizio Sebastiani},
  title = {Supervised Term Weighting for Automated Text Categorization},
  booktitle = {SAC},
  year = {2003},
  pages = {784--788},
  abstract = {The construction of a text classifier usually involves (i) a phase
    of term selection, in which the most relevant terms for the classification
    task are identified, (ii) a phase of term weighting, in which document
    weights for the selected terms are computed, and (iii) a phase of
    classifier learning, in which a classifier is generated from the
    weighted representations of the training documents. This process
    involves an activity of supervised learning, in which information
    on the membership of training documents in categories is used. Traditionally,
    supervised learning enters only phases (i) and (iii). In this paper
    we propose instead that learning from the training data should also
    affect phase (ii), i.e. that information on the membership of training
    documents to categories be used to determine term weights. We call
    this idea supervised term weighting (STW). As an example of STW,
    we propose a number of ``supervised variants'' of tfidf weighting,
    obtained by replacing the idf function with the function that has
    been used in phase (i) for term selection. The use of STW allows
    the terms that are distributed most differently in the positive
    and negative examples of the categories of interest to be weighted
    highest. We present experimental results obtained on the standard
    Reuters-21578 benchmark with three classifier learning methods (Rocchio,
    k-NN, and support vector machines), three term selection functions
    (information gain, chi-square, and gain ratio), and both local and
    global term selection and weighting.},
  bibdate = {2003-06-18},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/sac/sac2003.html#DeboleS03},
  crossref = {conf/sac/2003},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\debole02supervised.pdf}
}

@inproceedings{conf/ijcai/GabrilovichM05,
  author = {Evgeniy Gabrilovich and Shaul Markovitch},
  title = {Feature Generation for Text Categorization Using World Knowledge},
  booktitle = {IJCAI},
  year = {2005},
  pages = {1048--1053},
  abstract = {We enhance machine learning algorithms for text categorization with
    generated features based on domain-specific and common-sense knowledge.
    This knowledge is represented using publicly available ontologies
    that contain hundreds of thousands of concepts, such as the Open
    Directory; these ontologies are further enriched by several orders
    of magnitude through controlled Web crawling. Prior to text categorization,
    a feature generator analyzes the documents and maps them onto appropriate
    ontology concepts, which in turn induce a set of generated features
    that augment the standard bag of words. Feature generation is accomplished
    through contextual analysis of document text, implicitly performing
    word sense disambiguation. Coupled with the ability to generalize
    concepts using the ontology, this approach addresses the two main
    problems of natural language processing---synonymy and polysemy. Categorizing
    documents with the aid of knowledge-based features leverages information
    that cannot be deduced from the documents alone. Experimental results
    confirm improved performance, breaking through the plateau previously
    reached in the field.},
  bibdate = {2005-12-09},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/ijcai/ijcai2005.html#GabrilovichM05},
  crossref = {conf/ijcai/2005},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\url10.pdf},
  url = {http://www.ijcai.org/papers/0971.pdf}
}

@inproceedings{conf/cikm/HanK00,
  author = {Eui-Hong Han and George Karypis},
  title = {Fast Supervised Dimensionality Reduction Algorithm with
    Applications to Document Categorization \& Retrieval},
  booktitle = {CIKM},
  year = {2000},
  pages = {12--19},
  bibdate = {2002-12-05},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/cikm/cikm2000.html#HanK00},
  crossref = {conf/cikm/2000},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\karypis00fast.pdf},
  url = {http://doi.acm.org/10.1145/354756.354772}
}

@inproceedings{conf/pakdd/HanKK01,
  author = {Eui-Hong Han and George Karypis and Vipin Kumar},
  title = {Text Categorization Using Weight Adjusted k-Nearest Neighbor Classification},
  booktitle = {PAKDD},
  year = {2001},
  pages = {53--65},
  bibdate = {2002-01-03},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/pakdd/pakdd2001.html#HanKK01},
  crossref = {conf/pakdd/2001},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\han99text.pdf},
  url = {http://link.springer.de/link/service/series/0558/bibs/2035/20350053.htm}
}

@inproceedings{SIGIR'99*50,
  author = {Thomas Hofmann},
  title = {Probabilistic Latent Semantic Indexing},
  booktitle = {SIGIR},
  year = {1999},
  pages = {50--57},
  crossref = {SIGIR '99},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\hofmann99probabilistic.pdf}
}

@inproceedings{conf/pkdd/HothoSS03,
  author = {Andreas Hotho and Steffen Staab and Gerd Stumme},
  title = {Explaining Text Clustering Results Using Semantic Structures},
  booktitle = {PKDD},
  year = {2003},
  pages = {217--228},
  abstract = {Common text clustering techniques offer rather poor capabilities for
    explaining to their users why a particular result has been achieved.
    They have the disadvantage that they do not relate semantically
    nearby terms and that they cannot explain how resulting clusters
    are related to each other. In this paper, we discuss a way of integrating
    a large thesaurus and the computation of lattices of resulting clusters
    into common text clustering in order to overcome these two problems.
    As its major result, our approach achieves an explanation using
    an appropriate level of granularity at the concept level as well
    as an appropriate size and complexity of the explaining lattice
    of resulting clusters.},
  bibdate = {2003-11-24},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/pkdd/pkdd2003.html#HothoSS03},
  crossref = {conf/pkdd/2003},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\hotho03explaining.pdf},
  url = {http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2838&spage=217}
}

@inproceedings{conf/sigir/LavelliMS02,
  author = {Alberto Lavelli and Bernardo Magnini and Fabrizio Sebastiani},
  title = {Building thematic lexical resources by term categorization},
  booktitle = {SIGIR},
  year = {2002},
  pages = {415--416},
  abstract = {We discuss work in progress in the semi-automatic generation of thematic
    lexicons by means of term categorization, a novel task employing
    techniques from information retrieval (IR) and machine learning
    (ML). Specifically, we view the generation of such lexicons as an
    iterative process of learning previously unknown associations between
    terms and themes (i.e. disciplines, or fields of activity). The
    process is iterative, in that it generates, for each ci in a set
    of themes, a sequence of lexicons, bootstrapping from an initial
    lexicon Li 0 and a set of text corpora given as input. The method
    is inspired by text categorization, the discipline concerned with
    labelling natural language texts with labels from a predefined set
    of themes, or categories. However, while text categorization deals
    with documents represented as vectors in a space of terms, we formulate
    the task of term categorization as one in which terms are (dually)
    represented as vectors in a space of documents, and in which terms
    (instead of documents) are labelled with themes. As a learning device,
    we adopt boosting, since (a) it has demonstrated state-of-the-art
    effectiveness in a variety of text categorization applications,
    and (b) it naturally allows for a form of ``data cleaning'', thereby
    making the process of generating a thematic lexicon an iteration
    of generate-and-test steps.},
  bibdate = {2002-12-06},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/sigir/sigir2002.html#LavelliMS02},
  crossref = {conf/sigir/2002},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\lavelli02building.pdf},
  url = {http://doi.acm.org/10.1145/564376.564471}
}

@inproceedings{conf/icdm/LiuDLLY03,
  author = {Bing Liu and Yang Dai and Xiaoli Li and Wee Sun Lee and
    Philip S. Yu},
  title = {Building Text Classifiers Using Positive and Unlabeled Examples},
  booktitle = {ICDM},
  year = {2003},
  pages = {179--188},
  abstract = {This paper studies the problem of building text classifiers using
    positive and unlabeled examples. The key feature of this problem
    is that there is no negative example for learning. Recently, a few
    techniques for solving this problem were proposed in the literature.
    These techniques are based on the same idea, which builds a classifier
    in two steps. Each existing technique uses a different method for
    each step. In this paper, we first introduce some new methods for
    the two steps, and perform a comprehensive evaluation of all possible
    combinations of methods of the two steps. We then propose a more
    principled approach to solving the problem based on a biased formulation
    of SVM, and show experimentally that it is more accurate than the
    existing techniques.},
  bibdate = {2004-01-28},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/icdm/icdm2003.html#LiuDLLY03},
  crossref = {conf/icdm/2003},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\liu03building.pdf},
  url = {http://csdl.computer.org/comp/proceedings/icdm/2003/1978/00/19780179abs.htm}
}

@inproceedings{conf/icdm/LiuCZMW04,
  author = {Tao Liu and Zheng Chen and Benyu Zhang and Wei-Ying Ma and
    Gongyi Wu},
  title = {Improving Text Classification using Local Latent Semantic Indexing},
  booktitle = {ICDM},
  year = {2004},
  pages = {162--169},
  abstract = {Latent Semantic Indexing (LSI) has been shown to be extremely useful
    in information retrieval, but it is not an optimal representation
    for text classification. It always drops the text classification
    performance when being applied to the whole training set (global
    LSI) because this completely unsupervised method ignores class discrimination
    while only concentrating on representation. Some local LSI methods
    have been proposed to improve the classification by utilizing class
    discrimination information. However, their performance improvements
    over original term vectors are still very limited. In this paper,
    we propose a new local LSI method called ``Local Relevancy Weighted
    LSI'' to improve text classification by performing a separate Single
    Value Decomposition (SVD) on the transformed local region of each
    class. Experimental results show that our method is much better
    than global LSI and traditional local LSI methods on classification
    within a much smaller LSI dimension.},
  bibdate = {2004-12-13},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/icdm/icdm2004.html#LiuCZMW04},
  crossref = {conf/icdm/2004},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\ICDM2004-LLSI.pdf},
  url = {http://csdl.computer.org/comp/proceedings/icdm/2004/2142/00/21420162abs.htm}
}

@inproceedings{conf/ecir/MoschittiB04,
  author = {Alessandro Moschitti and Roberto Basili},
  title = {Complex Linguistic Features for Text Classification: {A}
    Comprehensive Study},
  booktitle = {ECIR},
  year = {2004},
  pages = {181--196},
  abstract = {Previous researches on advanced representations for document retrieval
    have shown that statistical state-of-the-art models are not improved
    by a variety of different linguistic representations. Phrases, word
    senses and syntactic relations derived by Natural Language Processing
    (NLP) techniques were observed ineffective to increase retrieval
    accuracy. For Text Categorization (TC) are available fewer and less
    definitive studies on the use of advanced document representations
    as it is a relatively new research area (compared to document retrieval).
    In this paper, advanced document representations have been investigated.
    Extensive experimentation on representative classifiers, Rocchio
    and SVM, as well as a careful analysis of the literature have been
    carried out to study how some NLP techniques used for indexing impact
    TC. Cross validation over 4 different corpora in two languages allowed
    us to gather an overwhelming evidence that complex nominals, proper
    nouns and word senses are not adequate to improve TC accuracy.},
  bibdate = {2004-03-02},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/ecir/ecir2004.html#MoschittiB04},
  crossref = {conf/ecir/2004},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\url9.pdf},
  url = {http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2997&spage=181}
}

@inproceedings{DBLP:conf/gecco/OdaW03,
  author = {Terri Oda and Tony White},
  title = {Developing an Immunity to Spam},
  booktitle = {GECCO},
  year = {2003},
  pages = {231--242},
  abstract = {Immune systems protect animals from pathogens, so why not apply a
    similar model to protect computers? Several researchers have investigated
    the use of an artificial immune system to protect computers from
    viruses and others have looked at using such a system to detect
    unauthorized computer intrusions. This paper describes the use of
    an artificial immune system for another kind of protection: protection
    from unsolicited email, or spam.},
  bibsource = {DBLP, http://dblp.uni-trier.de},
  comment = {Tiene unos conceptos muy interesantes para la implementaci{\'o}n del sistema
    inmune artificial},
  crossref = {DBLP:conf/gecco/2003-1},
  ee = {http://link.springer.de/link/service/series/0558/bibs/2723/27230231.htm},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\i__GECCO 2003_papers_2723_27230231.pdf}
}

@inproceedings{conf/icml/ScottM99,
  author = {Sam Scott and Stan Matwin},
  title = {Feature Engineering for Text Classification},
  booktitle = {ICML},
  year = {1999},
  pages = {379--388},
  abstract = {Most research in text classification to date has used a ``bag of words''
    representation in which each feature corresponds to a single word.
    This paper examines some alternative ways to represent text based
    on syntactic and semantic relationships between words (phrases,
    synonyms and hypernyms). We describe the new representations and
    try to justify our hypothesis that they could improve the performance
    of a rule-based learner. The representations are evaluated using
    the RIPPER learning algorithm on the Reuters-21578 and DigiTrad
    test corpora. On their own the new representations are not found
    to produce significant performance improvements. We also try combining
    classifiers based on different representations using a majority
    voting technique, and this improves performance on both test collections.
    In our opinion, more sophisticated Natural Language Processing techniques
    need to be developed before better text representations can be produced
    for classification.},
  bibdate = {2002-12-03},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/icml/icml1999.html#ScottM99},
  crossref = {conf/icml/1999},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\scott99feature.pdf}
}

@inproceedings{conf/iccs/Stumme02,
  author = {Gerd Stumme},
  title = {Formal Concept Analysis on Its Way from Mathematics to Computer Science},
  booktitle = {ICCS},
  year = {2002},
  pages = {2--19},
  bibdate = {2002-07-09},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/iccs/iccs2002.html#Stumme02},
  crossref = {conf/iccs/2002},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\formal-concept-analysis-on.pdf},
  url = {http://link.springer.de/link/service/series/0558/bibs/2393/23930002.htm}
}

@inproceedings{conf/icdm/SunL01,
  author = {Aixin Sun and Ee-Peng Lim},
  title = {Hierarchical Text Classification and Evaluation},
  booktitle = {ICDM},
  year = {2001},
  pages = {521--528},
  abstract = {Hierarchical Classification refers to assigning of one or more suitable
    categories from a hierarchical category space to a document. While
    previous work in hierarchical classification focused on virtual
    category trees where documents are assigned only to the leaf categories,
    we propose a topdown level-based classification method that can
    classify documents to both leaf and internal categories. As the
    standard performance measures assume independence between categories,
    they have not considered the documents incorrectly classified into
    categories that are similar or not far from the correct ones in
    the category tree. We therefore propose the Category-Similarity
    Measures and Distance-Based Measures to consider the degree of
    misclassification
    in measuring the classification performance. An experiment has been
    carried out to measure the performance of our proposed hierarchical
    classification method. The results showed that our method performs
    well for Reuters text collection when enough training documents
    are given and the new measures have indeed considered the contributions
    of misclassified documents.},
  bibdate = {2002-02-13},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/icdm/icdm2001.html#SunL01},
  crossref = {conf/icdm/2001},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\sun01hierarchical.pdf}
}

@inproceedings{conf/adc/ZaianeA02,
  author = {Osmar R. Za{\"i}ane and Maria-Luiza Antonie},
  title = {Classifying Text Documents by Associating Terms With Text Categories},
  booktitle = {Australasian Database Conference},
  year = {2002},
  bibdate = {2004-10-08},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/adc/adc2002.html#ZaianeA02},
  crossref = {conf/adc/2002},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\classifying-text-documents-by.pdf},
  url = {http://crpit.com/confpapers/CRPITV5Zaiane.pdf}
}

@misc{oai:CiteSeerPSU:274056,
  author = {Kjersti Aas and Line Eikvil},
  title = {Text Categorisation: {A} Survey},
  year = {1999},
  abstract = {In this report we give a survey of the state-of-the-art in text categorisation.
    To be able to measure progress in this field, it is important to
    use a standardised collection of documents for analysis and testing.
    One such data set is the Reuters-21578 collection of newswires for
    the year 1987, and our survey will focus on the work on text categorisation
    that have used this collection for testing.},
  bibsource = {OAI-PMH server at cs1.ist.psu.edu},
  citeseer-references = {oai:CiteSeerPSU:19079; oai:CiteSeerPSU:25286;
    oai:CiteSeerPSU:37681; oai:CiteSeerPSU:140776;
    oai:CiteSeerPSU:189631; oai:CiteSeerPSU:123646;
    oai:CiteSeerPSU:129727; oai:CiteSeerPSU:553162;
    oai:CiteSeerPSU:107422; oai:CiteSeerPSU:553162;
    oai:CiteSeerPSU:100508; oai:CiteSeerPSU:84047;
    oai:CiteSeerPSU:93679},
  language = {en},
  oai = {oai:CiteSeerPSU:274056},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\aas99text.pdf},
  rights = {unrestricted},
  url = {http://citeseer.ist.psu.edu/274056.html},
  url-fulltext = {http://www.nr.no/research/samba/tm_survey.ps}
}

@inproceedings{conf/sigir/Ando00,
  author = {Rie Kubota Ando},
  title = {Latent semantic-space: iterative scaling improves precision of inter-document
    similarity measurement},
  booktitle = {SIGIR},
  year = {2000},
  pages = {216--223},
  abstract = {We present a novel algorithm that creates document vectors with reduced
    dimensionality. This work was motivated by an application characterizing
    relationships among documents in a collection. Our algorithm yielded
    inter-document similarities with an average precision up to 17.8\%
    higher than that of singular value decomposition (SVD) used for
    Latent Semantic Indexing. The best performance was achieved with
    dimensional reduction rates that were 43\% higher than SVD on average.
    Our algorithm creates basis vectors for a reduced space by iteratively
    ``scaling'' vectors and computing eigenvectors. Unlike SVD, it breaks
    the symmetry of documents and terms to capture information more
    evenly across documents. We also discuss correlation with a probabilistic
    model and evaluate a method for selecting the dimensionality using
    log-likelihood estimation.},
  bibdate = {2002-12-06},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/sigir/sigir2000.html#Ando00},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\ando00latent.pdf},
  url = {http://doi.acm.org/10.1145/345508.345579}
}

@book{books/wi/BaldiFS03,
  title = {Modeling the {Internet} and the {Web}: Probabilistic Methods and Algorithms},
  publisher = {John Wiley},
  year = {2003},
  author = {Pierre Baldi and Paolo Frasconi and Padhraic Smyth},
  bibdate = {2003-08-22},
  isbn = {0-470-84906-1},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\Modeling_the_Internet_and_the_Web_Probabilistic_Methods_and_Algorithms_(Wiley-2003).pdf},
  url = {http://ibook.ics.uci.edu/}
}

@inproceedings{conf/hicss/BasuWS03,
  author = {A. Basu and Carolyn R. Watters and Michael A. Shepherd},
  title = {Support Vector Machines for Text Categorization},
  booktitle = {HICSS},
  year = {2003},
  pages = {103},
  abstract = {Text categorization is the process of sorting text documents into
    one or more predefined categories or classes of similar documents.
    Differences in the results of such categorization arise from the
    feature set chosen to base the association of a given document with
    a given category. Advocates of text categorization recognize that
    the sorting of text documents into categories of like documents
    reduces the overhead required for fast retrieval of such documents
    and provides smaller domains in which the users may explore similar
    documents. In this paper we are interested in examining whether
    automatic classification of news texts can be improved by a prefiltering
    the vocabulary to reduce the feature set used in the computations.
    First we compare artificial neural network and support vector machine
    algorithms for use as text classifiers of news items. Secondly,
    we identify a reduction in feature set that provides improved results.},
  bibdate = {2003-06-17},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/hicss/hicss2003-4.html#BasuWS03},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\support_vectors.pdf},
  url = {http://computer.org/proceedings/hicss/1874/track4/187440103cabs.htm}
}

@article{betw-dwcvwtc-03,
  author = {Ron Bekkerman and Ran El-Yaniv and Naftali Tishby and Yoad Winter},
  title = {Distributional Word Clusters vs. Words for Text Categorization},
  journal = {Journal of Machine Learning Research},
  year = {2003},
  volume = {3},
  pages = {1183--1208},
  abstract = {We study an approach to text categorization that combines distributional
    clustering of words and a Support Vector Machine (SVM) classifier.
    This word-cluster representation is computed using the recently
    introduced Information Bottleneck method, which generates a compact
    and efficient representation of documents. When combined with the
    classification power of the SVM, this method yields high performance
    in text categorization. This novel combination of SVM with word-cluster
    representation is compared with SVM-based categorization using the
    simpler bag-of-words (BOW) representation. The comparison is performed
    over three known datasets. On one of these datasets (the 20 Newsgroups)
    the method based on word clusters significantly outperforms the
    word-based representation in terms of categorization accuracy or
    representation efficiency. On the two other sets (Reuters-21578
    and WebKB) the word-based representation slightly outperforms the
    word-cluster representation. We investigate the potential reasons
    for this behavior and relate it to structural differences between
    the datasets.},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\bekkerman03distributional.pdf}
}

@book{chakrabarti02mining,
  title = {Mining the {Web}. Discovering Knowledge from Hypertext Data},
  publisher = {Morgan Kaufmann Publishers},
  year = {2002},
  author = {Soumen Chakrabarti},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\Mining_the_Web_-_Discovering_Knowledge_from_Hypertext_Data_(Morgan_Kaufmann-2003).pdf}
}

@inproceedings{Dalessio00,
  author = {Stephen D'Alessio and Keitha Murray and Robert Schiaffino and
    Aaron Kershenbaum},
  title = {The effect of using Hierarchical classifiers in Text Categorization},
  booktitle = {Proceedings of RIAO-00, 6th International Conference ``Recherche d'Information
    Assist{\'e}e par Ordinateur''},
  year = {2000},
  pages = {302--313},
  address = {Paris, FR},
  abstract = {Given a set of categories, with or without a preexisting hierarchy
    among them, we consider the problem of assigning documents to one
    or more of these categories from the point of view of a hierarchy
    with more or less depth. We can choose to make use of none, part
    or all of the hierarchical structure to improve the categorization
    effectiveness and efficiency. It is possible to create additional
    hierarchy among the categories. We describe a procedure for generating
    a hierarchy of classifiers that model the hierarchy structure. We
    report on computational experience using this procedure. We show
    that judicious use of a hierarchy can significantly improve both
    the speed and effectiveness of the categorization process. Using
    the Reuters-21578 corpus, we obtain an improvement in running time
    of over a factor of three and a 5\% improvement in F-measure.},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\the-effect-of-using.pdf},
  url = {http://www.iona.edu/cs/FacultyPublications/riao2000New.pdf}
}

@misc{oai:CiteSeerPSU:613766,
  author = {Offer Drori},
  title = {Identifying the Subject of Documents in Digital Libraries Automatically
    Using Frequently-Occurring Words - Study and Findings},
  month = MAY # {~23},
  year = {2003},
  abstract = {Contemporary information databases contain millions of electronic
    documents. The immense number of documents makes it difficult to
    conduct efficient searches on the Internet. Several studies have
    found that associating documents with a subject or list of topics
    can make them easier to locate online [5] [6] [7]. Effective cataloging
    of information is performed manually, requiring extensive resources.
    Consequently, at present most information is not cataloged.},
  annote = {Offer Drori (The Hebrew University of Jerusalem ,
    Jerusalem; ISRAEL);},
  bibsource = {OAI-PMH server at cs1.ist.psu.edu},
  citeseer-references = {oai:CiteSeerPSU:388078; oai:CiteSeerPSU:445314;
    oai:CiteSeerPSU:306092; oai:CiteSeerPSU:354930;
    oai:CiteSeerPSU:46582; oai:CiteSeerPSU:3011;
    oai:CiteSeerPSU:110715},
  language = {en},
  oai = {oai:CiteSeerPSU:613766},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\drori02identifying.pdf},
  rights = {unrestricted},
  url = {http://citeseer.ist.psu.edu/613766.html},
  url-fulltext = {http://shum.huji.ac.il/~offerd/papers/drori042003-a.pdf}
}

@misc{oai:CiteSeerPSU:529607,
  author = {Dave Elliman and J. Rafael G. Pulido},
  title = {Automatic Derivation of On-line Document Ontologies},
  year = {2001},
  month = jul # {~26},
  abstract = {This paper describes a method for constructing an ontology
    which will represent the set of web pages on a specified site. We are
    developing a technique that will extract knowledge from digital
    sources, create ontologies containing reusable knowledge to be shared
    with software agents, and present a view of this knowledge to users.
    This method will provide a solution to the problem of classifying
    information and supporting mechanisms that explore its structure, as
    well as allowing knowledge to be extracted and shared with other
    software agents.},
  annote = {Dave Elliman (J.Rafael G.Pulido; Image Processing and
    Interpretation Research Group,; Computer Science and Information
    Technology School,; University of; Nottingham , United Kingdom);},
  bibsource = {OAI-PMH server at cs1.ist.psu.edu},
  citeseer-references = {oai:CiteSeerPSU:431849; oai:CiteSeerPSU:366691;
    oai:CiteSeerPSU:100962; oai:CiteSeerPSU:444390;
    oai:CiteSeerPSU:341007; oai:CiteSeerPSU:258861;
    oai:CiteSeerPSU:316519},
  language = {en},
  oai = {oai:CiteSeerPSU:529607},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\elliman01automatic.pdf},
  rights = {unrestricted},
  url = {http://citeseer.ist.psu.edu/529607.html; http://www.cs.nott.ac.uk/~jrp/./pub/merit01.pdf},
}

@inproceedings{conf/iv/EllimanP02,
  author = {Dave Elliman and J. R. G. Pulido},
  title = {Visualizing Ontology Components through Self-Organizing Maps},
  booktitle = {IV},
  year = {2002},
  pages = {434},
  abstract = {This paper describes a method for identifying Ontology
    components by using Self-Organizing Maps. Our system represents the
    knowledge contained in a particular digital archive by assembling and
    displaying the ontologies components. This novel approach provides an
    alternative solution to the problem of classifying on-line
    information and retrieval, support mechanisms that explore domains,
    and allows knowledge to be displayed in a browsable manner.},
  bibdate = {2005-05-02},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/conf/iv/iv2002.html#EllimanP02},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\visualizing-ontology-components-through.pdf},
  url = {http://csdl.computer.org/comp/proceedings/iv/2002/1656/00/16560434abs.htm},
}

@inproceedings{Florian:416,
  author = {Radu Florian and David Yarowsky},
  title = {Dynamic Nonlocal Language Modeling via Hierarchical
    Topic-Based Adaptation},
  booktitle = {37th Annual Meeting of the Association for Computational
    Linguistics},
  pages = {167--174},
  year = {1999},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\florian99dynamic.pdf},
}

@misc{oai:CiteSeerPSU:566368,
  author = {Hichem Frigui and Olfa Nasraoui},
  title = {Simultaneous Categorization of Text Documents And
    Identification of Cluster-dependent Keywords},
  year = {2002},
  month = apr # {~07},
  abstract = {In this paper, we propose a new approach to unsupervised
    text document categorization based on a coupled process of clustering
    and cluster-dependent keyword weighting. The proposed algorithm is
    based on the K-Means clustering algorithm. Hence it is
    computationally and implementationally simple. Moreover, it learns a
    different set of keyword weights for each cluster. This means that,
    as a by-product of the clustering process, each document cluster will
    be characterized by a possibly different set of keywords. The cluster
    dependent keyword weights have two advantages. First, they help in
    partitioning the document collection into more meaningful
    categories.},
  annote = {Hichem Frigui (Department of Electrical and Computer
    Engineering; University of Memphis; Campus Box 526574 , Memphis , TN
    38152); Olfa Nasraoui (Department of Electrical and Computer
    Engineering; University of Memphis; Campus Box 526574 , Memphis , TN
    38152);},
  bibsource = {OAI-PMH server at cs1.ist.psu.edu},
  citeseer-references = {oai:CiteSeerPSU:322454; oai:CiteSeerPSU:15856;
    oai:CiteSeerPSU:112012; oai:CiteSeerPSU:196975;
    oai:CiteSeerPSU:164433; oai:CiteSeerPSU:3156; oai:CiteSeerPSU:98891;
    oai:CiteSeerPSU:349805; oai:CiteSeerPSU:214617;
    oai:CiteSeerPSU:332454; oai:CiteSeerPSU:69940},
  language = {en},
  oai = {oai:CiteSeerPSU:566368},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\frigui02simultaneous.pdf},
  rights = {unrestricted},
  url = {http://citeseer.ist.psu.edu/566368.html; http://www.ee.memphis.edu/people/faculty/nasraoui/publications/SIAM02_TXT_WKSHP.pdf},
}

@inproceedings{Guo04,
  author = {Gongde Guo and Hui Wang and David A. Bell and Yaxin Bi and
    Kieran Greer},
  title = {An k{NN} Model-Based Approach and Its Application in Text
    Categorization},
  booktitle = {Proceedings of CICLING-04, 5th International Conference on
    Computational Linguistics and Intelligent Text Processing},
  year = {2004},
  editor = {Alexander F. Gelbukh},
  series = {Lecture Notes in Computer Science},
  volume = {2945},
  pages = {559--570},
  address = {Seoul, KR},
  publisher = {Springer Verlag, Heidelberg, DE},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\unknown.pdf},
}

@misc{oai:CiteSeerPSU:584708,
  author = {Andreas Hotho and Gerd Stumme},
  title = {Conceptual Clustering of Text Clusters},
  year = {2002},
  month = may # {~23},
  abstract = {Common clustering techniques have the disadvantage that
    they do not provide intensional descriptions of the clusters
    obtained. Conceptual Clustering techniques, on the other hand,
    provide such descriptions, but are known to be rather slow. In this
    paper, we discuss a way of combining both techniques. We first
    cluster the documents by a variant of k-Means, using a thesaurus as
    background knowledge. This clustering reduces the large number of
    documents to a relatively small number of clusters, which can then be
    clustered conceptually in the second step.},
  annote = {Andreas Hotho (Institute of Applied Informatics and Formal
    Description Methods AIFB , University of Karlsruhe; D--76128
    Karlsruhe); Gerd Stumme (Institute of Applied Informatics and Formal
    Description Methods AIFB , University of Karlsruhe; D--76128
    Karlsruhe);},
  bibsource = {OAI-PMH server at cs1.ist.psu.edu},
  citeseer-references = {oai:CiteSeerPSU:582032; oai:CiteSeerPSU:70718;
    oai:CiteSeerPSU:380343; oai:CiteSeerPSU:246639;
    oai:CiteSeerPSU:573923; oai:CiteSeerPSU:301272;
    oai:CiteSeerPSU:274006},
  language = {en},
  oai = {oai:CiteSeerPSU:584708},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\hotho02conceptual.pdf},
  rights = {unrestricted},
  url = {http://citeseer.ist.psu.edu/584708.html; http://www.aifb.uni-karlsruhe.de/WBS/aho/pub/tc_fca_2002_submit.pdf},
}

@inproceedings{JM02b,
  author = {M. Jarrar and R. Meersman},
  title = {Scalability and Knowledge Reusability in Ontology Modeling},
  booktitle = {Proceedings of the International conference on
    Infrastructure for e-Business, e-Education, e-Science, and
    e-Medicine},
  editor = {Veljko Milutinovic},
  volume = {SSGRR2002s},
  publisher = {SSGRR education center},
  address = {Rome, Italy},
  year = {2002},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\jarrar02scalability.pdf},
  source = {http://www.jarrar.info/publications/},
}

@techreport{KarypisHan,
  author = {George Karypis and Eui-Hong Han},
  title = {Concept indexing: {A} fast dimensionality reduction algorithm
    with applications to document retrieval and categorization},
  institution = {Computer Science Department, University of Minnesota},
  number = {TR-00-0016},
  year = {2000},
  abstract = {In recent years, we have seen a tremendous growth in the
    volume of text documents available on the Internet, digital
    libraries, news sources, and company-wide intranets. This has led to
    an increased interest in developing methods that can efficiently
    categorize and retrieve relevant information. Retrieval techniques
    based on dimensionality reduction, such as Latent Semantic Indexing
    (LSI), have been shown to improve the quality of the information
    being retrieved by capturing the latent meaning of the words present
    in the documents. Unfortunately, the high computational requirements
    of LSI and its inability to compute an effective dimensionality
    reduction in a supervised setting limits its applicability. In this
    paper we present a fast dimensionality reduction algorithm, called
    concept indexing (CI) that is equally effective for unsupervised and
    supervised dimensionality reduction. CI computes a k-dimensional
    representation of a collection of documents by first clustering the
    documents into k groups, and then using the centroid vectors of the
    clusters to derive the axes of the reduced k-dimensional space.
    Experimental results show that the dimensionality reduction computed
    by CI achieves comparable retrieval performance to that obtained
    using LSI, while requiring an order of magnitude less time. Moreover,
    when CI is used to compute the dimensionality reduction in a
    supervised setting, it greatly improves the performance of
    traditional classification algorithms such as C4.5 and kNN.},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\karypis00concept.pdf},
}

@inproceedings{Kudo04,
  author = {Taku Kudo and Yuji Matsumoto},
  title = {A Boosting Algorithm for Classification of Semi-Structured
    Text},
  booktitle = {Proceedings of EMNLP-04, 9th Conference on Empirical
    Methods in Natural Language Processing},
  year = {2004},
  address = {Barcelona, ES},
  abstract = {The focus of research in text classification has expanded
    from simple topic identification to more challenging tasks such as
    opinion/modality identification. Unfortunately, the latter goals
    exceed the ability of the traditional bag-of-word representation
    approach, and a richer, more structural representation is required.
    Accordingly, learning algorithms must be created that can handle the
    structures observed in texts. In this paper, we propose a Boosting
    algorithm that captures sub-structures embedded in texts. The
    proposal consists of i) decision stumps that use subtrees as features
    and ii) the Boosting algorithm which employs the subtree-based
    decision stumps as weak learners. We also discuss the relation
    between our algorithm and SVMs with tree kernel. Two experiments on
    opinion/modality classification confirm that subtree features are
    important.},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\emnlp2004-1.pdf},
  url = {http://chasen.org/~taku/publications/emnlp2004-1.pdf},
}

@misc{oai:CiteSeerPSU:458733,
  author = {Dawn Lawrie and W. Bruce Croft},
  title = {Discovering and Comparing Topic Hierarchies},
  year = {2000},
  month = oct # {~13},
  abstract = {Hierarchies have been used for organization, summarization,
    and access to information, yet a lingering issue is how best to
    construct them. In this paper, our goal is to automatically create
    domain specific hierarchies that can be used for browsing a document
    set and locating relevant documents. We examine methods of
    automatically generating hierarchies and evaluating them. To this
    end, we compare and contrast two methods of generating topic
    hierarchies from the text of documents: one, subsumption hierarchies,
    uses subsumption relations found within document sets, and the other,
    lexical hierarchies, utilizes frequently used words within phrases.
    Our evaluation shows that subsumption hierarchies divide documents
    into smaller groups, allowing one to find all relevant documents
    without looking at as many non-relevant documents. However, such
    hierarchies are more likely to contain no path to a relevant
    document.},
  annote = {Dawn Lawrie (Department of Computer Science; University of
    Massachusetts; Amherst , MA 01003 USA); W. Bruce Croft (Department of
    Computer Science; University of Massachusetts; Amherst , MA 01003
    USA);},
  bibsource = {OAI-PMH server at cs1.ist.psu.edu},
  citeseer-references = {oai:CiteSeerPSU:26307; oai:CiteSeerPSU:15856;
    oai:CiteSeerPSU:211368; oai:CiteSeerPSU:187156;
    oai:CiteSeerPSU:455178; oai:CiteSeerPSU:209382},
  language = {en},
  oai = {oai:CiteSeerPSU:458733},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\lawrie00discovering.pdf},
  rights = {unrestricted},
  url = {http://citeseer.ist.psu.edu/458733.html; http://ciir.cs.umass.edu/pubfiles/ir-183.pdf},
}

@inproceedings{IMM2004-02894,
  author = {R. E. Madsen and J. Larsen and L. K. Hansen},
  title = {Part-of-Speech Enhanced Context Recognition},
  booktitle = {Proceedings of {IEEE} Workshop on Machine Learning for
    Signal Processing {XIV}},
  year = {2004},
  editor = {A. K. Barros and J. Principe and J. Larsen and T. Adali and
    S. Douglas},
  pages = {635--644},
  address = {Piscataway, New Jersey},
  month = sep,
  publisher = {{IEEE} Press},
  abstract = {Language independent `bag-of-words' representations are
    surprisingly effective for text classification. In this communication
    our aim is to elucidate the synergy between language independent
    features and simple language model features. We consider term tag
    features estimated by a so-called part-of-speech tagger. The feature
    sets are combined in an early binding design with an optimized
    binding coefficient that allows weighting of the relative variance
    contributions of the participating feature sets. With the combined
    features documents are classified using a latent semantic indexing
    representation and a probabilistic neural network classifier. Three
    medium size data-sets are analyzed and we find consistent synergy
    between the term and natural language features in all three sets for
    a range of training set sizes. The most significant enhancement is
    found for small text databases where high recognition rates are
    possible.},
  isbn = {0-7803-8609-4},
  keywords = {text mining, latent space, context recognition},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\part-of-speech-enhanced.pdf},
  url = {http://www2.imm.dtu.dk/pubdb/p.php?2894},
}

@article{Park04,
  author = {Seong-Bae Park and Byoung-Tak Zhang},
  title = {Co-trained support vector machines for large scale
    unstructured document classification using unlabeled data and
    syntactic information},
  journal = {Information Processing and Management},
  volume = {40},
  number = {3},
  pages = {421--439},
  year = {2004},
  abstract = {Most document classification systems consider only the
    distribution of content words of the documents, ignoring the
    syntactic information underlying the documents though it is also an
    important factor. In this paper, we present an approach for
    classifying large scale unstructured documents by incorporating both
    lexical and syntactic information of documents. For this purpose, we
    use the co-training algorithm, a partially supervised learning
    algorithm, in which two separated views for the training data are
    employed and the small number of labeled data are augmented by a
    large number of unlabeled data. Since both lexical and syntactic
    information can play roles of separated views for the unstructured
    documents, the co-training algorithm enhances the performance of
    document classification using both of them and a large number of
    unlabeled documents. The experimental results on Reuters-21578 corpus
    and TREC-7 filtering documents show the effectiveness of unlabeled
    documents and the use of both lexical and syntactic information.},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\LNAI2637_Park.pdf},
}

@techreport{oai:CiteSeerPSU:542295,
  author = {April Kontostathis and William M. Pottenger},
  title = {Detecting Patterns in the {LSI} Term-Term Matrix},
  institution = {Department of Computer Science and Engineering, Lehigh
    University},
  number = {LU-CSE-02-010},
  year = {2002},
  month = sep # {~25},
  abstract = {applications use techniques that explicitly or implicitly
    employ a limited degree of transitivity in the co-occurrence
    relation. In this work we show use of higher orders of co-occurrence
    in the Singular Value Decomposition (SVD) algorithm and, by
    inference, on the systems that rely on SVD, such as LSI. Our
    empirical and mathematical studies prove that term cooccurrence plays
    a crucial role in LSI.},
  annote = {William M. Pottenger (CSE Department; 19 Memorial Drive
    West); Ph. D (CSE Department; 19 Memorial Drive West);},
  bibsource = {OAI-PMH server at cs1.ist.psu.edu},
  citeseer-references = {oai:CiteSeerPSU:189631; oai:CiteSeerPSU:441438;
    oai:CiteSeerPSU:144832; oai:CiteSeerPSU:576970;
    oai:CiteSeerPSU:23424; oai:CiteSeerPSU:8085},
  language = {en},
  oai = {oai:CiteSeerPSU:542295},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\kontostathis02detecting.pdf},
  rights = {unrestricted},
  url = {http://citeseer.ist.psu.edu/542295.html; http://www.cse.lehigh.edu/techreports/2002/LU-CSE-02-010.pdf},
}

@misc{oai:CiteSeerPSU:555050,
  author = {David Ramamonjisoa},
  title = {Towards Automated Research Topics Discovery on Scientific
    Domain by Agents System},
  year = {2003},
  month = jan # {~02},
  abstract = {In our project on multiagent for web mining, we developed
    KAROKA (Keywords Association Rules Optimizer Knobots Advisers) as a
    model of discovery in text database used in WWW. In this paper, we
    explain our model and its application to discover new research topics
    in scientific domain on the web. This tool aims to support
    researchers for their bibliographical investigation and help to avoid
    information overload. The WWW sources are converted into a highly
    structured collection of text. Then, KAROKA tries to extract topics,
    association rules, regularities, exception and useful information in
    the collection of text.},
  annote = {David Ramamonjisoa (Faculty of Software and Information
    Science; Iwate Prefectural University; Takizawa , Iwate , Japan
    020--0273);},
  bibsource = {OAI-PMH server at cs1.ist.psu.edu},
  citeseer-references = {oai:CiteSeerPSU:19801; oai:CiteSeerPSU:458698;
    oai:CiteSeerPSU:348324; oai:CiteSeerPSU:179712;
    oai:CiteSeerPSU:68861; oai:CiteSeerPSU:55671; oai:CiteSeerPSU:563035;
    oai:CiteSeerPSU:454529; oai:CiteSeerPSU:456928;
    oai:CiteSeerPSU:438592},
  language = {en},
  oai = {oai:CiteSeerPSU:555050},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\ramamonjisoa03towards.pdf},
  rights = {unrestricted},
  url = {http://citeseer.ist.psu.edu/555050.html; http://www.ssgrr.it/en/ssgrr2003w/papers/157.pdf},
}

@book{Scime2005,
  editor = {Anthony Scime},
  title = {Web Mining: Applications and Techniques},
  publisher = {Idea Group},
  year = {2005},
  owner = {Andr�s},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\Idea.Group.Publishing.Web.Mining.Applications.and.Techniques.Aug.2004.eBook-DDU.pdf},
  timestamp = {2006.04.10},
}

@inproceedings{Sinka:2004:EDFfWDCAFS,
  author = {Mark Sinka and David Corne},
  title = {Evolving Document Features for Web Document Clustering: {A}
    Feasibility Study},
  booktitle = {Proceedings of the 2004 IEEE Congress on Evolutionary
    Computation},
  year = {2004},
  pages = {891--897},
  address = {Portland, Oregon},
  month = jun # {~20--23},
  publisher = {IEEE Press},
  abstract = {Document analysis research underpins the envisaged
    'semantic web'. A key issue is how to encode a document without
    losing salient information. Current research almost always uses
    fixed-length vectors based on word (term) frequency (TF) and/or
    variants thereof. We explore alternative encodings using an
    evolutionary algorithm (EA). These alternatives use a variety of
    other features that can be extracted from a document, and the EA
    explores the space of weighted combinations of these. Tests are able
    to find encodings which outperform previous results. Among several
    tentative findings it seems clear that the ideal encoding is highly
    task-dependent, and we can recommend certain features as useful for
    specific types of document clustering tasks.},
  isbn = {0-7803-8515-2},
  keywords = {Other, Real-world applications},
  notes = {CEC 2004 - A joint meeting of the IEEE, the EPS, and the IEE.},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\S020P003.pdf},
}

@inproceedings{Slonim01,
  author = {Noam Slonim and Naftali Tishby},
  title = {The Power of Word Clusters for Text Classification},
  booktitle = {Proceedings of ECIR-01, 23rd European Colloquium on
    Information Retrieval Research},
  address = {Darmstadt, DE},
  year = {2001},
  abstract = {The recently introduced Information Bottleneck method
    provides an information theoretic framework, for extracting features
    of one variable, that are relevant for the values of another
    variable. Several previous works already suggested applying this
    method for document clustering, gene expression data analysis,
    spectral analysis and more. In this work we present a novel
    implementation of this method for supervised text classification.
    Specifically, we apply the information bottleneck method to find
    word-clusters that preserve the information about document categories
    and use these clusters as features for classification. Previous work
    used a similar clustering procedure to show that word-clusters can
    significantly reduce the feature space dimensionality, with only a
    minor change in classification accuracy. In this work we reproduce
    these results and go further to show that when the training sample is
    small word clusters can yield significant improvement in
    classification accuracy (up to 18\%) over the performance using the
    words directly.},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\slonim01power.pdf},
  url = {http://www.cs.huji.ac.il/labs/learning/Papers/irsg3.eps.gz},
}

@inproceedings{Strehl:2000:ISM,
  author = {Alexander Strehl and Joydeep Ghosh and Raymond Mooney},
  title = {Impact of Similarity Measures on Web-page Clustering},
  booktitle = {Proceedings of the 17th National Conference on Artificial
    Intelligence: Workshop of Artificial Intelligence for Web Search
    (AAAI 2000), 30--31 July 2000, Austin, Texas, USA},
  publisher = {AAAI},
  pages = {58--64},
  month = jul,
  year = {2000},
  bibdate = {Sat Apr 20 15:28:13 2002},
  bibsource = {ftp://ftp.math.utah.edu/pub/bibnet/authors/s/strehl-alexander.bib},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\strehl00impact.pdf},
}

@misc{oai:CiteSeerPSU:563891,
  author = {Domonkos Tikk and Jae Dong Yang and Sun Lee Bang},
  title = {Hierarchical Text Categorization Using Fuzzy Relational
    Thesaurus},
  year = {2003},
  month = apr # {~22},
  abstract = {Text categorization is the classification to assign a text
    document to an appropriate category in a predefined set of
    categories. We present a new approach for the text categorization by
    means of Fuzzy Relational Thesaurus (FRT). FRT is a multilevel
    category system that stores and maintains adaptive local dictionary
    for each category. The goal of our approach is twofold; to develop a
    reliable text categorization method on a certain subject domain, and
    to expand the initial FRT by automatically added terms, thereby
    obtaining an incrementally defined knowledge base of the domain. We
    implemented the categorization algorithm and compared it with some
    other hierarchical classifiers. Experimental results have been shown
    that our algorithm outperforms its rivals on all document corpora
    investigated.},
  annote = {Jae Dong Yang (Dept . of Computer Science , Chonbuk National
    University; Chonju 561--756 , Korea); Sun Lee Bang (Intelligent
    Integrated Systems Japanese--Hungarian Laboratory; 1111 Budapest ,
    Muegyetem rakpart 3. , Hungary);},
  bibsource = {OAI-PMH server at cs1.ist.psu.edu},
  citeseer-references = {oai:CiteSeerPSU:274056; oai:CiteSeerPSU:253930;
    oai:CiteSeerPSU:13159; oai:CiteSeerPSU:382331;
    oai:CiteSeerPSU:552405; oai:CiteSeerPSU:553162;
    oai:CiteSeerPSU:211368; oai:CiteSeerPSU:582940;
    oai:CiteSeerPSU:10425; oai:CiteSeerPSU:147931;
    oai:CiteSeerPSU:332789; oai:CiteSeerPSU:322454;
    oai:CiteSeerPSU:84047; oai:CiteSeerPSU:93679},
  language = {en},
  oai = {oai:CiteSeerPSU:563891},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\hierarchical-text-categorization-using.pdf},
  rights = {unrestricted},
  url = {http://citeseer.ist.psu.edu/563891.html; http://www.mft.hu/publications/tikk/Kybernetika.pdf},
}

@misc{oai:CiteSeerPSU:541094,
  author = {J. J. Verbeek},
  title = {Supervised Feature Extraction for Text Categorization},
  year = {2002},
  month = feb # {~14},
  abstract = {This paper concerns finding the `optimal' number of word
    groups for text classification. We present a method to select which
    words to cluster into word groups and how many such word groups to
    use on the basis of a set of pre-classified texts. The method
    involves a `greedy' search through the space of possible word groups.
    The words are grouped according to the `Jensen-Shannon divergence'
    between the corresponding distributions over the classes. The
    criterion to decide which number of word groups to use is based on
    Rissanen's MDL Principle. We present empirical results that indicate
    that the proposed method performs well. Furthermore, the proposed
    method outperforms cross-validation in the sense that far fewer word
    groups are selected while prediction accuracy is just slightly worse.
    For the experimentation we used a subset of the `20 Newsgroup'
    dataset [10].},
  bibsource = {OAI-PMH server at cs1.ist.psu.edu},
  citeseer-references = {oai:CiteSeerPSU:433337; oai:CiteSeerPSU:485452;
    oai:CiteSeerPSU:93401; oai:CiteSeerPSU:553162;
    oai:CiteSeerPSU:93679},
  language = {en},
  oai = {oai:CiteSeerPSU:541094},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\supervised-feature-extraction-for.pdf},
  rights = {unrestricted},
  url = {http://citeseer.ist.psu.edu/541094.html; http://carol.wins.uva.nl/~jverbeek/publications/../pub/benelearn.ps.gz},
}

@article{journals/corr/cs-DL-9902007,
  author = {Ian H. Witten and Gordon W. Paynter and Eibe Frank and Carl
    Gutwin and Craig G. Nevill-Manning},
  title = {{KEA}: Practical Automatic Keyphrase Extraction},
  journal = {CoRR},
  year = {1999},
  volume = {cs.DL/9902007},
  abstract = {Keyphrases provide semantic metadata that summarize and
    characterize documents. This paper describes Kea, an algorithm for
    automatically extracting keyphrases from text. Kea identifies
    candidate keyphrases using lexical methods, calculates feature values
    for each candidate, and uses a machine-learning algorithm to predict
    which candidates are good keyphrases. The machine learning scheme
    first builds a prediction model using training documents with known
    keyphrases, and then uses the model to find keyphrases in new
    documents. We use a large test corpus to evaluate Kea's effectiveness
    in terms of how many author-assigned keyphrases are correctly
    identified. The system is simple, robust, and available under the GNU
    General Public License; the paper gives instructions for use.},
  bibdate = {2004-05-19},
  bibsource = {DBLP,
    http://dblp.uni-trier.de/db/journals/corr/corr9902.html#cs-DL-9902007},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\ian99kea.pdf},
  url = {http://arxiv.org/abs/cs.DL/9902007},
}

@misc{oai:CiteSeerPSU:328087,
  author = {Wai-chiu Wong and Ada Wai-chee Fu},
  title = {Incremental Document Clustering for Web Page Classification},
  year = {2000},
  month = aug # {~31},
  abstract = {Motivated by the benefits in organizing the documents in
    Web search engines, we consider the problem of automatic Web page
    classification. We employ the clustering techniques. Each document is
    represented by a feature vector. By analyzing the clusters formed by
    these vectors, we can assign the documents within the same cluster to
    the same class automatically. Our contributions are the following:
    (1) We propose a feature extraction mechanism which is more suitable
    to Web page classification. (2) We introduce a tree structure called
    the DC-tree to make the clustering process incremental and less
    sensitive to the document insertion order. (3) We show with
    experiments on a set of Internet documents from Yahoo! that the
    proposed clustering algorithm can classify Web pages effectively.},
  keywords = {Incremental update, Tree, Document, Clustering, Web,
    Classification},
  annote = {Wai-chiu Wong (Department of Computer Science and
    Engineering; The Chinese University of Hong Kong; Shatin , Hong
    Kong); Ada Wai-chee Fu (Department of Computer Science and
    Engineering; The Chinese University of Hong Kong; Shatin , Hong
    Kong);},
  bibsource = {OAI-PMH server at cs1.ist.psu.edu},
  citeseer-references = {oai:CiteSeerPSU:484762; oai:CiteSeerPSU:13982;
    oai:CiteSeerPSU:388427; oai:CiteSeerPSU:45755;
    oai:CiteSeerPSU:100508; oai:CiteSeerPSU:33829;
    oai:CiteSeerPSU:571734; oai:CiteSeerPSU:322454; oai:CiteSeerPSU:3011;
    oai:CiteSeerPSU:514099},
  language = {en},
  oai = {oai:CiteSeerPSU:328087},
  pdf = {D:\Andr�s\Maestr�a\Seminario I\Referencias\Organizadas\wong00incremental.pdf},
  rights = {unrestricted},
  url = {http://citeseer.ist.psu.edu/328087.html; http://www.cs.cuhk.hk/~adafu/Pub/IS2000.ps},
}

This file has been generated by bibtex2html 1.79

Hosted by www.Geocities.ws