add nltk_data
This commit is contained in:
parent
79f1420a14
commit
ca49799fb2
|
|
@ -1,5 +1,9 @@
|
|||
from configs.model_config import *
|
||||
from chains.local_doc_qa import LocalDocQA
|
||||
import os
|
||||
import nltk
|
||||
|
||||
nltk.data.path = [os.path.join(os.path.dirname(__file__), "nltk_data")] + nltk.data.path
|
||||
|
||||
# return top-k text chunk from vector store
|
||||
VECTOR_SEARCH_TOP_K = 6
|
||||
|
|
|
|||
|
|
@ -0,0 +1,76 @@
|
|||
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.7a]
|
||||
|
||||
ftp://ftp.cs.cmu.edu/project/speech/dict/
|
||||
https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict/cmudict.0.7a
|
||||
|
||||
Copyright (C) 1993-2008 Carnegie Mellon University. All rights reserved.
|
||||
|
||||
File Format: Each line consists of an uppercased word,
|
||||
a counter (for alternative pronunciations), and a transcription.
|
||||
Vowels are marked for stress (1=primary, 2=secondary, 0=no stress).
|
||||
E.g.: NATURAL 1 N AE1 CH ER0 AH0 L
|
||||
|
||||
The dictionary contains 127069 entries. Of these, 119400 words are assigned
|
||||
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
|
||||
three or more pronunciations. Many of these are fast-speech variants.
|
||||
|
||||
Phonemes: There are 39 phonemes, as shown below:
|
||||
|
||||
Phoneme Example Translation Phoneme Example Translation
|
||||
------- ------- ----------- ------- ------- -----------
|
||||
AA odd AA D AE at AE T
|
||||
AH hut HH AH T AO ought AO T
|
||||
AW cow K AW AY hide HH AY D
|
||||
B be B IY CH cheese CH IY Z
|
||||
D dee D IY DH thee DH IY
|
||||
EH Ed EH D ER hurt HH ER T
|
||||
EY ate EY T F fee F IY
|
||||
G green G R IY N HH he HH IY
|
||||
IH it IH T IY eat IY T
|
||||
JH gee JH IY K key K IY
|
||||
L lee L IY M me M IY
|
||||
N knee N IY NG ping P IH NG
|
||||
OW oat OW T OY toy T OY
|
||||
P pee P IY R read R IY D
|
||||
S sea S IY SH she SH IY
|
||||
T tea T IY TH theta TH EY T AH
|
||||
UH hood HH UH D UW two T UW
|
||||
V vee V IY W we W IY
|
||||
Y yield Y IY L D Z zee Z IY
|
||||
ZH seizure S IY ZH ER
|
||||
|
||||
(For NLTK, entries have been sorted so that, e.g. FIRE 1 and FIRE 2
|
||||
are contiguous, and not separated by FIRE'S 1.)
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
The contents of this file are deemed to be source code.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
This work was supported in part by funding from the Defense Advanced
|
||||
Research Projects Agency, the Office of Naval Research and the National
|
||||
Science Foundation of the United States of America, and by member
|
||||
companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
|
||||
the contributions of many volunteers to the expansion and improvement of
|
||||
this dictionary.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
|
||||
ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
|
||||
NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
|
@ -0,0 +1,98 @@
|
|||
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
|
||||
|
||||
Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
|
||||
been contributed by various people using NLTK for sentence boundary detection.
|
||||
|
||||
For information about how to use these models, please confer the tokenization HOWTO:
|
||||
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
|
||||
and chapter 3.8 of the NLTK book:
|
||||
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
|
||||
|
||||
There are pretrained tokenizers for the following languages:
|
||||
|
||||
File Language Source Contents Size of training corpus(in tokens) Model contributed by
|
||||
=======================================================================================================================================================================
|
||||
czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
|
||||
Literarni Noviny
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
|
||||
(Berlingske Avisdata, Copenhagen) Weekend Avisen
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
|
||||
(American)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
|
||||
Text Bank (Suomen Kielen newspapers
|
||||
Tekstipankki)
|
||||
Finnish Center for IT Science
|
||||
(CSC)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
|
||||
(European)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
|
||||
(Switzerland) CD-ROM
|
||||
(Uses "ss"
|
||||
instead of "ß")
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
|
||||
(Bokmål and Information Technologies,
|
||||
Nynorsk) Bergen
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
|
||||
(http://www.nkjp.pl/)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
|
||||
(Brazilian) (Linguateca)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
|
||||
Slovene Academy for Arts
|
||||
and Sciences
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
|
||||
(European)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
|
||||
(and some other texts)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
|
||||
(Türkçe Derlem Projesi)
|
||||
University of Ankara
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
|
||||
Unicode using the codecs module.
|
||||
|
||||
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
|
||||
Computational Linguistics 32: 485-525.
|
||||
|
||||
---- Training Code ----
|
||||
|
||||
# import punkt
|
||||
import nltk.tokenize.punkt
|
||||
|
||||
# Make a new Tokenizer
|
||||
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
|
||||
|
||||
# Read in training corpus (one example: Slovene)
|
||||
import codecs
|
||||
text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
|
||||
|
||||
# Train tokenizer
|
||||
tokenizer.train(text)
|
||||
|
||||
# Dump pickled tokenizer
|
||||
import pickle
|
||||
out = open("slovene.pickle","wb")
|
||||
pickle.dump(tokenizer, out)
|
||||
out.close()
|
||||
|
||||
---------
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,98 @@
|
|||
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
|
||||
|
||||
Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
|
||||
been contributed by various people using NLTK for sentence boundary detection.
|
||||
|
||||
For information about how to use these models, please confer the tokenization HOWTO:
|
||||
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
|
||||
and chapter 3.8 of the NLTK book:
|
||||
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
|
||||
|
||||
There are pretrained tokenizers for the following languages:
|
||||
|
||||
File Language Source Contents Size of training corpus(in tokens) Model contributed by
|
||||
=======================================================================================================================================================================
|
||||
czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
|
||||
Literarni Noviny
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
|
||||
(Berlingske Avisdata, Copenhagen) Weekend Avisen
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
|
||||
(American)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
|
||||
Text Bank (Suomen Kielen newspapers
|
||||
Tekstipankki)
|
||||
Finnish Center for IT Science
|
||||
(CSC)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
|
||||
(European)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
|
||||
(Switzerland) CD-ROM
|
||||
(Uses "ss"
|
||||
instead of "ß")
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
|
||||
(Bokmål and Information Technologies,
|
||||
Nynorsk) Bergen
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
|
||||
(http://www.nkjp.pl/)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
|
||||
(Brazilian) (Linguateca)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
|
||||
Slovene Academy for Arts
|
||||
and Sciences
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
|
||||
(European)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
|
||||
(and some other texts)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
|
||||
(Türkçe Derlem Projesi)
|
||||
University of Ankara
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
|
||||
Unicode using the codecs module.
|
||||
|
||||
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
|
||||
Computational Linguistics 32: 485-525.
|
||||
|
||||
---- Training Code ----
|
||||
|
||||
# import punkt
|
||||
import nltk.tokenize.punkt
|
||||
|
||||
# Make a new Tokenizer
|
||||
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
|
||||
|
||||
# Read in training corpus (one example: Slovene)
|
||||
import codecs
|
||||
text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
|
||||
|
||||
# Train tokenizer
|
||||
tokenizer.train(text)
|
||||
|
||||
# Dump pickled tokenizer
|
||||
import pickle
|
||||
out = open("slovene.pickle","wb")
|
||||
pickle.dump(tokenizer, out)
|
||||
out.close()
|
||||
|
||||
---------
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
3
webui.py
3
webui.py
|
|
@ -3,6 +3,9 @@ import os
|
|||
import shutil
|
||||
from chains.local_doc_qa import LocalDocQA
|
||||
from configs.model_config import *
|
||||
import nltk
|
||||
|
||||
nltk.data.path = [os.path.join(os.path.dirname(__file__), "nltk_data")] + nltk.data.path
|
||||
|
||||
# return top-k text chunk from vector store
|
||||
VECTOR_SEARCH_TOP_K = 6
|
||||
|
|
|
|||
Loading…
Reference in New Issue