# trump_bot/corpus.py
import json
import os
from math import floor
from typing import Dict, List

from torchtext.data.utils import get_tokenizer
from unidecode import unidecode

# (decode_tweet and the tweet type are defined in the project's own tweet module)


class dictionary():
    '''
    A dictionary which contains all words in the training set.
    '''

    def __init__(self) -> None:
        '''
        Initialize the dictionary.
        '''
        self.idx2word: List[str] = []
        self.idx2freq: List[int] = []
        self.word2idx: Dict[str, int] = {}
        self.freq_threshold = 1
        self.start_pos = 3

        self.unk = '<unk>'  # unknown word
        self.eos = '<eos>'  # end of sentence
        self.sos = '<sos>'  # start of sentence
        self.add_word(self.unk)
        self.add_word(self.eos)
        self.add_word(self.sos)

    def len(self) -> int:
        '''
        Return the current size of the dictionary.
        '''
        return len(self.idx2word)

    def add_word(self, word: str) -> int:
        '''
        Add a new word to the dictionary.
        Return the index of the word.

        :param word: new word
        '''
        if word not in self.idx2word:
            idx = self.word2idx[word] = self.len()
            self.idx2word.append(word)
            self.idx2freq.append(1)
        else:
            idx = self.word2idx[word]
            self.idx2freq[idx] += 1

        return idx

    def clear_words(self) -> None:
        '''
        Remove infrequent words that appear at most `freq_threshold` times.
        '''
        i: int = self.start_pos

        while i < len(self.idx2word):
            if self.idx2freq[i] and self.idx2freq[i] <= self.freq_threshold:
                removed_word: str = self.idx2word[i]
                self.word2idx.pop(removed_word)

                last_word: str = self.idx2word.pop()
                if i >= len(self.idx2word):
                    self.idx2freq.pop()
                    break

                # Swap the removed word with the last word in the dictionary
                self.idx2word[i] = last_word
                self.idx2freq[i] = self.idx2freq.pop()
                self.word2idx[last_word] = i
            else:
                i += 1
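To make the dictionary's behaviour concrete, here is a minimal usage sketch; the sample words are made up, and only the three special tokens come from the class itself:

d = dictionary()
print(d.len())                          # 3: <unk>, <eos> and <sos> are added on construction
for w in ['great', 'again', 'great']:
    d.add_word(w)
print(d.idx2freq[d.word2idx['great']])  # 2: repeated words only bump the frequency counter
d.clear_words()                         # drops 'again', which appeared only freq_threshold (1) time
print('again' in d.word2idx)            # False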
# trump_bot/corpus.py
class corpus():
    '''
    A corpus built with the training set.
    '''

    def __init__(self) -> None:
        '''
        Initialize the corpus.
        '''
        self.json_dir: str = os.path.realpath('data/raw_json')
        self.text_dir: str = os.path.realpath('data/text')
        self.data_file = 'data.txt'

        self.train_set: List[str] = []
        self.train_proportion = 0.4
        self.dev_set: List[str] = []
        self.dev_proportion = 0.4
        self.test_set: List[str] = []
        self.test_proportion = 0.2

        self.dictionary = dictionary()

    def get_text_data(self, file_name: str, all_in_one: bool = False) -> None:
        '''
        Parse a dataset from JSON to plain text.

        :param file_name: file name of the dataset without extension
        :param all_in_one: write to a single file
        '''

        def _filter_text(text: str) -> str:
            '''
            Filter a line of text and replace certain words.
            Return the filtered text.

            :param text: input text
            '''
            # Un-escape HTML-encoded ampersands
            return (
                text
                .replace('&amp;', '&')
                .replace('&,', '&')
            )

        json_path: str = os.path.join(self.json_dir, file_name + '.json')
        try:
            with open(json_path, 'r', encoding='utf-8') as fi:
                data: List[dict] = json.load(fi)
        except FileNotFoundError:
            data: List[dict] = []

        text_name: str = self.data_file if all_in_one else file_name + '.txt'
        text_path: str = os.path.join(self.text_dir, text_name)
        buffer_size = 1 << 20  # 1 MB
        tokenizer = get_tokenizer('spacy')

        with open(text_path, 'a' if all_in_one else 'w', buffering=buffer_size) as fo:
            buffer: str = ''

            # Reverse the list to sort by time in ascending order
            for entry in reversed(data):
                t: tweet = decode_tweet(entry)
                text: str = _filter_text(unidecode(t.text))
                words: List[str] = tokenizer(text)
                buffer += ' '.join(words) + '\n'

            fo.write(buffer)

    def get_all_text_data(self, all_in_one: bool = False) -> None:
        '''
        Parse all datasets in `json_dir` from JSON to plain text.

        :param all_in_one: write to a single file
        '''
        if all_in_one:
            # Clear the content
            text_path: str = os.path.join(self.text_dir, self.data_file)
            open(text_path, 'w').close()

        for json_entry in os.scandir(self.json_dir):
            file_name: str = json_entry.name
            if file_name.endswith('.json'):
                self.get_text_data(file_name[:-len('.json')], all_in_one)

    def add_sentence(self, words: List[str], dataset: str = 'train') -> None:
        '''
        Add a new sentence to the corpus.

        :param words: a preprocessed word list of the new sentence
        :param dataset: which dataset, can be `'train'`, `'dev'` or `'test'`
        '''
        if not words:
            return

        try:
            if words[0].startswith('...'):
                words.pop(0)
            else:
                words.insert(0, self.dictionary.sos)

            if words[-1].endswith('...'):
                words.pop(-1)
            else:
                words.append(self.dictionary.eos)
        except IndexError:
            pass
        else:
            for word in words:
                self.dictionary.add_word(word)

            if dataset == 'dev':
                self.dev_set += words
            elif dataset == 'test':
                self.test_set += words
            else:
                self.train_set += words

    def read_data(self, file_name: str = None) -> None:
        '''
        Read a dataset from a file, and append it to the corpus.

        :param file_name: file name of the dataset without extension
        '''
        text_name: str = file_name + '.txt' if file_name else self.data_file
        text_path: str = os.path.join(self.text_dir, text_name)

        with open(text_path, 'r') as fi:
            all_lines: List[str] = fi.read().splitlines()

        train_size: int = floor(len(all_lines) * self.train_proportion)
        dev_size: int = floor(len(all_lines) * self.dev_proportion)

        for line in all_lines[:train_size]:
            self.add_sentence(line.split(), 'train')
        for line in all_lines[train_size:train_size + dev_size]:
            self.add_sentence(line.split(), 'dev')
        for line in all_lines[train_size + dev_size:]:
            self.add_sentence(line.split(), 'test')

        # self.dictionary.clear_words()
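For context, a minimal sketch of how the class might be driven; this is illustrative only, not the project's actual entry point:

cp = corpus()
cp.get_all_text_data(all_in_one=True)  # JSON dumps -> data/text/data.txt
cp.read_data()                         # fills the train/dev/test word lists and the dictionary
print(cp.dictionary.len(), len(cp.train_set), len(cp.dev_set), len(cp.test_set))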
# trump_bot/model.py
from typing import Tuple

from torch import Tensor, nn


class rnn(nn.Module):
    '''
    Build an RNN model.

    This model takes the last word as input and is expected to output the
    next word. There are three layers - an embedding layer that encodes the
    input word into an internal state, a GRU layer (which may itself have
    multiple layers) that operates on that internal state and a hidden
    state, and a linear decoder layer whose output is used as a probability
    distribution over the next word.
    '''

    def __init__(self, input_size: int, hidden_size: int, output_size: int,
                 num_layers: int = 1, dropout: float = 0.2) -> None:
        '''
        Initialize the RNN model.

        :param input_size: the number of expected features in the input
        :param hidden_size: the number of features in the hidden state
        :param output_size: the number of expected features in the output
        :param num_layers: the number of recurrent layers
        :param dropout: dropout probability applied to the embedding and GRU outputs
        '''
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers, dropout=dropout)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, inp: Tensor, hid: Tensor) -> Tuple[Tensor, Tensor]:
        '''
        The forward function which defines the network structure.
        Return the output tensor and the hidden tensor.

        :param inp: input tensor
        :param hid: hidden tensor
        '''
        emb = self.encoder(inp.view(1, -1))
        emb = self.drop(emb)
        out, hid = self.gru(emb, hid)
        out = self.drop(out)
        dec = self.decoder(out).view(1, -1)
        return dec, hid

    def init_hidden(self, batch_size: int = 1) -> Tensor:
        '''
        Initialize the hidden state.

        :param batch_size: batch size
        '''
        weight = next(self.parameters())
        return weight.new_zeros(self.num_layers, batch_size, self.hidden_size)
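A minimal sketch of one forward step through this model; the vocabulary and hidden sizes below are made-up numbers, not the project's actual hyperparameters:

import torch

vocab_size, hidden_size = 1000, 256               # assumed values
m = rnn(vocab_size, hidden_size, vocab_size, num_layers=2)

inp = torch.tensor([42])                          # index of a single word
hid = m.init_hidden(batch_size=1)                 # shape: (num_layers, batch, hidden_size)
out, hid = m(inp, hid)
print(out.shape)                                  # torch.Size([1, 1000]): one score per word in the vocabulary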
# trump_bot/main.py
def train_model() -> Tuple[List[float], List[float]]:
    '''
    The main training function.
    Return all training losses and all validation losses.
    '''
    all_train_losses: List[float] = []
    all_valid_losses: List[float] = []
    total_train_loss: float = 0.0
    total_valid_loss: float = 0.0
    min_valid_loss: float = 4.0

    for epoch in range(1, num_epochs + 1):
        train_loss: float = train(*get_random_pair('train'))
        valid_loss: float = validate(*get_random_pair('dev'))
        total_train_loss += train_loss
        total_valid_loss += valid_loss

        if valid_loss < min_valid_loss:
            save_model(valid_loss)
            min_valid_loss = valid_loss

        if epoch % print_every == 0:
            progress: float = epoch / num_epochs * 100
            print('{}: ({}{:.1f}%) train_loss: {:.3f}, valid_loss: {:.3f}'.format(
                duration_since(start_time), epoch, progress,
                train_loss, valid_loss,
            ))
            evaluate_model()

        if epoch % plot_every == 0:
            all_train_losses.append(total_train_loss / plot_every)
            all_valid_losses.append(total_valid_loss / plot_every)
            total_train_loss = 0.0
            total_valid_loss = 0.0

    return all_train_losses, all_valid_losses
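The main.py excerpts refer to several module-level globals (m, cp, criterion, optimizer, chunk_size, clip, num_epochs, and so on) that are defined elsewhere in the file. A purely hypothetical setup is sketched below to make the excerpts easier to follow; the actual values, optimizer, and helper functions used by the project may differ:

# Hypothetical globals for the main.py excerpts; all values are assumptions.
import torch
from torch import nn

cp = corpus()
cp.read_data()

chunk_size = 32                                   # words fed to the model per step (assumed)
clip = 0.25                                       # gradient clipping threshold (assumed)
num_epochs = 10000                                # assumed
vocab_size = cp.dictionary.len()

m = rnn(vocab_size, 512, vocab_size, num_layers=2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)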
# trump_bot/main.py
def get_random_pair(dataset: str = 'train') -> Tuple[Tensor, Tensor]:
    '''
    Return a random pair of input and target from the dataset.

    :param dataset: which dataset, can be `'train'`, `'dev'` or `'test'`
    '''
    if dataset == 'dev':
        src = cp.dev_set
    elif dataset == 'test':
        src = cp.test_set
    else:
        src = cp.train_set

    max_i: int = len(src) - chunk_size
    i: int = torch.randint(0, max_i, (1,))[0]

    inp_words: List[str] = src[i:i + chunk_size]
    inp: Tensor = words_to_tensor(inp_words)
    tar_words: List[str] = src[i + 1:i + 1 + chunk_size]
    tar: Tensor = words_to_tensor(tar_words)

    return inp, tar
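The input/target relationship is easiest to see with a tiny hypothetical chunk: the target is the same window shifted one word to the left, so at every position the model learns to predict the following word.

src = ['<sos>', 'make', 'america', 'great', 'again', '<eos>']  # made-up data
chunk_size = 4
i = 0
inp_words = src[i:i + chunk_size]          # ['<sos>', 'make', 'america', 'great']
tar_words = src[i + 1:i + 1 + chunk_size]  # ['make', 'america', 'great', 'again']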
# trump_bot/main.py
def train(inp: Tensor, tar: Tensor) -> float:
    '''
    Train the model using a pair of input and target.
    Return the loss.

    :param inp: input tensor
    :param tar: target tensor
    '''
    m.train()
    m.zero_grad()

    hid: Tensor = m.init_hidden()
    loss: Tensor = 0

    for i in range(inp.size(0)):
        out, hid = m(inp[i], hid)
        loss += criterion(out, tar[i].view(-1))

    loss.backward()
    nn.utils.clip_grad_norm_(m.parameters(), clip)
    optimizer.step()

    return loss.item() / chunk_size
# trump_bot/main.py
def validate(inp: Tensor, tar: Tensor) -> float:
    '''
    Validate the model using a pair of input and target.
    Return the loss.

    :param inp: input tensor
    :param tar: target tensor
    '''
    m.eval()

    hid: Tensor = m.init_hidden()
    loss: Tensor = 0

    with torch.no_grad():
        for i in range(inp.size(0)):
            out, hid = m(inp[i], hid)
            loss += criterion(out, tar[i].view(-1))

    return loss.item() / chunk_size
# trump_bot/main.py
def generate() -> None:
    '''
    Generate new sentences using the best model, and save them to a local file.
    '''
    load_model()

    for i in range(1, batch_size + 1):
        progress: float = i / batch_size * 100
        print(f'({i}{progress:.1f}%)', end='\r', flush=True)
        evaluate_model(save=True)
# trump_bot/main.py
def evaluate_model(save: bool = False) -> None:
    '''
    The main evaluating function.

    :param save: save the output to a local file
    '''
    m.eval()

    prime_words: List[str] = get_random_words(prime_len, 'dev')
    predicted_words: List[str] = evaluate(prime_words, predict_len, temperature)
    output: str = ' '.join(predicted_words)

    if save:
        current_time: str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        with open(output_path, 'a') as f:
            f.write(f'{current_time}:\n{output}\n\n')
    else:
        print(output)
# trump_bot/main.py
def get_random_words(count: int = 1, dataset: str = 'dev') -> List[str]:
    '''
    Return a sequence of random words from the dataset.

    :param count: how many words are required
    :param dataset: which dataset, can be `'train'`, `'dev'` or `'test'`
    '''
    if dataset == 'dev':
        src = cp.dev_set
    elif dataset == 'test':
        src = cp.test_set
    else:
        src = cp.train_set

    max_i: int = len(src) - count
    i: int = torch.randint(0, max_i, (1,))[0]
    words: List[str] = src[i:i + count]

    return words
# trump_bot/main.py
def evaluate(prime_words: List[str] = None, predict_len: int = 30,
             temperature: float = 0.8) -> List[str]:
    '''
    Evaluate the network by generating a sentence from priming words.

    To evaluate the network we feed one word at a time, use the outputs of the
    network as a probability distribution for the next word, and repeat.
    To start generation we pass some priming words to set up the hidden state,
    from which we then generate one word at a time.

    Return the predicted words.

    :param prime_words: priming words to start with
    :param predict_len: expected number of words to predict
    :param temperature: randomness of predictions; a higher value results in more diversity
    '''
    hid: Tensor = m.init_hidden()

    if not prime_words:
        prime_words = [cp.dictionary.sos]

    with torch.no_grad():
        prime_inp: Tensor = words_to_tensor(prime_words)
        predicted_words: List[str] = prime_words

        # Build up the hidden state with the priming words
        for p in range(len(prime_words) - 1):
            _, hid = m(prime_inp[p], hid)

        inp: Tensor = prime_inp[-1]

        for p in range(predict_len):
            out, hid = m(inp, hid)

            # Sample from the network output as a multinomial distribution
            out_dist: Tensor = out.view(-1).div(temperature).exp()
            top_i: int = torch.multinomial(out_dist, 1)[0]

            # Add the predicted word to the output and use it as the next input
            predicted_word: str = cp.dictionary.idx2word[top_i]
            predicted_words.append(predicted_word)
            # if predicted_word == cp.dictionary.eos:
            #     break
            inp.fill_(top_i)

    return predicted_words
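The sampling step divides the output scores by the temperature before exponentiating, so lower temperatures sharpen the distribution and higher ones flatten it, which is why higher values yield more diverse text. A small self-contained sketch with made-up scores:

import torch

logits = torch.tensor([2.0, 1.0, 0.1])          # made-up output scores for three words
for temperature in (0.5, 0.8, 2.0):
    weights = logits.div(temperature).exp()     # unnormalized weights, as fed to torch.multinomial
    print(temperature, (weights / weights.sum()).tolist())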
# 2021-01-18 23:50:38
The Democrats have been done to all of the President. Great Complete!
Enjoy of Border: New House States in a GREAT!
China Virus, and all, including almost 2020, and have been the Impeachment Hoax!
# 2021-01-19 00:57:52
this is a fraud on the World of the Fake News Media for me, on the Border (and keep was the Fake News Media, would keep win to the Republican side, and over the negative of the State of the National Left
do n't believe the Trump Report
There all over the President of the Federal Government so I know, the Democrats are doing a great job to do.
Our lowest success of the United States and that will be doing quickly on the Mueller Report, that is doing a good job you need, the USA will be doing a great job!
The GREAT Media is working more to U.S., or any of the highly Left can never let a short of our Country.
# 2021-01-19 00:57:53
There is no crime of our Country. They are doing the Do Nothing Democrats, and that, the Democrats, are doing a great job, and even the Democrats, or to the Democrats!
Sleepy n't want to report the Trump Administration
The Fake News Media is not better, but the Trump News, and it is a great job!
The Democrats are great with the Democrats that the Democrats are with the Republican Party.
# 2021-01-19 00:57:54
Biggest part of the National Administration is being built with the same side to use the Fake News Media, are doing more than ever before!
I will be watching from the Great State of the Russia Virus, has in the White House to the Republican side of the United States now.
# 2021-01-19 00:57:55
Democrat run cities and states are doing their job with respect that I am pleased to announce the Democrats in the Republican Party.
Chinese are very adept at to the White House!
I have n't think the Federal Party are doing such being important in their stone to do of the President and the State of the DNC in the history of the United States, China and keep more than he. Great!
GPU: NVIDIA Tesla V100-NVLINK-32G × 1 (with CUDA 10.1.105)
10 Conclusions and Reflections
Judging from the generated text, the results are fairly decent. The whole training process was quite fun, and along the way the model produced many hilarious sentences in a very Trump-like style. Early on I kept training on the 1050 Ti in my own laptop, whose video memory was nowhere near enough, and each run was also very slow. Later, with time running out, I gave in and rented a GPU server, and from then on trained on a Tesla V100, which was far more comfortable.