In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
torch.manual_seed(0)
import torch.nn.functional as F
import time
import numpy as np
np.random.seed(30)

draft_model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B", device_map='auto',torch_dtype=torch.float16)
draft_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b", device_map='auto',torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████| 46/46 [00:27<00:00,  1.65it/s]


In [2]:
model.device.type=='cuda', draft_model.device.type=='cuda'

(True, True)

In [3]:
prompt = "Difference between a gasoline and hybrid vehicles is"
K=4
N=50

In [4]:
############## Autoregressive sampling test ##############
inputs = tokenizer(prompt, return_tensors="pt")
inp = inputs['input_ids'].to('cuda')
start = time.time()
while(inp.shape[-1]<N):
    o = model(inp)
    o = torch.softmax(o.logits,-1)
    o = torch.argmax(o,-1)[0,-1]
    inp = torch.cat((inp, torch.tensor([[o]],device='cuda')),-1)
end=time.time()
print(tokenizer.decode(inp[0]))
print(f'Time taken is {end-start}s')

Difference between a gasoline and hybrid vehicles is that the gasoline vehicle uses a spark plug to ignite the fuel, while the hybrid vehicle uses a battery to ignite the fuel.

The battery is charged by the engine, and the engine is
Time taken is 3.85893177986145s


In [5]:
# def sample(model, draft_model, tokenizer, draft_tokenizer, prompt):
inputs = draft_tokenizer(prompt, return_tensors="pt")
inp = inputs['input_ids'].to('cuda')
print(inp)
start = time.time()
while(inp.shape[-1]<N):
    global_inp = [inp[0]]
    global_o=[]
    # global_tok=[]
    for i in range(K):
        o = draft_model(inp)#attn_mask)#,inputs['attention_mask'])
        o['logits'] = torch.softmax(o['logits'],-1)
        new_token = torch.argmax(o['logits'][0,-1])
        inp = torch.cat((inp,torch.tensor([[new_token]],device='cuda')),-1)
        global_inp.append(inp[0])
        global_o.append((new_token,o.logits[0,-1,new_token]))
        # print(draft_tokenizer.decode(new_token), new_token, o.logits[0,-1,new_token])
    # print('Draft output: ',global_tok)

    ########## VERIFY INPUTS FOR TARGET MODEL #########################
    # print("Inputs for the target model are:")
    # for i in range(len(global_inp)):
    #     print(draft_tokenizer.decode(global_inp[i], ignore_special_tokens=True))
    
    target_inp=[]
    for i in global_inp:
        target_inp.append(torch.tensor(tokenizer(draft_tokenizer.decode(i)).input_ids))
    first_tok_idx = target_inp[0].shape[0]
    target_inp_padded = torch.nn.utils.rnn.pad_sequence(target_inp,batch_first=True,padding_value=0)
    
    ########## VERIFY INPUTS FOR TARGET MODEL AFTER TOKENIZING & PADDING #########################
    # for i in range(len(global_inp)):
    #     print(tokenizer.decode(target_inp_padded[i], ignore_special_tokens=True))

    target_output = model(target_inp_padded.to('cuda'))#, attention_mask=torch.where(target_inp_padded>0,1,0))
    target_output.logits = torch.softmax(target_output.logits,-1)

    ########## PRINT SERIALIZED OUTPUTS FROM TARGET MODEL #########################
    # out = torch.argmax(target_output.logits,-1)
    # out_decode = [tokenizer.decode(out[i][first_tok_idx+i-1]) for i in range(K+1)]
    # print('Target output: ',out_decode)
    
    all_accepted=True
    inp = global_inp[0] #Preparing draft model input for next Speculative Sampling
    for i in range(K):
        print(f'K: {first_tok_idx+i-1}')
        token_idx, prob_d = global_o[i] #token index and probability from draft prediction
        # probability from target prediction for the same token
        prob_t = target_output.logits[i,first_tok_idx+i-1,tokenizer(draft_tokenizer.decode(token_idx)).input_ids[0]]

        # Accepted token
        if np.random.random() < min(1,prob_t/prob_d):
        # if prob_t/prob_d>=1:
            # print(f'Accepted {first_tok_idx+i-1} token: ', draft_tokenizer.decode(token_idx), token_idx)
            inp = torch.cat((inp,torch.tensor([token_idx],device='cuda')))
        
        # Modified Rejected token
        else:
            token_idx = torch.argmax(target_output.logits[i][first_tok_idx+i-1])
            # print(f'Replaced  {first_tok_idx+i-1} token: ', tokenizer.decode(token_idx), token_idx)
            draft_token_idx = draft_tokenizer([tokenizer.decode(token_idx)]).input_ids[0]
            inp = torch.cat((inp,torch.tensor(draft_token_idx,device='cuda')))
            all_accepted = False
            break
            
        if inp.shape[-1]==N-1:
            print(inp.shape)
            break
            
    # If all accepted then add extra token from target prediction
    if all_accepted:
        #print('All tokens are accepted, adding extra token')
        token_idx = torch.argmax(target_output.logits[-1,first_tok_idx+K-1])
        draft_token_idx = draft_tokenizer([tokenizer.decode(token_idx)]).input_ids[0]
        prob_t = torch.tensor(draft_token_idx,device='cuda')
        inp = torch.cat((inp,prob_t))

    print(f'After verification: {draft_tokenizer.decode(inp)}\n')
    inp = inp.unsqueeze(0) #batched input
end = time.time()
print(f'Time taken is {end-start}s')

tensor([[28813,  1945,  1022,   257, 21408,   290, 14554,  5672,   318]],
       device='cuda:0')
K: 8
After verification: Difference between a gasoline and hybrid vehicles is that

K: 9
K: 10
K: 11
K: 12
After verification: Difference between a gasoline and hybrid vehicles is that the gasoline vehicle has

K: 13
K: 14
After verification: Difference between a gasoline and hybrid vehicles is that the gasoline vehicle has a fuel

K: 15
K: 16
After verification: Difference between a gasoline and hybrid vehicles is that the gasoline vehicle has a fuel tank that

K: 17
K: 18
K: 19
K: 20
After verification: Difference between a gasoline and hybrid vehicles is that the gasoline vehicle has a fuel tank that is filled with gasoline.

K: 22
K: 23
After verification: Difference between a gasoline and hybrid vehicles is that the gasoline vehicle has a fuel tank that is filled with gasoline. The hybrid

K: 24
K: 25
K: 26
K: 27
After verification: Difference between a gasoline and hybrid vehicles is

In [6]:
3.858/2.849

1.3541593541593542

In [12]:
start = time.time()
inputs = tokenizer(prompt, return_tensors="pt")
o = model.generate(inputs['input_ids'].to('cuda'), max_length=N)
end=time.time()
print(tokenizer.decode(o[0]))
print(f'Time taken is {end-start}s')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Difference between a gasoline and hybrid vehicles is that the gasoline vehicle uses a spark plug to ignite the fuel, while the hybrid vehicle uses a battery to power the electric motor.

The battery is charged by the engine, which is why the
Time taken is 1.913191556930542s
