Skip to content

Commit

Permalink
Merge pull request #49 from wbrown/rwang.llamathree07242024
Browse files (browse the repository at this point in the history)
Llama 3 Support
  • Loading branch information
wbrown authored Aug 27, 2024
2 parents 1821792 + fa4c561 commit 38785c5
Show file tree
Hide file tree
Showing 32 changed files with 560,573 additions and 1,900 deletions.
610 changes: 443 additions & 167 deletions cmd/dataset_tokenizer/dataset_tokenizer.go

Large diffs are not rendered by default.

309 changes: 206 additions & 103 deletions cmd/dataset_tokenizer/dataset_tokenizer_test.go

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions cmd/dataset_tokenizer/go.mod
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
module github.com/wbrown/gpt_bpe/cmd/dataset_tokenizer

-go 1.18
+go 1.19.3

replace github.com/wbrown/gpt_bpe => ../../

require (
github.com/aws/aws-sdk-go v1.45.4
-github.com/stretchr/testify v1.7.1
+github.com/stretchr/testify v1.9.0
github.com/wbrown/gpt_bpe v0.0.0-20240410161531-edd9879e0496
github.com/yargevad/filepathx v1.0.0
)
Expand All @@ -17,17 +17,17 @@ require (
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/edsrzf/mmap-go v1.1.0 // indirect
github.com/golang/protobuf v1.5.2 // indirect
-github.com/hashicorp/golang-lru v0.5.4 // indirect
+github.com/hashicorp/golang-lru v1.0.2 // indirect
github.com/jdkato/prose/v2 v2.0.0 // indirect
github.com/jmespath/go-jmespath v0.4.0 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/mingrammer/commonregex v1.0.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/vikesh-raj/go-sentencepiece-encoder v1.1.1 // indirect
-golang.org/x/sys v0.3.0 // indirect
-golang.org/x/text v0.4.0 // indirect
-gonum.org/v1/gonum v0.12.0 // indirect
+golang.org/x/sys v0.14.0 // indirect
+golang.org/x/text v0.17.0 // indirect
+gonum.org/v1/gonum v0.15.0 // indirect
google.golang.org/protobuf v1.26.0 // indirect
gopkg.in/neurosnap/sentences.v1 v1.0.7 // indirect
-gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect
+gopkg.in/yaml.v3 v3.0.1 // indirect
)
6 changes: 6 additions & 0 deletions cmd/dataset_tokenizer/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ
github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/golang-lru v0.5.4 h1:YDjusn29QI/Das2iO9M0BHnIbxPeyuCHsjMW+lJfyTc=
github.com/hashicorp/golang-lru v0.5.4/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4=
+github.com/hashicorp/golang-lru v1.0.2/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4=
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO+LraFDTW64=
github.com/hashicorp/mdns v1.0.0/go.mod h1:tL+uN++7HEJ6SQLQ2/p+z2pH24WQKWjBPkE0mNTz8vQ=
Expand Down Expand Up @@ -286,6 +287,7 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw=
github.com/urfave/cli v1.22.4/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
github.com/vikesh-raj/go-sentencepiece-encoder v1.1.1 h1:q5Rm4ihhwmAiDycaL8rNiE/ly4on+nHQajElYLPN7TM=
Expand Down Expand Up @@ -494,6 +496,7 @@ golang.org/x/sys v0.0.0-20220727055044-e65921a090b8/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.3.0 h1:w8ZOecv6NaNa/zC8944JTU3vz4u6Lagfk4RPQxv92NQ=
golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
Expand All @@ -508,6 +511,7 @@ golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.4.0 h1:BrVqGRd7+k1DiOgtnFvAkoQEWQvBc25ouMJM6429SFg=
golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
Expand Down Expand Up @@ -580,6 +584,7 @@ gonum.org/v1/gonum v0.9.3/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0=
gonum.org/v1/gonum v0.11.0/go.mod h1:fSG4YDCxxUZQJ7rKsQrj0gMOg00Il0Z96/qMA4bVQhA=
gonum.org/v1/gonum v0.12.0 h1:xKuo6hzt+gMav00meVPUlXwSdoEJP46BR+wdxQEFK2o=
gonum.org/v1/gonum v0.12.0/go.mod h1:73TDxJfAAHeA8Mk9mf8NlIppyhQNo5GLTcYeqgo2lvY=
+gonum.org/v1/gonum v0.15.0/go.mod h1:xzZVBJBtS+Mz4q0Yl2LJTk+OxOg4jiXZ7qBoM0uISGo=
gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw=
gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc=
gonum.org/v1/plot v0.9.0/go.mod h1:3Pcqqmp6RHvJI72kgb8fThyUnav364FOsdDo2aGW5lY=
Expand Down Expand Up @@ -703,6 +708,7 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
Expand Down
12 changes: 6 additions & 6 deletions cmd/tokenizer_repl/go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/wbrown/gpt_bpe/cmd/tokenizer_repl

-go 1.18
+go 1.19.3

replace github.com/wbrown/gpt_bpe => ../../

Expand All @@ -11,14 +11,14 @@ require (
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/edsrzf/mmap-go v1.1.0 // indirect
github.com/golang/protobuf v1.5.2 // indirect
-github.com/gopherjs/gopherjs v1.17.2 // indirect
-github.com/hashicorp/golang-lru v0.5.4 // indirect
+github.com/gopherjs/gopherjs v1.19.0-beta1 // indirect
+github.com/hashicorp/golang-lru v1.0.2 // indirect
github.com/jdkato/prose/v2 v2.0.0 // indirect
github.com/mingrammer/commonregex v1.0.1 // indirect
github.com/vikesh-raj/go-sentencepiece-encoder v1.1.1 // indirect
-golang.org/x/sys v0.0.0-20220727055044-e65921a090b8 // indirect
-golang.org/x/text v0.3.7 // indirect
-gonum.org/v1/gonum v0.11.0 // indirect
+golang.org/x/sys v0.14.0 // indirect
+golang.org/x/text v0.17.0 // indirect
+gonum.org/v1/gonum v0.15.0 // indirect
google.golang.org/protobuf v1.26.0 // indirect
gopkg.in/neurosnap/sentences.v1 v1.0.7 // indirect
)
2 changes: 1 addition & 1 deletion cmd/tokens_transformer/tokens_transformer.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ func main() {
encoded = &padded
}
// write encoded context to output file
-bytesToWrite := encoded.ToBin()
+bytesToWrite := encoded.ToBin(false)
bytesWritten, writeErr := outputFileHandle.Write(*bytesToWrite)

if writeErr != nil {
Expand Down
20 changes: 9 additions & 11 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,37 +1,35 @@
module github.com/wbrown/gpt_bpe

-go 1.17
+go 1.19.13

require (
github.com/edsrzf/mmap-go v1.1.0
-github.com/hashicorp/golang-lru v0.5.4
-github.com/stretchr/testify v1.7.1
+github.com/hashicorp/golang-lru v1.0.2
+github.com/stretchr/testify v1.9.0
)

require (
github.com/dustin/go-humanize v1.0.1
-github.com/gopherjs/gopherjs v1.17.2
+github.com/gopherjs/gopherjs v1.19.0-beta1
github.com/jdkato/prose/v2 v2.0.0
)

require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/golang/protobuf v1.5.2 // indirect
github.com/google/go-cmp v0.5.8 // indirect
-github.com/kr/pretty v0.1.0 // indirect
+github.com/kr/pretty v0.3.1 // indirect
github.com/mingrammer/commonregex v1.0.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
-golang.org/x/text v0.3.7 // indirect
+golang.org/x/text v0.17.0 // indirect
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
-gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect
+gopkg.in/yaml.v3 v3.0.1 // indirect
)

require (
github.com/deckarep/golang-set v1.8.0 // indirect
github.com/vikesh-raj/go-sentencepiece-encoder v1.1.1
golang.org/x/exp v0.0.0-20220722155223-a9213eeb770e // indirect
-golang.org/x/sys v0.0.0-20220727055044-e65921a090b8 // indirect
-gonum.org/v1/gonum v0.11.0 // indirect
+golang.org/x/sys v0.14.0 // indirect
+gonum.org/v1/gonum v0.15.0 // indirect
google.golang.org/protobuf v1.26.0
gopkg.in/neurosnap/sentences.v1 v1.0.7 // indirect
)
Loading

0 comments on commit 38785c5

Please sign in to comment.