diff --git a/.github/workflows/pushes.yaml b/.github/workflows/pushes.yaml index cbfee69e..510ec6ef 100644 --- a/.github/workflows/pushes.yaml +++ b/.github/workflows/pushes.yaml @@ -41,16 +41,16 @@ jobs: # The canonical entry is the only one where we run vet/lint/style checks. # `experimental: true` entries do not cause the tests to fail. include: - - go: '1.17.x' + - go: '1.19.x' os: ubuntu-latest canonical: true - - go: '1.16.x' + - go: '1.18.x' os: ubuntu-latest canonical: false - - go: '1.16.x' + - go: '1.19.x' os: windows-latest canonical: false - - go: '1.17.x' + - go: '1.19.x' os: macos-latest canonical: false diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 23170af7..59a04550 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -41,7 +41,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v2 with: - go-version: 1.17 + go-version: 1.18 check-latest: true - name: Install cosign diff --git a/cmd/importaccount_test.go b/cmd/importaccount_test.go index 43ac3a4f..78b89038 100644 --- a/cmd/importaccount_test.go +++ b/cmd/importaccount_test.go @@ -95,6 +95,7 @@ func Test_ImportDecoratedAccount(t *testing.T) { require.NoError(t, err) normal := filepath.Join(ts.Dir, "a.jwt") err = Write(normal, a) + require.NoError(t, err) // save a decorated jwt decorated := filepath.Join(ts.Dir, "decorated_a.jwt") diff --git a/go.mod b/go.mod index ba8406bc..23e54877 100644 --- a/go.mod +++ b/go.mod @@ -9,11 +9,11 @@ require ( github.com/inconshreveable/mousetrap v1.0.0 // indirect github.com/mitchellh/go-homedir v1.1.0 github.com/nats-io/cliprompts/v2 v2.0.0-20191226174129-372d79b36768 - github.com/nats-io/jsm.go v0.0.30 + github.com/nats-io/jsm.go v0.0.33 github.com/nats-io/jwt v1.2.2 - github.com/nats-io/jwt/v2 v2.2.1-0.20220509180118-3bcd719cc7d0 - github.com/nats-io/nats-server/v2 v2.7.5-0.20220309212130-5c0d1999ff72 - github.com/nats-io/nats.go v1.13.1-0.20220308171302-2f2f6968e98d + github.com/nats-io/jwt/v2 v2.3.0 + github.com/nats-io/nats-server/v2 v2.8.4 + github.com/nats-io/nats.go v1.16.0 github.com/nats-io/nkeys v0.3.0 github.com/nats-io/nuid v1.0.1 github.com/onsi/gomega v1.4.3 // indirect diff --git a/go.sum b/go.sum index d7fd9711..88151087 100644 --- a/go.sum +++ b/go.sum @@ -30,6 +30,7 @@ github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.7 h1:81/ik6ipDQS2aGcBfIN5dHDB36BwrStyeAQquSYCV4o= github.com/google/go-cmp v0.5.7/go.mod h1:n+brtR0CgQNWTVd5ZUFpTBC8YFBDLK/h/bpaJ8/DtOE= +github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-github/v30 v30.1.0 h1:VLDx+UolQICEOKu2m4uAoMti1SxuEBAl7RSEG16L+Oo= github.com/google/go-github/v30 v30.1.0/go.mod h1:n8jBpHl45a/rlBUtRJMOG4GhNADUQFEufcolZ95JfU8= github.com/google/go-querystring v1.0.0 h1:Xkwi/a1rcvNg1PPYe5vI8GbeBY/jrVuDX5ASuANWTrk= @@ -45,6 +46,8 @@ github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNU github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8= github.com/klauspost/compress v1.14.4 h1:eijASRJcobkVtSt81Olfh7JX43osYLwy5krOJo6YEu4= github.com/klauspost/compress v1.14.4/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= +github.com/klauspost/compress v1.15.5 h1:qyCLMz2JCrKADihKOh9FxnW3houKeNsp2h5OEz0QSEA= +github.com/klauspost/compress v1.15.5/go.mod 
h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU= github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= @@ -70,15 +73,25 @@ github.com/nats-io/cliprompts/v2 v2.0.0-20191226174129-372d79b36768 h1:sdr8zfPeN github.com/nats-io/cliprompts/v2 v2.0.0-20191226174129-372d79b36768/go.mod h1:oweZn7AeaVJYKlNHfCIhznJVsdySLSng55vfuINE/d0= github.com/nats-io/jsm.go v0.0.30 h1:0PR9TuJV1FMBJMyZmH0hhwb+ASn0YAjBE6QqnSUrx6o= github.com/nats-io/jsm.go v0.0.30/go.mod h1:EKSYvbvWAoh0hIfuZ+ieWm8u0VOTRTeDfuQvNPKRqEg= +github.com/nats-io/jsm.go v0.0.33 h1:mNxlZEnSiHo9BwAFpjZYuopVvtwVUdtoAana2ovyWOU= +github.com/nats-io/jsm.go v0.0.33/go.mod h1:1ySvWrDbPo/Rs1v0Ccoy7QjZKBGfVhvmolfJRBX+fCg= github.com/nats-io/jwt v1.2.2 h1:w3GMTO969dFg+UOKTmmyuu7IGdusK+7Ytlt//OYH/uU= github.com/nats-io/jwt v1.2.2/go.mod h1:/xX356yQA6LuXI9xWW7mZNpxgF2mBmGecH+Fj34sP5Q= github.com/nats-io/jwt/v2 v2.2.1-0.20220113022732-58e87895b296/go.mod h1:0tqz9Hlu6bCBFLWAASKhE5vUA4c24L9KPUUgvwumE/k= +github.com/nats-io/jwt/v2 v2.2.1-0.20220330180145-442af02fd36a/go.mod h1:0tqz9Hlu6bCBFLWAASKhE5vUA4c24L9KPUUgvwumE/k= github.com/nats-io/jwt/v2 v2.2.1-0.20220509180118-3bcd719cc7d0 h1:R2KQtvAFeZ7AbMYZ90qWE26yrg/PxhZYRsC93ROw6o4= github.com/nats-io/jwt/v2 v2.2.1-0.20220509180118-3bcd719cc7d0/go.mod h1:0tqz9Hlu6bCBFLWAASKhE5vUA4c24L9KPUUgvwumE/k= +github.com/nats-io/jwt/v2 v2.3.0 h1:z2mA1a7tIf5ShggOFlR1oBPgd6hGqcDYsISxZByUzdI= +github.com/nats-io/jwt/v2 v2.3.0/go.mod h1:0tqz9Hlu6bCBFLWAASKhE5vUA4c24L9KPUUgvwumE/k= github.com/nats-io/nats-server/v2 v2.7.5-0.20220309212130-5c0d1999ff72 h1:Moe/K4fo/5FCNpE/TYrMt7sEPUuldBVJ0D4g/SWFkd0= github.com/nats-io/nats-server/v2 v2.7.5-0.20220309212130-5c0d1999ff72/go.mod h1:1vZ2Nijh8tcyNe8BDVyTviCd9NYzRbubQYiEHsvOQWc= +github.com/nats-io/nats-server/v2 v2.8.4 h1:0jQzze1T9mECg8YZEl8+WYUXb9JKluJfCBriPUtluB4= +github.com/nats-io/nats-server/v2 v2.8.4/go.mod h1:8zZa+Al3WsESfmgSs98Fi06dRWLH5Bnq90m5bKD/eT4= github.com/nats-io/nats.go v1.13.1-0.20220308171302-2f2f6968e98d h1:zJf4l8Kp67RIZhoVeniSLZs69SHNgjLHz0aNsqPPlx8= github.com/nats-io/nats.go v1.13.1-0.20220308171302-2f2f6968e98d/go.mod h1:BPko4oXsySz4aSWeFgOHLZs3G4Jq4ZAyE6/zMCxRT6w= +github.com/nats-io/nats.go v1.15.0/go.mod h1:BPko4oXsySz4aSWeFgOHLZs3G4Jq4ZAyE6/zMCxRT6w= +github.com/nats-io/nats.go v1.16.0 h1:zvLE7fGBQYW6MWaFaRdsgm9qT39PJDQoju+DS8KsO1g= +github.com/nats-io/nats.go v1.16.0/go.mod h1:BPko4oXsySz4aSWeFgOHLZs3G4Jq4ZAyE6/zMCxRT6w= github.com/nats-io/nkeys v0.2.0/go.mod h1:XdZpAbhgyyODYqjTawOnIOI7VlbKSarI9Gfy1tqEu/s= github.com/nats-io/nkeys v0.3.0 h1:cgM5tL53EvYRU+2YLXIK0G2mJtK12Ft9oeooSZMA2G8= github.com/nats-io/nkeys v0.3.0/go.mod h1:gvUNGjVcM2IPr5rCsRsC6Wb3Hr2CQAm08dsxtV6A5y4= @@ -115,6 +128,8 @@ golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59/go.mod h1:LzIPMQfyMNhhGPh golang.org/x/crypto v0.0.0-20210314154223-e6e6c4f2bb5b/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4= golang.org/x/crypto v0.0.0-20220112180741-5e0467b6c7ce h1:Roh6XWxHFKrPgC/EQhVubSAGQ6Ozk6IdxHSzt1mR0EI= golang.org/x/crypto v0.0.0-20220112180741-5e0467b6c7ce/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/crypto v0.0.0-20220315160706-3147a52a75dd h1:XcWmESyNjXJMLahc3mqVQJcgSTDxFxhETVlfk9uGc38= +golang.org/x/crypto v0.0.0-20220315160706-3147a52a75dd/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod 
h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md index 11979345..73c0c462 100644 --- a/vendor/github.com/klauspost/compress/s2/README.md +++ b/vendor/github.com/klauspost/compress/s2/README.md @@ -19,6 +19,7 @@ This is important, so you don't have to worry about spending CPU cycles on alrea * Adjustable compression (3 levels) * Concurrent stream compression * Faster decompression, even for Snappy compatible content +* Concurrent Snappy/S2 stream decompression * Ability to quickly skip forward in compressed stream * Random seeking with indexes * Compatible with reading Snappy compressed content @@ -415,6 +416,25 @@ Without assembly decompression is also very fast; single goroutine decompression Even though S2 typically compresses better than Snappy, decompression speed is always better. +### Concurrent Stream Decompression + +For full stream decompression S2 offers a [DecodeConcurrent](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.DecodeConcurrent) +that will decode a full stream using multiple goroutines. + +Example scaling, AMD Ryzen 3950X, 16 cores, decompression using `s2d -bench=3 `, best of 3: + +| Input | `-cpu=1` | `-cpu=2` | `-cpu=4` | `-cpu=8` | `-cpu=16` | +|-------------------------------------------|------------|------------|------------|------------|-------------| +| enwik10.snappy | 1098.6MB/s | 1819.8MB/s | 3625.6MB/s | 6910.6MB/s | 10818.2MB/s | +| enwik10.s2 | 1303.5MB/s | 2606.1MB/s | 4847.9MB/s | 8878.4MB/s | 9592.1MB/s | +| sofia-air-quality-dataset.tar.snappy | 1302.0MB/s | 2165.0MB/s | 4244.5MB/s | 8241.0MB/s | 12920.5MB/s | +| sofia-air-quality-dataset.tar.s2 | 1399.2MB/s | 2463.2MB/s | 5196.5MB/s | 9639.8MB/s | 11439.5MB/s | +| sofia-air-quality-dataset.tar.s2 (no asm) | 837.5MB/s | 1652.6MB/s | 3183.6MB/s | 5945.0MB/s | 9620.7MB/s | + +Scaling can be expected to be pretty linear until memory bandwidth is saturated. + +For now the DecodeConcurrent can only be used for full streams without seeking or combining with regular reads. + ## Block compression @@ -873,7 +893,7 @@ for each entry { } // Uncompressed uses previous offset and adds EstBlockSize - entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize + entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize + uOff } @@ -901,6 +921,14 @@ for each entry { } ``` +To decode from any given uncompressed offset `(wantOffset)`: + +* Iterate entries until `entry[n].UncompressedOffset > wantOffset`. +* Start decoding from `entry[n-1].CompressedOffset`. +* Discard `entry[n-1].UncompressedOffset - wantOffset` bytes from the decoded stream. + +See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-indexes) for functions that perform the operations with a simpler interface. + # Format Extensions * Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`. 
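
> Editor's note (not part of the diff): the vendored README above documents the new `(*s2.Reader).DecodeConcurrent(w, concurrent)` API. A minimal usage sketch, assuming hypothetical input/output file names, might look like this:

```go
// Sketch: decode a full S2 (or Snappy) stream to a file using multiple cores.
// File names are placeholders; error handling is kept minimal for brevity.
package main

import (
	"log"
	"os"

	"github.com/klauspost/compress/s2"
)

func main() {
	in, err := os.Open("input.s2") // hypothetical compressed stream
	if err != nil {
		log.Fatal(err)
	}
	defer in.Close()

	out, err := os.Create("output.raw") // hypothetical destination
	if err != nil {
		log.Fatal(err)
	}
	defer out.Close()

	dec := s2.NewReader(in)
	// concurrent <= 0 uses runtime.NumCPU goroutines, per the docs above.
	// DecodeConcurrent must not be mixed with Read/Skip/Seek on the same Reader.
	n, err := dec.DecodeConcurrent(out, 0)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("decompressed %d bytes", n)
}
```
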
diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go index 9e7fce88..e2c1b16e 100644 --- a/vendor/github.com/klauspost/compress/s2/decode.go +++ b/vendor/github.com/klauspost/compress/s2/decode.go @@ -11,6 +11,8 @@ import ( "fmt" "io" "io/ioutil" + "runtime" + "sync" ) var ( @@ -196,13 +198,13 @@ type Reader struct { // ensureBufferSize will ensure that the buffer can take at least n bytes. // If false is returned the buffer exceeds maximum allowed size. func (r *Reader) ensureBufferSize(n int) bool { - if len(r.buf) >= n { - return true - } if n > r.maxBufSize { r.err = ErrCorrupt return false } + if cap(r.buf) >= n { + return true + } // Realloc buffer. r.buf = make([]byte, n) return true @@ -220,6 +222,7 @@ func (r *Reader) Reset(reader io.Reader) { r.err = nil r.i = 0 r.j = 0 + r.blockStart = 0 r.readHeader = r.ignoreStreamID } @@ -435,6 +438,259 @@ func (r *Reader) Read(p []byte) (int, error) { } } +// DecodeConcurrent will decode the full stream to w. +// This function should not be combined with reading, seeking or other operations. +// Up to 'concurrent' goroutines will be used. +// If <= 0, runtime.NumCPU will be used. +// On success the number of bytes decompressed nil and is returned. +// This is mainly intended for bigger streams. +func (r *Reader) DecodeConcurrent(w io.Writer, concurrent int) (written int64, err error) { + if r.i > 0 || r.j > 0 || r.blockStart > 0 { + return 0, errors.New("DecodeConcurrent called after ") + } + if concurrent <= 0 { + concurrent = runtime.NumCPU() + } + + // Write to output + var errMu sync.Mutex + var aErr error + setErr := func(e error) (ok bool) { + errMu.Lock() + defer errMu.Unlock() + if e == nil { + return aErr == nil + } + if aErr == nil { + aErr = e + } + return false + } + hasErr := func() (ok bool) { + errMu.Lock() + v := aErr != nil + errMu.Unlock() + return v + } + + var aWritten int64 + toRead := make(chan []byte, concurrent) + writtenBlocks := make(chan []byte, concurrent) + queue := make(chan chan []byte, concurrent) + reUse := make(chan chan []byte, concurrent) + for i := 0; i < concurrent; i++ { + toRead <- make([]byte, 0, r.maxBufSize) + writtenBlocks <- make([]byte, 0, r.maxBufSize) + reUse <- make(chan []byte, 1) + } + // Writer + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + for toWrite := range queue { + entry := <-toWrite + reUse <- toWrite + if hasErr() { + writtenBlocks <- entry + continue + } + n, err := w.Write(entry) + want := len(entry) + writtenBlocks <- entry + if err != nil { + setErr(err) + continue + } + if n != want { + setErr(io.ErrShortWrite) + continue + } + aWritten += int64(n) + } + }() + + // Reader + defer func() { + close(queue) + if r.err != nil { + err = r.err + setErr(r.err) + } + wg.Wait() + if err == nil { + err = aErr + } + written = aWritten + }() + + for !hasErr() { + if !r.readFull(r.buf[:4], true) { + if r.err == io.EOF { + r.err = nil + } + return 0, r.err + } + chunkType := r.buf[0] + if !r.readHeader { + if chunkType != chunkTypeStreamIdentifier { + r.err = ErrCorrupt + return 0, r.err + } + r.readHeader = true + } + chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 + + // The chunk types are specified at + // https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeCompressedData: + r.blockStart += int64(r.j) + // Section 4.2. Compressed data (chunk type 0x00). 
+ if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if chunkLen > r.maxBufSize { + r.err = ErrCorrupt + return 0, r.err + } + orgBuf := <-toRead + buf := orgBuf[:chunkLen] + + if !r.readFull(buf, false) { + return 0, r.err + } + + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + n, err := DecodedLen(buf) + if err != nil { + r.err = err + return 0, r.err + } + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + + if n > r.maxBlock { + r.err = ErrCorrupt + return 0, r.err + } + wg.Add(1) + + decoded := <-writtenBlocks + entry := <-reUse + queue <- entry + go func() { + defer wg.Done() + decoded = decoded[:n] + _, err := Decode(decoded, buf) + toRead <- orgBuf + if err != nil { + writtenBlocks <- decoded + setErr(err) + return + } + if crc(decoded) != checksum { + writtenBlocks <- decoded + setErr(ErrCRC) + return + } + entry <- decoded + }() + continue + + case chunkTypeUncompressedData: + + // Section 4.3. Uncompressed data (chunk type 0x01). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if chunkLen > r.maxBufSize { + r.err = ErrCorrupt + return 0, r.err + } + // Grab write buffer + orgBuf := <-writtenBlocks + buf := orgBuf[:checksumSize] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read content. + n := chunkLen - checksumSize + + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + if n > r.maxBlock { + r.err = ErrCorrupt + return 0, r.err + } + // Read uncompressed + buf = orgBuf[:n] + if !r.readFull(buf, false) { + return 0, r.err + } + + if crc(buf) != checksum { + r.err = ErrCRC + return 0, r.err + } + entry := <-reUse + queue <- entry + entry <- buf + continue + + case chunkTypeStreamIdentifier: + // Section 4.1. Stream identifier (chunk type 0xff). + if chunkLen != len(magicBody) { + r.err = ErrCorrupt + return 0, r.err + } + if !r.readFull(r.buf[:len(magicBody)], false) { + return 0, r.err + } + if string(r.buf[:len(magicBody)]) != magicBody { + if string(r.buf[:len(magicBody)]) != magicBodySnappy { + r.err = ErrCorrupt + return 0, r.err + } else { + r.snappyFrame = true + } + } else { + r.snappyFrame = false + } + continue + } + + if chunkType <= 0x7f { + // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). + // fmt.Printf("ERR chunktype: 0x%x\n", chunkType) + r.err = ErrUnsupported + return 0, r.err + } + // Section 4.4 Padding (chunk type 0xfe). + // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). + if chunkLen > maxChunkSize { + // fmt.Printf("ERR chunkLen: 0x%x\n", chunkLen) + r.err = ErrUnsupported + return 0, r.err + } + + // fmt.Printf("skippable: ID: 0x%x, len: 0x%x\n", chunkType, chunkLen) + if !r.skippable(r.buf, chunkLen, false, chunkType) { + return 0, r.err + } + } + return 0, r.err +} + // Skip will skip n bytes forward in the decompressed output. // For larger skips this consumes less CPU and is faster than reading output and discarding it. // CRC is not checked on skipped blocks. 
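
> Editor's note (not part of the diff): the `DecodeConcurrent` hunk above keeps output in stream order by queuing a per-block result channel (`queue chan chan []byte`) while blocks decode in parallel. A standalone sketch of that ordering pattern, with illustrative names rather than the vendored code:

```go
// Channel-of-channels ordering: the writer drains result "slots" in the
// order they were queued, while workers fill their own slot concurrently.
package main

import (
	"fmt"
	"sync"
)

// process stands in for decoding one compressed block.
func process(block int) string {
	return fmt.Sprintf("decoded-%d", block)
}

func main() {
	queue := make(chan chan string, 4) // bounded number of in-flight blocks
	done := make(chan struct{})
	var wg sync.WaitGroup

	// Writer: emits results strictly in the order slots were queued.
	go func() {
		defer close(done)
		for slot := range queue {
			fmt.Println(<-slot)
		}
	}()

	// Reader: queues a slot per block in stream order, decodes in parallel.
	for i := 0; i < 8; i++ {
		slot := make(chan string, 1)
		queue <- slot
		wg.Add(1)
		go func(i int, slot chan string) {
			defer wg.Done()
			slot <- process(i)
		}(i, slot)
	}
	wg.Wait()
	close(queue)
	<-done
}
```
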
@@ -699,8 +955,16 @@ func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) { case io.SeekCurrent: offset += r.blockStart + int64(r.i) case io.SeekEnd: - offset = -offset + if offset > 0 { + return 0, errors.New("seek after end of file") + } + offset = r.index.TotalUncompressed + offset } + + if offset < 0 { + return 0, errors.New("seek before start of file") + } + c, u, err := r.index.Find(offset) if err != nil { return r.blockStart + int64(r.i), err @@ -712,10 +976,6 @@ func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) { return 0, err } - if offset < 0 { - offset = r.index.TotalUncompressed + offset - } - r.i = r.j // Remove rest of current block. if u < offset { // Forward inside block diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go index d9312e5b..88f27c09 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go @@ -5,6 +5,8 @@ package s2 +func _dummy_() + // encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 4294967295 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s index 729dbf53..337a73ca 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s @@ -5,6 +5,15 @@ #include "textflag.h" +// func _dummy_() +TEXT ·_dummy_(SB), $0 +#ifdef GOAMD64_v4 +#ifndef GOAMD64_v3 +#define GOAMD64_v3 +#endif +#endif + RET + // func encodeBlockAsm(dst []byte, src []byte) int // Requires: BMI, SSE2 TEXT ·encodeBlockAsm(SB), $65560-56 @@ -253,17 +262,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -791,17 +789,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -1134,17 +1121,36 @@ memmove_emit_remainder_encodeBlockAsm: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + 
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: @@ -1466,17 +1472,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm4MB: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -1963,17 +1958,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm4MB: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -2276,17 +2260,36 @@ memmove_emit_remainder_encodeBlockAsm4MB: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: @@ -2597,17 +2600,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -2979,17 +2971,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -3189,17 +3170,36 @@ memmove_emit_remainder_encodeBlockAsm12B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 
-emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: @@ -3510,17 +3510,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -3892,17 +3881,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -4102,17 +4080,36 @@ memmove_emit_remainder_encodeBlockAsm10B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: @@ -4423,17 +4420,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -4795,17 +4781,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -4999,17 +4974,36 @@ memmove_emit_remainder_encodeBlockAsm8B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB 
emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: @@ -5225,17 +5219,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -5979,8 +5962,9 @@ memmove_emit_remainder_encodeBetterBlockAsm: MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 @@ -5989,9 +5973,18 @@ memmove_emit_remainder_encodeBetterBlockAsm: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: @@ -6214,17 +6207,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -6911,8 +6893,9 @@ memmove_emit_remainder_encodeBetterBlockAsm4MB: MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2 + JE 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 CMPQ BX, $0x10 @@ -6921,9 +6904,18 @@ memmove_emit_remainder_encodeBetterBlockAsm4MB: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: @@ -7138,17 +7130,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -7689,8 +7670,9 @@ memmove_emit_remainder_encodeBetterBlockAsm12B: MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 @@ -7699,9 +7681,18 @@ memmove_emit_remainder_encodeBetterBlockAsm12B: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: @@ -7916,17 +7907,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -8467,8 +8447,9 @@ memmove_emit_remainder_encodeBetterBlockAsm10B: MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 @@ -8477,9 +8458,18 @@ memmove_emit_remainder_encodeBetterBlockAsm10B: JBE 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: @@ -8694,17 +8684,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -9235,8 +9214,9 @@ memmove_emit_remainder_encodeBetterBlockAsm8B: MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 @@ -9245,9 +9225,18 @@ memmove_emit_remainder_encodeBetterBlockAsm8B: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: @@ -9584,17 +9573,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R10, R10 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R10, R10 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R10, R10 @@ -9918,17 +9896,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -10127,17 +10094,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE 
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: @@ -10448,17 +10434,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R10, R10 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R10, R10 @@ -10739,17 +10714,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -10905,17 +10869,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm64K: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16: @@ -11226,17 +11209,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R10, R10 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R10, R10 @@ -11517,17 +11489,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - 
-#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -11683,17 +11644,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm12B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: @@ -12004,17 +11984,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R10, R10 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R10, R10 @@ -12295,17 +12264,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -12461,17 +12419,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm10B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + 
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: @@ -12782,17 +12759,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R10, R10 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R10, R10 @@ -13071,17 +13037,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -13235,17 +13190,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm8B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: @@ -13461,17 +13435,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -13850,17 +13813,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64 
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: @@ -14068,17 +14050,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -14386,17 +14357,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm64K: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: @@ -14604,17 +14594,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -14922,17 +14901,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8 + JB 
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: @@ -15140,17 +15138,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -15458,17 +15445,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm10B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: @@ -15676,17 +15682,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -15992,17 +15987,36 @@ 
memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: @@ -16644,17 +16658,6 @@ matchlen_loopback_standalone: #ifdef GOAMD64_v3 TZCNTQ BX, BX -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ BX, BX - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ BX, BX diff --git a/vendor/github.com/klauspost/compress/s2/index.go b/vendor/github.com/klauspost/compress/s2/index.go index fd857682..7b24a006 100644 --- a/vendor/github.com/klauspost/compress/s2/index.go +++ b/vendor/github.com/klauspost/compress/s2/index.go @@ -10,6 +10,7 @@ import ( "encoding/json" "fmt" "io" + "sort" ) const ( @@ -100,6 +101,15 @@ func (i *Index) Find(offset int64) (compressedOff, uncompressedOff int64, err er if offset > i.TotalUncompressed { return 0, 0, io.ErrUnexpectedEOF } + if len(i.info) > 200 { + n := sort.Search(len(i.info), func(n int) bool { + return i.info[n].uncompressedOffset > offset + }) + if n == 0 { + n = 1 + } + return i.info[n-1].compressedOffset, i.info[n-1].uncompressedOffset, nil + } for _, info := range i.info { if info.uncompressedOffset > offset { break diff --git a/vendor/github.com/nats-io/jsm.go/natscontext/context.go b/vendor/github.com/nats-io/jsm.go/natscontext/context.go index a9a72fe0..42a3d113 100644 --- a/vendor/github.com/nats-io/jsm.go/natscontext/context.go +++ b/vendor/github.com/nats-io/jsm.go/natscontext/context.go @@ -87,6 +87,33 @@ func New(name string, load bool, opts ...Option) (*Context, error) { } } + c.configureNewContext(opts...) + + return c, nil +} + +// NewFromFile loads a new configuration context from the given filename. 
+// +// When opts is supplied those settings will override what was loaded or supply +// values for an empty context +func NewFromFile(filename string, opts ...Option) (*Context, error) { + c := &Context{ + Name: strings.TrimSuffix(filepath.Base(filename), filepath.Ext(filename)), + config: &settings{}, + path: filename, + } + + err := c.loadActiveContext() + if err != nil { + return nil, err + } + + c.configureNewContext(opts...) + + return c, nil +} + +func (c *Context) configureNewContext(opts ...Option) { // apply supplied overrides for _, opt := range opts { opt(c.config) @@ -95,8 +122,6 @@ func New(name string, load bool, opts ...Option) (*Context, error) { if c.config.NSCLookup == "" && c.config.URL == "" && c.config.nscUrl == "" { c.config.URL = nats.DefaultURL } - - return c, nil } // Connect connects to the NATS server configured by the named context, empty name connects to selected context @@ -291,28 +316,31 @@ func (c *Context) NATSOptions(opts ...nats.Option) ([]nats.Option, error) { } func (c *Context) loadActiveContext() error { - parent, err := parentDir() - if err != nil { - return err - } + if c.path == "" { + parent, err := parentDir() + if err != nil { + return err + } - // none given, lets try to find it via the fs - if c.Name == "" { - c.Name = SelectedContext() + // none given, lets try to find it via the fs if c.Name == "" { - return nil + c.Name = SelectedContext() + if c.Name == "" { + return nil + } } - } - if !validName(c.Name) { - return fmt.Errorf("invalid context name %s", c.Name) - } + if !validName(c.Name) { + return fmt.Errorf("invalid context name %s", c.Name) + } + + if !knownContext(parent, c.Name) { + return fmt.Errorf("unknown context %q", c.Name) + } - if !knownContext(parent, c.Name) { - return fmt.Errorf("unknown context %q", c.Name) + c.path = filepath.Join(parent, "nats", "context", c.Name+".json") } - c.path = filepath.Join(parent, "nats", "context", c.Name+".json") ctxContent, err := ioutil.ReadFile(c.path) if err != nil { return err diff --git a/vendor/github.com/nats-io/jwt/v2/header.go b/vendor/github.com/nats-io/jwt/v2/header.go index 198bf306..eadd4eaa 100644 --- a/vendor/github.com/nats-io/jwt/v2/header.go +++ b/vendor/github.com/nats-io/jwt/v2/header.go @@ -23,7 +23,7 @@ import ( const ( // Version is semantic version. - Version = "2.2.0" + Version = "2.3.0" // TokenTypeJwt is the JWT token type supported JWT tokens // encoded and decoded by this library diff --git a/vendor/github.com/nats-io/nats-server/v2/server/README.md b/vendor/github.com/nats-io/nats-server/v2/server/README.md new file mode 100644 index 00000000..3184eeda --- /dev/null +++ b/vendor/github.com/nats-io/nats-server/v2/server/README.md @@ -0,0 +1,17 @@ +# Tests + +Tests that run on Travis have been split into jobs that run in their own VM in parallel. This reduces the overall running time but also is allowing recycling of a job when we get a flapper as opposed to have to recycle the whole test suite. + +## JetStream Tests + +For JetStream tests, we need to observe a naming convention so that no tests are omitted when running on Travis. + +The script `runTestsOnTravis.sh` will run a given job based on the definition found in "`.travis.yml`". 
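A minimal usage sketch for the NewFromFile helper added above, assuming a hypothetical context file path and the package's existing Connect helper; the sketch is illustrative rather than part of the vendored change:

package main

import (
	"log"

	"github.com/nats-io/jsm.go/natscontext"
)

func main() {
	// Load a context directly from a file path instead of by context name.
	// The path is hypothetical; any saved context JSON file works.
	nctx, err := natscontext.NewFromFile("/etc/nats/context/orders.json")
	if err != nil {
		log.Fatal(err)
	}

	// Connect using the settings carried by the context.
	nc, err := nctx.Connect()
	if err != nil {
		log.Fatal(err)
	}
	defer nc.Close()

	log.Printf("connected via context %q", nctx.Name)
}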
+ +As for the naming convention: + +- All JetStream tests name should start with `TestJetStream` +- Cluster tests should go into `jetstream_cluster_test.go` and start with `TestJetStreamCluster` +- Super-cluster tests should go into `jetstream_super_cluster_test.go` and start with `TestJetStreamSuperCluster` + +Not following this convention means that some tests may not be executed on Travis. diff --git a/vendor/github.com/nats-io/nats-server/v2/server/accounts.go b/vendor/github.com/nats-io/nats-server/v2/server/accounts.go index 36de78d2..d1333955 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/accounts.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/accounts.go @@ -18,6 +18,7 @@ import ( "encoding/hex" "errors" "fmt" + "hash/fnv" "hash/maphash" "io/ioutil" "math" @@ -25,6 +26,7 @@ import ( "net/http" "net/textproto" "reflect" + "regexp" "sort" "strconv" "strings" @@ -74,7 +76,7 @@ type Account struct { imports importMap exports exportMap js *jsAccount - jsLimits *JetStreamAccountLimits + jsLimits map[string]JetStreamAccountLimits limits expired bool incomplete bool @@ -161,6 +163,10 @@ const ( Chunked ) +var commaSeparatorRegEx = regexp.MustCompile(`,\s*`) +var partitionMappingFunctionRegEx = regexp.MustCompile(`{{\s*partition\s*\((.*)\)\s*}}`) +var wildcardMappingFunctionRegEx = regexp.MustCompile(`{{\s*wildcard\s*\((.*)\)\s*}}`) + // String helper. func (rt ServiceRespType) String() string { switch rt { @@ -1700,8 +1706,7 @@ func (a *Account) checkForReverseEntry(reply string, si *serviceImport, checkInt return } - sres := a.imports.rrMap[reply] - if sres == nil { + if sres := a.imports.rrMap[reply]; sres == nil { a.mu.RUnlock() return } @@ -1722,9 +1727,11 @@ func (a *Account) checkForReverseEntry(reply string, si *serviceImport, checkInt // Delete the appropriate entries here based on optional si. a.mu.Lock() + // We need a new lookup here because we have released the lock. + sres := a.imports.rrMap[reply] if si == nil { delete(a.imports.rrMap, reply) - } else { + } else if sres != nil { // Find the one we are looking for.. for i, sre := range sres { if sre.msub == si.from { @@ -1743,6 +1750,8 @@ func (a *Account) checkForReverseEntry(reply string, si *serviceImport, checkInt // If we are here we no longer have interest and we have // response entries that we should clean up. if si == nil { + // sres is now known to have been removed from a.imports.rrMap, so we + // can safely (data race wise) iterate through. for _, sre := range sres { acc := sre.acc var trackingCleanup bool @@ -3030,9 +3039,6 @@ func (s *Server) updateAccountClaimsWithRefresh(a *Account, ac *jwt.AccountClaim } jsEnabled := s.JetStreamEnabled() - if jsEnabled && a == s.SystemAccount() { - s.checkJetStreamExports() - } streamTokenExpirationChanged := false serviceTokenExpirationChanged := false @@ -3268,15 +3274,41 @@ func (s *Server) updateAccountClaimsWithRefresh(a *Account, ac *jwt.AccountClaim a.srv = s } - // Setup js limits regardless of whether this server has jsEnabled. 
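The accounts.go hunk that follows replaces the single account-level JetStream limit with a map keyed by tier name. A hedged sketch of how such tiers might be declared in an account JWT with the jwt/v2 package; the tier names and sizes are illustrative assumptions:

package main

import (
	"fmt"

	"github.com/nats-io/jwt/v2"
	"github.com/nats-io/nkeys"
)

func main() {
	// Hypothetical operator and account identities.
	okp, _ := nkeys.CreateOperator()
	akp, _ := nkeys.CreateAccount()
	apub, _ := akp.PublicKey()

	ac := jwt.NewAccountClaims(apub)
	// Per-tier limits; when set, the server builds a.jsLimits keyed by tier.
	ac.Limits.JetStreamTieredLimits = jwt.JetStreamTieredLimits{
		"R1": {DiskStorage: 1 << 30, MemoryStorage: 128 << 20, Streams: 10, Consumer: 100},
		"R3": {DiskStorage: 8 << 30, Streams: 5, Consumer: 50, MaxAckPending: 1000},
	}

	token, err := ac.Encode(okp)
	if err != nil {
		panic(err)
	}
	fmt.Println(token)
}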
- if ac.Limits.JetStreamLimits.DiskStorage != 0 || ac.Limits.JetStreamLimits.MemoryStorage != 0 { - // JetStreamAccountLimits and jwt.JetStreamLimits use same value for unlimited - a.jsLimits = &JetStreamAccountLimits{ - MaxMemory: ac.Limits.JetStreamLimits.MemoryStorage, - MaxStore: ac.Limits.JetStreamLimits.DiskStorage, - MaxStreams: int(ac.Limits.JetStreamLimits.Streams), - MaxConsumers: int(ac.Limits.JetStreamLimits.Consumer), - MaxBytesRequired: ac.Limits.JetStreamLimits.MaxBytesRequired, + if ac.Limits.IsJSEnabled() { + toUnlimited := func(value int64) int64 { + if value > 0 { + return value + } + return -1 + } + if ac.Limits.JetStreamLimits.DiskStorage != 0 || ac.Limits.JetStreamLimits.MemoryStorage != 0 { + // JetStreamAccountLimits and jwt.JetStreamLimits use same value for unlimited + a.jsLimits = map[string]JetStreamAccountLimits{ + _EMPTY_: { + MaxMemory: ac.Limits.JetStreamLimits.MemoryStorage, + MaxStore: ac.Limits.JetStreamLimits.DiskStorage, + MaxStreams: int(ac.Limits.JetStreamLimits.Streams), + MaxConsumers: int(ac.Limits.JetStreamLimits.Consumer), + MemoryMaxStreamBytes: toUnlimited(ac.Limits.JetStreamLimits.MemoryMaxStreamBytes), + StoreMaxStreamBytes: toUnlimited(ac.Limits.JetStreamLimits.DiskMaxStreamBytes), + MaxBytesRequired: ac.Limits.JetStreamLimits.MaxBytesRequired, + MaxAckPending: int(toUnlimited(ac.Limits.JetStreamLimits.MaxAckPending)), + }, + } + } else { + a.jsLimits = map[string]JetStreamAccountLimits{} + for t, l := range ac.Limits.JetStreamTieredLimits { + a.jsLimits[t] = JetStreamAccountLimits{ + MaxMemory: l.MemoryStorage, + MaxStore: l.DiskStorage, + MaxStreams: int(l.Streams), + MaxConsumers: int(l.Consumer), + MemoryMaxStreamBytes: toUnlimited(l.MemoryMaxStreamBytes), + StoreMaxStreamBytes: toUnlimited(l.DiskMaxStreamBytes), + MaxBytesRequired: l.MaxBytesRequired, + MaxAckPending: int(toUnlimited(l.MaxAckPending)), + } + } } } else if a.jsLimits != nil { // covers failed update followed by disable @@ -4132,18 +4164,65 @@ type transform struct { src, dest string dtoks []string stoks []string - dtpi []int8 + dtpi [][]int // destination token position indexes + dtpinp []int32 // destination token position index number of partitions +} + +func getMappingFunctionArgs(functionRegEx *regexp.Regexp, token string) []string { + commandStrings := functionRegEx.FindStringSubmatch(token) + if len(commandStrings) > 1 { + return commaSeparatorRegEx.Split(commandStrings[1], -1) + } + return nil } -// Helper to pull raw place holder index. Returns -1 if not a place holder. -func placeHolderIndex(token string) int { - if len(token) > 1 && token[0] == '$' { - var tp int - if n, err := fmt.Sscanf(token, "$%d", &tp); err == nil && n == 1 { - return tp +// Helper to pull raw place holder indexes and number of partitions. Returns -1 if not a place holder. +func placeHolderIndex(token string) ([]int, int32, error) { + if len(token) > 1 { + // old $1, $2, etc... 
mapping format still supported to maintain backwards compatibility + if token[0] == '$' { // simple non-partition mapping + tp, err := strconv.Atoi(token[1:]) + if err != nil { + return []int{-1}, -1, nil + } + return []int{tp}, -1, nil + } + + // New 'moustache' style mapping + // wildcard(wildcard token index) (equivalent to $) + args := getMappingFunctionArgs(wildcardMappingFunctionRegEx, token) + if args != nil { + if len(args) == 1 { + tp, err := strconv.Atoi(strings.Trim(args[0], " ")) + if err != nil { + return []int{}, -1, err + } + return []int{tp}, -1, nil + } + } + + // partition(number of partitions, token1, token2, ...) + args = getMappingFunctionArgs(partitionMappingFunctionRegEx, token) + if args != nil { + if len(args) >= 2 { + tphnp, err := strconv.Atoi(strings.Trim(args[0], " ")) + if err != nil { + return []int{}, -1, err + } + var numPositions = len(args[1:]) + tps := make([]int, numPositions) + for ti, t := range args[1:] { + i, err := strconv.Atoi(strings.Trim(t, " ")) + if err != nil { + return []int{}, -1, err + } + tps[ti] = i + } + return tps, int32(tphnp), nil + } } } - return -1 + return []int{-1}, -1, nil } // newTransform will create a new transform checking the src and dest subjects for accuracy. @@ -4157,7 +4236,8 @@ func newTransform(src, dest string) (*transform, error) { return nil, ErrBadSubject } - var dtpi []int8 + var dtpi [][]int + var dtpinb []int32 // If the src has partial wildcards then the dest needs to have the token place markers. if npwcs > 0 || hasFwc { @@ -4171,25 +4251,33 @@ func newTransform(src, dest string) (*transform, error) { nphs := 0 for _, token := range dtokens { - tp := placeHolderIndex(token) - if tp >= 0 { - if tp > npwcs { - return nil, ErrBadSubject - } + tp, nb, err := placeHolderIndex(token) + if err != nil { + return nil, ErrBadSubjectMappingDestination + } + if tp[0] >= 0 { nphs++ // Now build up our runtime mapping from dest to source tokens. - dtpi = append(dtpi, int8(sti[tp])) + var stis []int + for _, position := range tp { + if position > npwcs { + return nil, ErrBadSubjectMappingDestination + } + stis = append(stis, sti[position]) + } + dtpi = append(dtpi, stis) + dtpinb = append(dtpinb, nb) } else { - dtpi = append(dtpi, -1) + dtpi = append(dtpi, []int{-1}) + dtpinb = append(dtpinb, -1) } } - - if nphs != npwcs { - return nil, ErrBadSubject + if nphs < npwcs { + return nil, ErrBadSubjectMappingDestination } } - return &transform{src: src, dest: dest, dtoks: dtokens, stoks: stokens, dtpi: dtpi}, nil + return &transform{src: src, dest: dest, dtoks: dtokens, stoks: stokens, dtpi: dtpi, dtpinp: dtpinb}, nil } // match will take a literal published subject that is associated with a client and will match and transform @@ -4233,6 +4321,13 @@ func (tr *transform) transformSubject(subject string) (string, error) { return tr.transform(tts) } +func (tr *transform) getHashPartition(key []byte, numBuckets int) string { + h := fnv.New32a() + h.Write(key) + + return strconv.Itoa(int(h.Sum32() % uint32(numBuckets))) +} + // Do a transform on the subject to the dest subject. func (tr *transform) transform(tokens []string) (string, error) { if len(tr.dtpi) == 0 { @@ -4248,7 +4343,7 @@ func (tr *transform) transform(tokens []string) (string, error) { li := len(tr.dtpi) - 1 for i, index := range tr.dtpi { // <0 means use destination token. 
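The new placeHolderIndex and getHashPartition code above adds a moustache-style mapping syntax, so a destination token may be written as {{wildcard(1)}} or {{partition(3,1)}} in addition to the legacy $1 form. A self-contained sketch of the same FNV-1a bucket selection, using hypothetical subjects:

package main

import (
	"fmt"
	"hash/fnv"
	"strconv"
)

// partitionToken mirrors the getHashPartition logic above: hash the selected
// wildcard tokens with FNV-1a and map the result onto numBuckets partitions.
func partitionToken(numBuckets int, wildcardTokens ...string) string {
	h := fnv.New32a()
	for _, t := range wildcardTokens {
		h.Write([]byte(t))
	}
	return strconv.Itoa(int(h.Sum32() % uint32(numBuckets)))
}

func main() {
	// With a mapping such as "orders.*" -> "orders.{{partition(3,1)}}.{{wildcard(1)}}"
	// (hypothetical subjects), a published subject "orders.customer42" is rewritten
	// to "orders.<bucket>.customer42", where <bucket> is one of 0, 1 or 2.
	fmt.Println("orders." + partitionToken(3, "customer42") + ".customer42")
}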
- if index < 0 { + if index[0] < 0 { token = tr.dtoks[i] // Break if fwc if len(token) == 1 && token[0] == fwc { @@ -4256,7 +4351,18 @@ func (tr *transform) transform(tokens []string) (string, error) { } } else { // >= 0 means use source map index to figure out which source token to pull. - token = tokens[index] + if tr.dtpinp[i] > 0 { // there is a valid (i.e. not -1) value for number of partitions, this is a partition transform token + var ( + _buffer [64]byte + keyForHashing = _buffer[:0] + ) + for _, sourceToken := range tr.dtpi[i] { + keyForHashing = append(keyForHashing, []byte(tokens[sourceToken])...) + } + token = tr.getHashPartition(keyForHashing, int(tr.dtpinp[i])) + } else { // back to normal substitution + token = tokens[tr.dtpi[i][0]] + } } b.WriteString(token) if i < li { diff --git a/vendor/github.com/nats-io/nats-server/v2/server/client.go b/vendor/github.com/nats-io/nats-server/v2/server/client.go index 5d372cd4..d7a8da09 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/client.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/client.go @@ -201,6 +201,8 @@ const ( ClusterNameConflict DuplicateRemoteLeafnodeConnection DuplicateClientID + DuplicateServerName + MinimumVersionRequired ) // Some flags passed to processMsgResults @@ -1225,6 +1227,15 @@ func (c *client) readLoop(pre []byte) { // to process messages, etc. for i := 0; i < len(bufs); i++ { if err := c.parse(bufs[i]); err != nil { + if err == ErrMinimumVersionRequired { + // Special case here, currently only for leaf node connections. + // When process the CONNECT protocol, if the minimum version + // required was not met, an error was printed and sent back to + // the remote, and connection was closed after a certain delay + // (to avoid "rapid" reconnection from the remote). + // We don't need to do any of the things below, simply return. + return + } if dur := time.Since(start); dur >= readLoopReportThreshold { c.Warnf("Readloop processing time: %v", dur) } @@ -2430,7 +2441,7 @@ func (c *client) processSubEx(subject, queue, bsid []byte, cb msgHandler, noForw // allow = ["foo", "foo v1"] -> can subscribe to 'foo' but can only queue subscribe to 'foo v1' // if sub.queue != nil { - if !c.canQueueSubscribe(string(sub.subject), string(sub.queue)) { + if !c.canSubscribe(string(sub.subject), string(sub.queue)) { c.mu.Unlock() c.subPermissionViolation(sub) return nil, ErrSubscribePermissionViolation @@ -2643,7 +2654,7 @@ func (c *client) addShadowSub(sub *subscription, ime *ime) (*subscription, error nsub := *sub // copy nsub.im = im - if !im.usePub && ime.dyn { + if !im.usePub && ime.dyn && im.tr != nil { if im.rtr == nil { im.rtr = im.tr.reverse() } @@ -2656,7 +2667,7 @@ func (c *client) addShadowSub(sub *subscription, ime *ime) (*subscription, error return nil, err } nsub.subject = []byte(subj) - } else if !im.usePub || !ime.dyn { + } else if !im.usePub || (im.usePub && ime.overlapSubj != _EMPTY_) || !ime.dyn { if ime.overlapSubj != _EMPTY_ { nsub.subject = []byte(ime.overlapSubj) } else { @@ -2680,17 +2691,27 @@ func (c *client) addShadowSub(sub *subscription, ime *ime) (*subscription, error // canSubscribe determines if the client is authorized to subscribe to the // given subject. Assumes caller is holding lock. -func (c *client) canSubscribe(subject string) bool { +func (c *client) canSubscribe(subject string, optQueue ...string) bool { if c.perms == nil { return true } allowed := true + // Optional queue group. 
+ var queue string + if len(optQueue) > 0 { + queue = optQueue[0] + } + // Check allow list. If no allow list that means all are allowed. Deny can overrule. if c.perms.sub.allow != nil { r := c.perms.sub.allow.Match(subject) - allowed = len(r.psubs) != 0 + allowed = len(r.psubs) > 0 + if queue != _EMPTY_ && len(r.qsubs) > 0 { + // If the queue appears in the allow list, then DO allow. + allowed = queueMatches(queue, r.qsubs) + } // Leafnodes operate slightly differently in that they allow broader scoped subjects. // They will prune based on publish perms before sending to a leafnode client. if !allowed && c.kind == LEAF && subjectHasWildcard(subject) { @@ -2703,6 +2724,11 @@ func (c *client) canSubscribe(subject string) bool { r := c.perms.sub.deny.Match(subject) allowed = len(r.psubs) == 0 + if queue != _EMPTY_ && len(r.qsubs) > 0 { + // If the queue appears in the deny list, then DO NOT allow. + allowed = !queueMatches(queue, r.qsubs) + } + // We use the actual subscription to signal us to spin up the deny mperms // and cache. We check if the subject is a wildcard that contains any of // the deny clauses. @@ -2738,42 +2764,6 @@ func queueMatches(queue string, qsubs [][]*subscription) bool { return false } -func (c *client) canQueueSubscribe(subject, queue string) bool { - if c.perms == nil { - return true - } - - allowed := true - - if c.perms.sub.allow != nil { - r := c.perms.sub.allow.Match(subject) - - // If perms DO NOT have queue name, then psubs will be greater than - // zero. If perms DO have queue name, then qsubs will be greater than - // zero. - allowed = len(r.psubs) > 0 - if len(r.qsubs) > 0 { - // If the queue appears in the allow list, then DO allow. - allowed = queueMatches(queue, r.qsubs) - } - } - - if allowed && c.perms.sub.deny != nil { - r := c.perms.sub.deny.Match(subject) - - // If perms DO NOT have queue name, then psubs will be greater than - // zero. If perms DO have queue name, then qsubs will be greater than - // zero. - allowed = len(r.psubs) == 0 - if len(r.qsubs) > 0 { - // If the queue appears in the deny list, then DO NOT allow. - allowed = !queueMatches(queue, r.qsubs) - } - } - - return allowed -} - // Low level unsubscribe for a given client. func (c *client) unsubscribe(acc *Account, sub *subscription, force, remove bool) { c.mu.Lock() @@ -2921,21 +2911,26 @@ func (c *client) checkDenySub(subject string) bool { // Create a message header for routes or leafnodes. Header and origin cluster aware. func (c *client) msgHeaderForRouteOrLeaf(subj, reply []byte, rt *routeTarget, acc *Account) []byte { hasHeader := c.pa.hdr > 0 - canReceiveHeader := rt.sub.client.headers + subclient := rt.sub.client + canReceiveHeader := subclient.headers mh := c.msgb[:msgHeadProtoLen] - kind := rt.sub.client.kind + kind := subclient.kind var lnoc bool if kind == ROUTER { // If we are coming from a leaf with an origin cluster we need to handle differently // if we can. We will send a route based LMSG which has origin cluster and headers // by default. - if c.kind == LEAF && c.remoteCluster() != _EMPTY_ && rt.sub.client.route.lnoc { + if c.kind == LEAF && c.remoteCluster() != _EMPTY_ { + subclient.mu.Lock() + lnoc = subclient.route.lnoc + subclient.mu.Unlock() + } + if lnoc { mh[0] = 'L' mh = append(mh, c.remoteCluster()...) mh = append(mh, ' ') - lnoc = true } else { // Router (and Gateway) nodes are RMSG. Set here since leafnodes may rewrite. 
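The canSubscribe change above folds queue-group checks into the main permission check and removes the separate canQueueSubscribe helper. A hedged client-side sketch, assuming a server whose user has subscribe permissions allow = ["ORDERS.*", "ORDERS.* workers"]; the subjects and queue names are hypothetical:

package main

import (
	"log"

	"github.com/nats-io/nats.go"
)

func main() {
	nc, err := nats.Connect(nats.DefaultURL)
	if err != nil {
		log.Fatal(err)
	}
	defer nc.Close()

	// Plain subscription: matched by the "ORDERS.*" allow entry.
	if _, err := nc.SubscribeSync("ORDERS.created"); err != nil {
		log.Fatal(err)
	}

	// Queue subscription: only the queue group named in the allow entry is
	// permitted; other group names are rejected by the server with an
	// asynchronous permissions violation.
	if _, err := nc.QueueSubscribeSync("ORDERS.created", "workers"); err != nil {
		log.Fatal(err)
	}
}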
mh[0] = 'R' @@ -3477,8 +3472,11 @@ func isReservedReply(reply []byte) bool { if isServiceReply(reply) { return true } + rLen := len(reply) // Faster to check with string([:]) than byte-by-byte - if len(reply) > gwReplyPrefixLen && string(reply[:gwReplyPrefixLen]) == gwReplyPrefix { + if rLen > jsAckPreLen && string(reply[:jsAckPreLen]) == jsAckPre { + return true + } else if rLen > gwReplyPrefixLen && string(reply[:gwReplyPrefixLen]) == gwReplyPrefix { return true } return false @@ -4087,11 +4085,13 @@ func (c *client) processMsgResults(acc *Account, r *SublistResult, msg, deliver, // Check for JetStream encoded reply subjects. // For now these will only be on $JS.ACK prefixed reply subjects. + var remapped bool if len(creply) > 0 && c.kind != CLIENT && c.kind != SYSTEM && c.kind != JETSTREAM && c.kind != ACCOUNT && bytes.HasPrefix(creply, []byte(jsAckPre)) { // We need to rewrite the subject and the reply. if li := bytes.LastIndex(creply, []byte("@")); li != -1 && li < len(creply)-1 { + remapped = true subj, creply = creply[li+1:], creply[:li] } } @@ -4138,13 +4138,18 @@ func (c *client) processMsgResults(acc *Account, r *SublistResult, msg, deliver, continue } if sub.im.tr != nil { - to, _ := sub.im.tr.transformSubject(string(dsubj)) + to, _ := sub.im.tr.transformSubject(string(subject)) dsubj = append(_dsubj[:0], to...) } else if sub.im.usePub { dsubj = append(_dsubj[:0], subj...) } else { dsubj = append(_dsubj[:0], sub.im.to...) } + + // Make sure deliver is set if inbound from a route. + if remapped && (c.kind == GATEWAY || c.kind == ROUTER || c.kind == LEAF) { + deliver = subj + } // If we are mapping for a deliver subject we will reverse roles. // The original subj we set from above is correct for the msg header, // but we need to transform the deliver subject to properly route. @@ -4274,13 +4279,23 @@ func (c *client) processMsgResults(acc *Account, r *SublistResult, msg, deliver, continue } if sub.im.tr != nil { - to, _ := sub.im.tr.transformSubject(string(subj)) + to, _ := sub.im.tr.transformSubject(string(subject)) dsubj = append(_dsubj[:0], to...) } else if sub.im.usePub { dsubj = append(_dsubj[:0], subj...) } else { dsubj = append(_dsubj[:0], sub.im.to...) } + // Make sure deliver is set if inbound from a route. + if remapped && (c.kind == GATEWAY || c.kind == ROUTER || c.kind == LEAF) { + deliver = subj + } + // If we are mapping for a deliver subject we will reverse roles. + // The original subj we set from above is correct for the msg header, + // but we need to transform the deliver subject to properly route. + if len(deliver) > 0 { + dsubj, subj = subj, dsubj + } } mh := c.msgHeader(dsubj, creply, sub) @@ -4648,7 +4663,7 @@ func (c *client) processSubsOnConfigReload(awcsti map[string]struct{}) { // Just checking to rebuild mperms under the lock, will collect removed though here. // Only collect under subs array of canSubscribe and checkAcc true. canSub := c.canSubscribe(string(sub.subject)) - canQSub := sub.queue != nil && c.canQueueSubscribe(string(sub.subject), string(sub.queue)) + canQSub := sub.queue != nil && c.canSubscribe(string(sub.subject), string(sub.queue)) if !canSub && !canQSub { removed = append(removed, sub) @@ -5319,6 +5334,15 @@ func (c *client) Warnf(format string, v ...interface{}) { c.srv.Warnf(format, v...) } +func (c *client) RateLimitWarnf(format string, v ...interface{}) { + // Do the check before adding the client info to the format... + statement := fmt.Sprintf(format, v...) 
+ if _, loaded := c.srv.rateLimitLogging.LoadOrStore(statement, time.Now()); loaded { + return + } + c.Warnf("%s", statement) +} + // Set the very first PING to a lower interval to capture the initial RTT. // After that the PING interval will be set to the user defined value. // Client lock should be held. diff --git a/vendor/github.com/nats-io/nats-server/v2/server/const.go b/vendor/github.com/nats-io/nats-server/v2/server/const.go index 396b900b..12902b1b 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/const.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/const.go @@ -41,7 +41,7 @@ var ( const ( // VERSION is the current version for the server. - VERSION = "2.7.5-beta" + VERSION = "2.8.4" // PROTO is the currently supported protocol. // 0 was the original diff --git a/vendor/github.com/nats-io/nats-server/v2/server/consumer.go b/vendor/github.com/nats-io/nats-server/v2/server/consumer.go index 73bd2e42..9fce5e65 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/consumer.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/consumer.go @@ -25,6 +25,7 @@ import ( "strconv" "strings" "sync" + "sync/atomic" "time" "github.com/nats-io/nuid" @@ -67,8 +68,9 @@ type ConsumerConfig struct { HeadersOnly bool `json:"headers_only,omitempty"` // Pull based options. - MaxRequestBatch int `json:"max_batch,omitempty"` - MaxRequestExpires time.Duration `json:"max_expires,omitempty"` + MaxRequestBatch int `json:"max_batch,omitempty"` + MaxRequestExpires time.Duration `json:"max_expires,omitempty"` + MaxRequestMaxBytes int `json:"max_bytes,omitempty"` // Push based consumers. DeliverSubject string `json:"deliver_subject,omitempty"` @@ -77,6 +79,11 @@ type ConsumerConfig struct { // Ephemeral inactivity threshold. InactiveThreshold time.Duration `json:"inactive_threshold,omitempty"` + // Generally inherited by parent stream and other markers, now can be configured directly. + Replicas int `json:"num_replicas"` + // Force memory storage. + MemoryStorage bool `json:"mem_storage,omitempty"` + // Don't add to general clients. Direct bool `json:"direct,omitempty"` } @@ -198,6 +205,10 @@ var ( // Consumer is a jetstream consumer. type consumer struct { + // Atomic used to notify that we want to process an ack. + // This will be checked in checkPending to abort processing + // and let ack be processed in priority. + awl int64 mu sync.RWMutex js *jetStream mset *stream @@ -212,8 +223,8 @@ type consumer struct { dseq uint64 adflr uint64 asflr uint64 - sgap uint64 - lsgap uint64 + npc uint64 + npcm uint64 dsubj string qgroup string lss *lastSeqSkipList @@ -252,6 +263,7 @@ type consumer struct { inch chan bool sfreq int32 ackEventT string + nakEventT string deliveryExcEventT string created time.Time ldt time.Time @@ -259,17 +271,22 @@ type consumer struct { closed bool // Clustered. - ca *consumerAssignment - node RaftNode - infoSub *subscription - lqsent time.Time - prm map[string]struct{} - prOk bool + ca *consumerAssignment + node RaftNode + infoSub *subscription + lqsent time.Time + prm map[string]struct{} + prOk bool + uch chan struct{} + retention RetentionPolicy // R>1 proposals pch chan struct{} phead *proposal ptail *proposal + + // Ack queue + ackMsgs *ipQueue } type proposal struct { @@ -285,13 +302,13 @@ const ( JsDeleteWaitTimeDefault = 5 * time.Second // JsFlowControlMaxPending specifies default pending bytes during flow control that can be // outstanding. 
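ConsumerConfig above gains MaxRequestMaxBytes alongside the existing pull limits, plus per-consumer Replicas and MemoryStorage. A hedged sketch of a pull consumer definition using the server's exported config type; the durable name and limits are hypothetical:

package main

import (
	"fmt"
	"time"

	"github.com/nats-io/nats-server/v2/server"
)

func main() {
	cfg := &server.ConsumerConfig{
		Durable:            "orders-worker",
		AckPolicy:          server.AckExplicit,
		AckWait:            30 * time.Second,
		MaxRequestBatch:    128,             // cap messages per pull request
		MaxRequestMaxBytes: 1 << 20,         // new: cap bytes per pull request
		MaxRequestExpires:  2 * time.Minute, // cap how long a pull request may wait
	}
	fmt.Printf("%+v\n", cfg)
}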
- JsFlowControlMaxPending = 1 * 1024 * 1024 + JsFlowControlMaxPending = 32 * 1024 * 1024 // JsDefaultMaxAckPending is set for consumers with explicit ack that do not set the max ack pending. - JsDefaultMaxAckPending = 20_000 + JsDefaultMaxAckPending = 1000 ) // Helper function to set consumer config defaults from above. -func setConsumerConfigDefaults(config *ConsumerConfig) { +func setConsumerConfigDefaults(config *ConsumerConfig, lim *JSLimitOpts, accLim *JetStreamAccountLimits) { // Set to default if not specified. if config.DeliverSubject == _EMPTY_ && config.MaxWaiting == 0 { config.MaxWaiting = JSWaitQueueDefaultMax @@ -310,106 +327,118 @@ func setConsumerConfigDefaults(config *ConsumerConfig) { } // Set proper default for max ack pending if we are ack explicit and none has been set. if (config.AckPolicy == AckExplicit || config.AckPolicy == AckAll) && config.MaxAckPending == 0 { - config.MaxAckPending = JsDefaultMaxAckPending + accPending := JsDefaultMaxAckPending + if lim.MaxAckPending > 0 && lim.MaxAckPending < accPending { + accPending = lim.MaxAckPending + } + if accLim.MaxAckPending > 0 && accLim.MaxAckPending < accPending { + accPending = accLim.MaxAckPending + } + config.MaxAckPending = accPending + } + // if applicable set max request batch size + if config.DeliverSubject == _EMPTY_ && config.MaxRequestBatch == 0 && lim.MaxRequestBatch > 0 { + config.MaxRequestBatch = lim.MaxRequestBatch } } func (mset *stream) addConsumer(config *ConsumerConfig) (*consumer, error) { - return mset.addConsumerWithAssignment(config, _EMPTY_, nil) + return mset.addConsumerWithAssignment(config, _EMPTY_, nil, false) } -func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname string, ca *consumerAssignment) (*consumer, error) { - mset.mu.RLock() - s, jsa := mset.srv, mset.jsa - mset.mu.RUnlock() +// Check the consumer config. If we are recovering don't check filter subjects. +func checkConsumerCfg( + config *ConsumerConfig, + srvLim *JSLimitOpts, + cfg *StreamConfig, + acc *Account, + accLim *JetStreamAccountLimits, + isRecovering bool, +) *ApiError { - // If we do not have the consumer currently assigned to us in cluster mode we will proceed but warn. - // This can happen on startup with restored state where on meta replay we still do not have - // the assignment. Running in single server mode this always returns true. - if oname != _EMPTY_ && !jsa.consumerAssigned(mset.name(), oname) { - s.Debugf("Consumer %q > %q does not seem to be assigned to this server", mset.name(), oname) - } - - if config == nil { - return nil, NewJSConsumerConfigRequiredError() + // Check if replicas is defined but exceeds parent stream. + if config.Replicas > 0 && config.Replicas > cfg.Replicas { + return NewJSConsumerReplicasExceedsStreamError() } - // Make sure we have sane defaults. - setConsumerConfigDefaults(config) - // Check if we have a BackOff defined that MaxDeliver is within range etc. if lbo := len(config.BackOff); lbo > 0 && config.MaxDeliver <= lbo { - return nil, NewJSConsumerMaxDeliverBackoffError() + return NewJSConsumerMaxDeliverBackoffError() } if len(config.Description) > JSMaxDescriptionLen { - return nil, NewJSConsumerDescriptionTooLongError(JSMaxDescriptionLen) + return NewJSConsumerDescriptionTooLongError(JSMaxDescriptionLen) } - var err error // For now expect a literal subject if its not empty. Empty means work queue mode (pull mode). 
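setConsumerConfigDefaults above lowers the default max ack pending from 20,000 to 1,000 and clamps it by the server-wide and account (tier) limits when those are set. A small restatement of that selection with hypothetical limits:

package main

import "fmt"

// effectiveMaxAckPending restates the new default: start from
// JsDefaultMaxAckPending (1000) and take the smallest configured
// server-wide or account/tier MaxAckPending limit (0 means not set).
func effectiveMaxAckPending(serverLimit, accountLimit int) int {
	pending := 1000
	if serverLimit > 0 && serverLimit < pending {
		pending = serverLimit
	}
	if accountLimit > 0 && accountLimit < pending {
		pending = accountLimit
	}
	return pending
}

func main() {
	fmt.Println(effectiveMaxAckPending(5000, 0))   // server cap above default: 1000
	fmt.Println(effectiveMaxAckPending(5000, 256)) // account tier cap wins: 256
}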
if config.DeliverSubject != _EMPTY_ { if !subjectIsLiteral(config.DeliverSubject) { - return nil, NewJSConsumerDeliverToWildcardsError() + return NewJSConsumerDeliverToWildcardsError() } if !IsValidSubject(config.DeliverSubject) { - return nil, NewJSConsumerInvalidDeliverSubjectError() + return NewJSConsumerInvalidDeliverSubjectError() } - if mset.deliveryFormsCycle(config.DeliverSubject) { - return nil, NewJSConsumerDeliverCycleError() + if deliveryFormsCycle(cfg, config.DeliverSubject) { + return NewJSConsumerDeliverCycleError() } if config.MaxWaiting != 0 { - return nil, NewJSConsumerPushMaxWaitingError() + return NewJSConsumerPushMaxWaitingError() } if config.MaxAckPending > 0 && config.AckPolicy == AckNone { - return nil, NewJSConsumerMaxPendingAckPolicyRequiredError() + return NewJSConsumerMaxPendingAckPolicyRequiredError() } if config.Heartbeat > 0 && config.Heartbeat < 100*time.Millisecond { - return nil, NewJSConsumerSmallHeartbeatError() + return NewJSConsumerSmallHeartbeatError() } } else { // Pull mode / work queue mode require explicit ack. if config.AckPolicy == AckNone { - return nil, NewJSConsumerPullRequiresAckError() + return NewJSConsumerPullRequiresAckError() } if config.RateLimit > 0 { - return nil, NewJSConsumerPullWithRateLimitError() + return NewJSConsumerPullWithRateLimitError() } if config.MaxWaiting < 0 { - return nil, NewJSConsumerMaxWaitingNegativeError() + return NewJSConsumerMaxWaitingNegativeError() } if config.Heartbeat > 0 { - return nil, NewJSConsumerHBRequiresPushError() + return NewJSConsumerHBRequiresPushError() } if config.FlowControl { - return nil, NewJSConsumerFCRequiresPushError() + return NewJSConsumerFCRequiresPushError() } if config.MaxRequestBatch < 0 { - return nil, NewJSConsumerMaxRequestBatchNegativeError() + return NewJSConsumerMaxRequestBatchNegativeError() } if config.MaxRequestExpires != 0 && config.MaxRequestExpires < time.Millisecond { - return nil, NewJSConsumerMaxRequestExpiresToSmallError() + return NewJSConsumerMaxRequestExpiresToSmallError() + } + if srvLim.MaxRequestBatch > 0 && config.MaxRequestBatch > srvLim.MaxRequestBatch { + return NewJSConsumerMaxRequestBatchExceededError(srvLim.MaxRequestBatch) } } + if srvLim.MaxAckPending > 0 && config.MaxAckPending > srvLim.MaxAckPending { + return NewJSConsumerMaxPendingAckExcessError(srvLim.MaxAckPending) + } + if accLim.MaxAckPending > 0 && config.MaxAckPending > accLim.MaxAckPending { + return NewJSConsumerMaxPendingAckExcessError(accLim.MaxAckPending) + } // Direct need to be non-mapped ephemerals. if config.Direct { if config.DeliverSubject == _EMPTY_ { - return nil, NewJSConsumerDirectRequiresPushError() + return NewJSConsumerDirectRequiresPushError() } if isDurableConsumer(config) { - return nil, NewJSConsumerDirectRequiresEphemeralError() - } - if ca != nil { - return nil, NewJSConsumerOnMappedError() + return NewJSConsumerDirectRequiresEphemeralError() } } // As best we can make sure the filtered subject is valid. 
- if config.FilterSubject != _EMPTY_ { - subjects, hasExt := mset.allSubjects() + if config.FilterSubject != _EMPTY_ && !isRecovering { + subjects, hasExt := allSubjects(cfg, acc) if !validFilteredSubject(config.FilterSubject, subjects) && !hasExt { - return nil, NewJSConsumerFilterNotSubsetError() + return NewJSConsumerFilterNotSubsetError() } } @@ -425,67 +454,115 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri switch config.DeliverPolicy { case DeliverAll: if config.OptStartSeq > 0 { - return nil, NewJSConsumerInvalidPolicyError(badStart("all", "sequence")) + return NewJSConsumerInvalidPolicyError(badStart("all", "sequence")) } if config.OptStartTime != nil { - return nil, NewJSConsumerInvalidPolicyError(badStart("all", "time")) + return NewJSConsumerInvalidPolicyError(badStart("all", "time")) } case DeliverLast: if config.OptStartSeq > 0 { - return nil, NewJSConsumerInvalidPolicyError(badStart("last", "sequence")) + return NewJSConsumerInvalidPolicyError(badStart("last", "sequence")) } if config.OptStartTime != nil { - return nil, NewJSConsumerInvalidPolicyError(badStart("last", "time")) + return NewJSConsumerInvalidPolicyError(badStart("last", "time")) } case DeliverLastPerSubject: if config.OptStartSeq > 0 { - return nil, NewJSConsumerInvalidPolicyError(badStart("last per subject", "sequence")) + return NewJSConsumerInvalidPolicyError(badStart("last per subject", "sequence")) } if config.OptStartTime != nil { - return nil, NewJSConsumerInvalidPolicyError(badStart("last per subject", "time")) + return NewJSConsumerInvalidPolicyError(badStart("last per subject", "time")) } if config.FilterSubject == _EMPTY_ { - return nil, NewJSConsumerInvalidPolicyError(notSet("last per subject", "filter subject")) + return NewJSConsumerInvalidPolicyError(notSet("last per subject", "filter subject")) } case DeliverNew: if config.OptStartSeq > 0 { - return nil, NewJSConsumerInvalidPolicyError(badStart("new", "sequence")) + return NewJSConsumerInvalidPolicyError(badStart("new", "sequence")) } if config.OptStartTime != nil { - return nil, NewJSConsumerInvalidPolicyError(badStart("new", "time")) + return NewJSConsumerInvalidPolicyError(badStart("new", "time")) } case DeliverByStartSequence: if config.OptStartSeq == 0 { - return nil, NewJSConsumerInvalidPolicyError(notSet("by start sequence", "start sequence")) + return NewJSConsumerInvalidPolicyError(notSet("by start sequence", "start sequence")) } if config.OptStartTime != nil { - return nil, NewJSConsumerInvalidPolicyError(badStart("by start sequence", "time")) + return NewJSConsumerInvalidPolicyError(badStart("by start sequence", "time")) } case DeliverByStartTime: if config.OptStartTime == nil { - return nil, NewJSConsumerInvalidPolicyError(notSet("by start time", "start time")) + return NewJSConsumerInvalidPolicyError(notSet("by start time", "start time")) } if config.OptStartSeq != 0 { - return nil, NewJSConsumerInvalidPolicyError(badStart("by start time", "start sequence")) + return NewJSConsumerInvalidPolicyError(badStart("by start time", "start sequence")) } } - sampleFreq := 0 if config.SampleFrequency != _EMPTY_ { s := strings.TrimSuffix(config.SampleFrequency, "%") - sampleFreq, err = strconv.Atoi(s) - if err != nil { - return nil, NewJSConsumerInvalidSamplingError(err) + if sampleFreq, err := strconv.Atoi(s); err != nil || sampleFreq < 0 { + return NewJSConsumerInvalidSamplingError(err) } } + // We reject if flow control is set without heartbeats. 
+ if config.FlowControl && config.Heartbeat == 0 { + return NewJSConsumerWithFlowControlNeedsHeartbeatsError() + } + + return nil +} + +func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname string, ca *consumerAssignment, isRecovering bool) (*consumer, error) { + mset.mu.RLock() + s, jsa, tierName, cfg, acc := mset.srv, mset.jsa, mset.tier, mset.cfg, mset.acc + retention := cfg.Retention + mset.mu.RUnlock() + + // If we do not have the consumer currently assigned to us in cluster mode we will proceed but warn. + // This can happen on startup with restored state where on meta replay we still do not have + // the assignment. Running in single server mode this always returns true. + if oname != _EMPTY_ && !jsa.consumerAssigned(mset.name(), oname) { + s.Debugf("Consumer %q > %q does not seem to be assigned to this server", mset.name(), oname) + } + + if config == nil { + return nil, NewJSConsumerConfigRequiredError() + } + + jsa.usageMu.RLock() + selectedLimits, limitsFound := jsa.limits[tierName] + jsa.usageMu.RUnlock() + if !limitsFound { + return nil, NewJSNoLimitsError() + } + + srvLim := &s.getOpts().JetStreamLimits + // Make sure we have sane defaults. + setConsumerConfigDefaults(config, srvLim, &selectedLimits) + + if err := checkConsumerCfg(config, srvLim, &cfg, acc, &selectedLimits, isRecovering); err != nil { + return nil, err + } + + sampleFreq := 0 + if config.SampleFrequency != _EMPTY_ { + // Can't fail as checkConsumerCfg checks correct format + sampleFreq, _ = strconv.Atoi(strings.TrimSuffix(config.SampleFrequency, "%")) + } + // Grab the client, account and server reference. c := mset.client if c == nil { return nil, NewJSStreamInvalidError() } + var accName string c.mu.Lock() s, a := c.srv, c.acc + if a != nil { + accName = a.Name + } c.mu.Unlock() // Hold mset lock here. @@ -512,8 +589,8 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri // than stream config we prefer the account limits to handle cases where account limits are // updated during the lifecycle of the stream maxc := mset.cfg.MaxConsumers - if maxc <= 0 || (mset.jsa.limits.MaxConsumers > 0 && mset.jsa.limits.MaxConsumers < maxc) { - maxc = mset.jsa.limits.MaxConsumers + if maxc <= 0 || (selectedLimits.MaxConsumers > 0 && selectedLimits.MaxConsumers < maxc) { + maxc = selectedLimits.MaxConsumers } if maxc > 0 && mset.numPublicConsumers() >= maxc { mset.mu.Unlock() @@ -546,22 +623,24 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri // Set name, which will be durable name if set, otherwise we create one at random. 
o := &consumer{ - mset: mset, - js: s.getJetStream(), - acc: a, - srv: s, - client: s.createInternalJetStreamClient(), - sysc: s.createInternalJetStreamClient(), - cfg: *config, - dsubj: config.DeliverSubject, - outq: mset.outq, - active: true, - qch: make(chan struct{}), - mch: make(chan struct{}, 1), - sfreq: int32(sampleFreq), - maxdc: uint64(config.MaxDeliver), - maxp: config.MaxAckPending, - created: time.Now().UTC(), + mset: mset, + js: s.getJetStream(), + acc: a, + srv: s, + client: s.createInternalJetStreamClient(), + sysc: s.createInternalJetStreamClient(), + cfg: *config, + dsubj: config.DeliverSubject, + outq: mset.outq, + active: true, + qch: make(chan struct{}), + uch: make(chan struct{}, 1), + mch: make(chan struct{}, 1), + sfreq: int32(sampleFreq), + maxdc: uint64(config.MaxDeliver), + maxp: config.MaxAckPending, + retention: retention, + created: time.Now().UTC(), } // Bind internal client to the user account. @@ -586,6 +665,9 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri } } } + // Create ackMsgs queue now that we have a consumer name + o.ackMsgs = s.newIPQueue(fmt.Sprintf("[ACC:%s] consumer '%s' on stream '%s' ackMsgs", accName, o.name, mset.cfg.Name)) + // Create our request waiting queue. if o.isPullMode() { o.waiting = newWaitQueue(config.MaxWaiting) @@ -599,6 +681,7 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri // already under lock, mset.Name() would deadlock o.stream = mset.cfg.Name o.ackEventT = JSMetricConsumerAckPre + "." + o.stream + "." + o.name + o.nakEventT = JSAdvisoryConsumerMsgNakPre + "." + o.stream + "." + o.name o.deliveryExcEventT = JSAdvisoryConsumerMaxDeliveryExceedPre + "." + o.stream + "." + o.name if !isValidName(o.name) { @@ -607,9 +690,7 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri return nil, NewJSConsumerBadDurableNameError() } - // Select starting sequence number - o.selectStartingSeqNo() - + // Setup our storage if not a direct consumer. if !config.Direct { store, err := mset.store.ConsumerStore(o.name, config) if err != nil { @@ -620,6 +701,14 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri o.store = store } + if o.store != nil && o.store.HasState() { + // Restore our saved state. + o.readStoredState(0) + } else { + // Select starting sequence number + o.selectStartingSeqNo() + } + // Now register with mset and create the ack subscription. // Check if we already have this one registered. if eo, ok := mset.consumers[o.name]; ok { @@ -721,13 +810,27 @@ func (o *consumer) consumerAssignment() *consumerAssignment { func (o *consumer) setConsumerAssignment(ca *consumerAssignment) { o.mu.Lock() defer o.mu.Unlock() + o.ca = ca + if ca == nil { + return + } // Set our node. - if ca != nil { - o.node = ca.Group.node + o.node = ca.Group.node + + // Trigger update chan. + select { + case o.uch <- struct{}{}: + default: } } +func (o *consumer) updateC() <-chan struct{} { + o.mu.RLock() + defer o.mu.RUnlock() + return o.uch +} + // checkQueueInterest will check on our interest's queue group status. // Lock should be held. func (o *consumer) checkQueueInterest() { @@ -780,12 +883,27 @@ func (o *consumer) setLeader(isLeader bool) { } mset.mu.RLock() - s, jsa, stream := mset.srv, mset.jsa, mset.cfg.Name + s, jsa, stream, lseq := mset.srv, mset.jsa, mset.cfg.Name, mset.lseq mset.mu.RUnlock() o.mu.Lock() + o.rdq, o.rdqi = nil, nil + // Restore our saved state. 
During non-leader status we just update our underlying store. - o.readStoredState() + o.readStoredState(lseq) + + // Setup initial num pending. + o.streamNumPending() + + // Cleanup lss when we take over in clustered mode. + if o.hasSkipListPending() && o.sseq >= o.lss.resume { + o.lss = nil + } + + // Update the group on the our starting sequence if we are starting but we skipped some in the stream. + if o.dseq == 1 && o.sseq > 1 { + o.updateSkipped() + } // Do info sub. if o.infoSub == nil && jsa != nil { @@ -795,7 +913,7 @@ func (o *consumer) setLeader(isLeader bool) { } var err error - if o.ackSub, err = o.subscribeInternal(o.ackSubj, o.processAck); err != nil { + if o.ackSub, err = o.subscribeInternal(o.ackSubj, o.pushAck); err != nil { o.mu.Unlock() o.deleteWithoutAdvisory() return @@ -820,9 +938,6 @@ func (o *consumer) setLeader(isLeader bool) { } } - // Setup initial pending and proper start sequence. - o.setInitialPendingAndStart() - // If push mode, register for notifications on interest. if o.isPushMode() { o.inch = make(chan bool, 8) @@ -866,6 +981,9 @@ func (o *consumer) setLeader(isLeader bool) { // Now start up Go routine to deliver msgs. go o.loopAndGatherMsgs(qch) + // Now start up Go routine to process acks. + go o.processInboundAcks(qch) + // If we are R>1 spin up our proposal loop. if node != nil { // Determine if we can send pending requests info to the group. @@ -878,6 +996,14 @@ func (o *consumer) setLeader(isLeader bool) { } else { // Shutdown the go routines and the subscriptions. o.mu.Lock() + if o.qch != nil { + close(o.qch) + o.qch = nil + } + // Make sure to clear out any re delivery queues + stopAndClearTimer(&o.ptmr) + o.rdq, o.rdqi = nil, nil + o.pending = nil // ok if they are nil, we protect inside unsubscribe() o.unsubscribe(o.ackSub) o.unsubscribe(o.reqSub) @@ -887,13 +1013,14 @@ func (o *consumer) setLeader(isLeader bool) { o.srv.sysUnsubscribe(o.infoSub) o.infoSub = nil } - if o.qch != nil { - close(o.qch) - o.qch = nil - } // Reset waiting if we are in pull mode. if o.isPullMode() { o.waiting = newWaitQueue(o.cfg.MaxWaiting) + if !o.isDurable() { + stopAndClearTimer(&o.dtmr) + } + } else if o.srv.gateway.enabled { + stopAndClearTimer(&o.gwdtmr) } o.mu.Unlock() } @@ -1335,6 +1462,13 @@ func (o *consumer) updateConfig(cfg *ConsumerConfig) error { // We need both locks here so do in Go routine. go o.setRateLimitNeedsLocks() } + if cfg.SampleFrequency != o.cfg.SampleFrequency { + s := strings.TrimSuffix(cfg.SampleFrequency, "%") + // String has been already verified for validity up in the stack, so no + // need to check for error here. + sampleFreq, _ := strconv.Atoi(s) + o.sfreq = int32(sampleFreq) + } // Record new config for others that do not need special handling. // Allowed but considered no-op, [Description, MaxDeliver, SampleFrequency, MaxWaiting, HeadersOnly] @@ -1384,9 +1518,55 @@ func (o *consumer) sendAckReply(subj string) { o.sendAdvisory(subj, nil) } -// Process a message for the ack reply subject delivered with a message. 
-func (o *consumer) processAck(_ *subscription, c *client, acc *Account, subject, reply string, rmsg []byte) { - _, msg := c.msgParts(rmsg) +type jsAckMsg struct { + subject string + reply string + hdr int + msg []byte +} + +var jsAckMsgPool sync.Pool + +func newJSAckMsg(subj, reply string, hdr int, msg []byte) *jsAckMsg { + var m *jsAckMsg + am := jsAckMsgPool.Get() + if am != nil { + m = am.(*jsAckMsg) + } else { + m = &jsAckMsg{} + } + // When getting something from a pool it is criticical that all fields are + // initialized. Doing this way guarantees that if someone adds a field to + // the structure, the compiler will fail the build if this line is not updated. + (*m) = jsAckMsg{subj, reply, hdr, msg} + return m +} + +func (am *jsAckMsg) returnToPool() { + if am == nil { + return + } + am.subject, am.reply, am.hdr, am.msg = _EMPTY_, _EMPTY_, -1, nil + jsAckMsgPool.Put(am) +} + +// Push the ack message to the consumer's ackMsgs queue +func (o *consumer) pushAck(_ *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { + atomic.AddInt64(&o.awl, 1) + o.ackMsgs.push(newJSAckMsg(subject, reply, c.pa.hdr, copyBytes(rmsg))) +} + +// Processes a message for the ack reply subject delivered with a message. +func (o *consumer) processAck(subject, reply string, hdr int, rmsg []byte) { + defer atomic.AddInt64(&o.awl, -1) + + var msg []byte + if hdr > 0 { + msg = rmsg[hdr:] + } else { + msg = rmsg + } + sseq, dseq, dc := ackReplyInfo(subject) skipAckReply := sseq == 0 @@ -1396,16 +1576,7 @@ func (o *consumer) processAck(_ *subscription, c *client, acc *Account, subject, o.processAckMsg(sseq, dseq, dc, true) case bytes.HasPrefix(msg, AckNext): o.processAckMsg(sseq, dseq, dc, true) - // processNextMsgReq can be invoked from an internal subscription or from here. - // Therefore, it has to call msgParts(), so we can't simply pass msg[len(AckNext):] - // with current c.pa.hdr because it would cause a panic. We will save the current - // c.pa.hdr value and disable headers before calling processNextMsgReq and then - // restore so that we don't mess with the calling stack in case it is used - // somewhere else. - phdr := c.pa.hdr - c.pa.hdr = -1 - o.processNextMsgReq(nil, c, acc, subject, reply, msg[len(AckNext):]) - c.pa.hdr = phdr + o.processNextMsgRequest(reply, msg[len(AckNext):]) skipAckReply = true case bytes.HasPrefix(msg, AckNak): o.processNak(sseq, dseq, dc, msg) @@ -1469,7 +1640,7 @@ func (o *consumer) loopAndForwardProposals(qch chan struct{}) { sz += len(proposal.data) if sz > maxBatch { node.ProposeDirect(entries) - // We need to re-craete `entries` because there is a reference + // We need to re-create `entries` because there is a reference // to it in the node's pae map. sz, entries = 0, nil } @@ -1644,6 +1815,29 @@ func (o *consumer) processNak(sseq, dseq, dc uint64, nak []byte) { return } } + + // Deliver an advisory + e := JSConsumerDeliveryNakAdvisory{ + TypedEvent: TypedEvent{ + Type: JSConsumerDeliveryNakAdvisoryType, + ID: nuid.Next(), + Time: time.Now().UTC(), + }, + Stream: o.stream, + Consumer: o.name, + ConsumerSeq: dseq, + StreamSeq: sseq, + Deliveries: dc, + Domain: o.srv.getOpts().JetStreamDomain, + } + + j, err := json.Marshal(e) + if err != nil { + return + } + + o.sendAdvisory(o.nakEventT, j) + // Check to see if we have delays attached. 
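processNak above now publishes a delivery NAK advisory, and the hunk that follows honors a delay attached to the NAK payload. A hedged client-side sketch using the nats.go NakWithDelay helper; the stream, durable and subject names are hypothetical:

package main

import (
	"log"
	"time"

	"github.com/nats-io/nats.go"
)

func main() {
	nc, err := nats.Connect(nats.DefaultURL)
	if err != nil {
		log.Fatal(err)
	}
	defer nc.Close()

	js, err := nc.JetStream()
	if err != nil {
		log.Fatal(err)
	}

	// Hypothetical durable pull consumer on a hypothetical subject.
	sub, err := js.PullSubscribe("ORDERS.created", "orders-worker")
	if err != nil {
		log.Fatal(err)
	}

	msgs, err := sub.Fetch(1)
	if err != nil {
		log.Fatal(err)
	}
	for _, m := range msgs {
		// Request redelivery after a delay; the server also emits the new
		// delivery NAK advisory event for observability.
		if err := m.NakWithDelay(30 * time.Second); err != nil {
			log.Println(err)
		}
	}
}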
if len(nak) > len(AckNak) { arg := bytes.TrimSpace(nak[len(AckNak):]) @@ -1735,10 +1929,10 @@ func (o *consumer) ackWait(next time.Duration) time.Duration { } // Due to bug in calculation of sequences on restoring redelivered let's do quick sanity check. -func (o *consumer) checkRedelivered() { +func (o *consumer) checkRedelivered(slseq uint64) { var lseq uint64 if mset := o.mset; mset != nil { - lseq = mset.lastSeq() + lseq = slseq } var shouldUpdateState bool for sseq := range o.rdc { @@ -1755,15 +1949,15 @@ func (o *consumer) checkRedelivered() { // This will restore the state from disk. // Lock should be held. -func (o *consumer) readStoredState() error { +func (o *consumer) readStoredState(slseq uint64) error { if o.store == nil { return nil } state, err := o.store.State() - if err == nil && state != nil && state.Delivered.Consumer != 0 { + if err == nil { o.applyState(state) if len(o.rdc) > 0 { - o.checkRedelivered() + o.checkRedelivered(slseq) } } return err @@ -1775,8 +1969,12 @@ func (o *consumer) applyState(state *ConsumerState) { return } + // If o.sseq is greater don't update. Don't go backwards on o.sseq. + if o.sseq <= state.Delivered.Stream { + o.sseq = state.Delivered.Stream + 1 + } o.dseq = state.Delivered.Consumer + 1 - o.sseq = state.Delivered.Stream + 1 + o.adflr = state.AckFloor.Consumer o.asflr = state.AckFloor.Stream o.pending = state.Pending @@ -1795,16 +1993,6 @@ func (o *consumer) applyState(state *ConsumerState) { } } -func (o *consumer) readStoreState() *ConsumerState { - o.mu.RLock() - defer o.mu.RUnlock() - if o.store == nil { - return nil - } - state, _ := o.store.State() - return state -} - // Sets our store state from another source. Used in clustered mode on snapshot restore. func (o *consumer) setStoreState(state *ConsumerState) error { if state == nil || o.store == nil { @@ -1827,7 +2015,6 @@ func (o *consumer) writeStoreStateUnlocked() error { if o.store == nil { return nil } - state := ConsumerState{ Delivered: SequencePair{ Consumer: o.dseq - 1, @@ -1904,7 +2091,7 @@ func (o *consumer) infoWithSnap(snap bool) *ConsumerInfo { }, NumAckPending: len(o.pending), NumRedelivered: len(o.rdc), - NumPending: o.adjustedPending(), + NumPending: o.streamNumPending(), PushBound: o.isPushMode() && o.active, Cluster: ci, } @@ -1920,7 +2107,7 @@ func (o *consumer) infoWithSnap(snap bool) *ConsumerInfo { // If we are a pull mode consumer, report on number of waiting requests. if o.isPullMode() { - o.processWaiting() + o.processWaiting(false) info.NumWaiting = o.waiting.len() } // If we were asked to snapshot do so here. @@ -2062,7 +2249,7 @@ func (o *consumer) processAckMsg(sseq, dseq, dc uint64, doSample bool) { } } - // If we had max ack pending set and were at limit we need to unblock folks. + // If we had max ack pending set and were at limit we need to unblock ourselves. if needSignal { o.signalNewMessages() } @@ -2097,8 +2284,9 @@ func (o *consumer) needAck(sseq uint64) bool { // Check first if we are filtered, and if so check if this is even applicable to us. if o.isFiltered() && o.mset != nil { - subj, _, _, _, err := o.mset.store.LoadMsg(sseq) - if err != nil || !o.isFilteredMatch(subj) { + var svp StoreMsg + sm, err := o.mset.store.LoadMsg(sseq, &svp) + if err != nil || !o.isFilteredMatch(sm.subj) { o.mu.RUnlock() return false } @@ -2143,36 +2331,36 @@ func (o *consumer) needAck(sseq uint64) bool { } // Helper for the next message requests. 
-func nextReqFromMsg(msg []byte) (time.Time, int, bool, time.Duration, time.Time, error) { +func nextReqFromMsg(msg []byte) (time.Time, int, int, bool, time.Duration, time.Time, error) { req := bytes.TrimSpace(msg) switch { case len(req) == 0: - return time.Time{}, 1, false, 0, time.Time{}, nil + return time.Time{}, 1, 0, false, 0, time.Time{}, nil case req[0] == '{': var cr JSApiConsumerGetNextRequest if err := json.Unmarshal(req, &cr); err != nil { - return time.Time{}, -1, false, 0, time.Time{}, err + return time.Time{}, -1, 0, false, 0, time.Time{}, err } var hbt time.Time if cr.Heartbeat > 0 { if cr.Heartbeat*2 > cr.Expires { - return time.Time{}, 1, false, 0, time.Time{}, errors.New("heartbeat value too large") + return time.Time{}, 1, 0, false, 0, time.Time{}, errors.New("heartbeat value too large") } hbt = time.Now().Add(cr.Heartbeat) } if cr.Expires == time.Duration(0) { - return time.Time{}, cr.Batch, cr.NoWait, cr.Heartbeat, hbt, nil + return time.Time{}, cr.Batch, cr.MaxBytes, cr.NoWait, cr.Heartbeat, hbt, nil } - return time.Now().Add(cr.Expires), cr.Batch, cr.NoWait, cr.Heartbeat, hbt, nil + return time.Now().Add(cr.Expires), cr.Batch, cr.MaxBytes, cr.NoWait, cr.Heartbeat, hbt, nil default: if n, err := strconv.Atoi(string(req)); err == nil { - return time.Time{}, n, false, 0, time.Time{}, nil + return time.Time{}, n, 0, false, 0, time.Time{}, nil } } - return time.Time{}, 1, false, 0, time.Time{}, nil + return time.Time{}, 1, 0, false, 0, time.Time{}, nil } // Represents a request that is on the internal waiting queue @@ -2182,6 +2370,7 @@ type waitingRequest struct { reply string n int // For batching d int + b int // For max bytes tracking. expires time.Time received time.Time hb time.Duration @@ -2215,9 +2404,9 @@ func (wr *waitingRequest) recycle() { // waiting queue for requests that are waiting for new messages to arrive. type waitQueue struct { - rp, wp int - last time.Time - reqs []*waitingRequest + rp, wp, n int + last time.Time + reqs []*waitingRequest } // Create a new ring buffer with at most max items. @@ -2248,11 +2437,12 @@ func (wq *waitQueue) add(wr *waitingRequest) error { } // Track last active via when we receive a request. wq.last = wr.received + wq.n++ return nil } func (wq *waitQueue) isFull() bool { - return wq.rp == wq.wp + return wq.n == cap(wq.reqs) } func (wq *waitQueue) isEmpty() bool { @@ -2260,13 +2450,10 @@ func (wq *waitQueue) isEmpty() bool { } func (wq *waitQueue) len() int { - if wq == nil || wq.rp < 0 { + if wq == nil { return 0 } - if wq.rp < wq.wp { - return wq.wp - wq.rp - } - return cap(wq.reqs) - wq.rp + wq.wp + return wq.n } // Peek will return the next request waiting or nil if empty. @@ -2301,8 +2488,9 @@ func (wq *waitQueue) removeCurrent() { } wq.reqs[wq.rp] = nil wq.rp = (wq.rp + 1) % cap(wq.reqs) + wq.n-- // Check if we are empty. - if wq.rp == wq.wp { + if wq.n == 0 { wq.rp, wq.wp = -1, 0 } } @@ -2313,14 +2501,15 @@ func (wq *waitQueue) compact() { return } nreqs, i := make([]*waitingRequest, cap(wq.reqs)), 0 - for rp := wq.rp; rp != wq.wp; rp = (rp + 1) % cap(wq.reqs) { + for j, rp := 0, wq.rp; j < wq.n; j++ { if wr := wq.reqs[rp]; wr != nil { nreqs[i] = wr i++ } + rp = (rp + 1) % cap(wq.reqs) } // Reset here. - wq.rp, wq.wp, wq.reqs = 0, i, nreqs + wq.rp, wq.wp, wq.n, wq.reqs = 0, i, i, nreqs } // Return the replies for our pending requests. 
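nextReqFromMsg above now also returns the MaxBytes value from a JSApiConsumerGetNextRequest, so a pull request can be bounded by bytes as well as by message count. A hedged sketch of such a request payload, marshalled from the server's exported type; the batch size, byte budget and expiry are hypothetical:

package main

import (
	"encoding/json"
	"fmt"
	"time"

	"github.com/nats-io/nats-server/v2/server"
)

func main() {
	// Up to 64 messages or 1 MiB, whichever is reached first, waiting at most
	// five seconds for messages to arrive.
	req := server.JSApiConsumerGetNextRequest{
		Batch:    64,
		MaxBytes: 1 << 20,
		Expires:  5 * time.Second,
	}
	body, _ := json.Marshal(req)

	// This payload would be published to the consumer's next-message API
	// subject, e.g. $JS.API.CONSUMER.MSG.NEXT.<stream>.<consumer>.
	fmt.Println(string(body))
}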
@@ -2332,10 +2521,11 @@ func (o *consumer) pendingRequestReplies() []string { return nil } wq, m := o.waiting, make(map[string]struct{}) - for rp := o.waiting.rp; o.waiting.rp >= 0 && rp != wq.wp; rp = (rp + 1) % cap(wq.reqs) { + for i, rp := 0, o.waiting.rp; i < wq.n; i++ { if wr := wq.reqs[rp]; wr != nil { m[wr.reply] = struct{}{} } + rp = (rp + 1) % cap(wq.reqs) } var replies []string for reply := range m { @@ -2347,23 +2537,52 @@ func (o *consumer) pendingRequestReplies() []string { // Return next waiting request. This will check for expirations but not noWait or interest. // That will be handled by processWaiting. // Lock should be held. -func (o *consumer) nextWaiting() *waitingRequest { +func (o *consumer) nextWaiting(sz int) *waitingRequest { if o.waiting == nil || o.waiting.isEmpty() { return nil } for wr := o.waiting.peek(); !o.waiting.isEmpty(); wr = o.waiting.peek() { - if wr == nil || wr.expires.IsZero() || time.Now().Before(wr.expires) { + if wr == nil { + break + } + // Check if we have max bytes set. + if wr.b > 0 { + if sz <= wr.b { + wr.b -= sz + // If we are right now at zero, set batch to 1 to deliver this one but stop after. + if wr.b == 0 { + wr.n = 1 + } + } else { + // If we have not delivered anything to the requestor let them know. + if wr.d == 0 { + hdr := []byte("NATS/1.0 408 Message Size Exceeds MaxBytes\r\n\r\n") + o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) + } + // Remove the current one, no longer valid due to max bytes limit. + o.waiting.removeCurrent() + if o.node != nil { + o.removeClusterPendingRequest(wr.reply) + } + wr.recycle() + continue + } + } + + if wr.expires.IsZero() || time.Now().Before(wr.expires) { rr := wr.acc.sl.Match(wr.interest) if len(rr.psubs)+len(rr.qsubs) > 0 { return o.waiting.pop() - } else if o.srv.gateway.enabled { - if o.srv.hasGatewayInterest(wr.acc.Name, wr.interest) || time.Since(wr.received) < defaultGatewayRecentSubExpiration { - return o.waiting.pop() - } + } else if time.Since(wr.received) < defaultGatewayRecentSubExpiration && (o.srv.leafNodeEnabled || o.srv.gateway.enabled) { + return o.waiting.pop() + } else if o.srv.gateway.enabled && o.srv.hasGatewayInterest(wr.acc.Name, wr.interest) { + return o.waiting.pop() } } - hdr := []byte("NATS/1.0 408 Request Timeout\r\n\r\n") - o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) + if wr.interest != wr.reply { + hdr := []byte("NATS/1.0 408 Interest Expired\r\n\r\n") + o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) + } // Remove the current one, no longer valid. o.waiting.removeCurrent() if o.node != nil { @@ -2382,7 +2601,10 @@ func (o *consumer) processNextMsgReq(_ *subscription, c *client, _ *Account, _, return } _, msg = c.msgParts(msg) + o.processNextMsgRequest(reply, msg) +} +func (o *consumer) processNextMsgRequest(reply string, msg []byte) { o.mu.Lock() defer o.mu.Unlock() @@ -2402,7 +2624,7 @@ func (o *consumer) processNextMsgReq(_ *subscription, c *client, _ *Account, _, } // Check payload here to see if they sent in batch size or a formal request. 
- expires, batchSize, noWait, hb, hbt, err := nextReqFromMsg(msg) + expires, batchSize, maxBytes, noWait, hb, hbt, err := nextReqFromMsg(msg) if err != nil { sendErr(400, fmt.Sprintf("Bad Request - %v", err)) return @@ -2419,18 +2641,20 @@ func (o *consumer) processNextMsgReq(_ *subscription, c *client, _ *Account, _, return } + if maxBytes > 0 && o.cfg.MaxRequestMaxBytes > 0 && maxBytes > o.cfg.MaxRequestMaxBytes { + sendErr(409, fmt.Sprintf("Exceeded MaxRequestMaxBytes of %v", o.cfg.MaxRequestMaxBytes)) + return + } + // If we have the max number of requests already pending try to expire. if o.waiting.isFull() { // Try to expire some of the requests. - if expired, _, _, _ := o.processWaiting(); expired == 0 { - // Force expiration if needed. - o.forceExpireFirstWaiting() - } + o.processWaiting(false) } // If the request is for noWait and we have pending requests already, check if we have room. if noWait { - msgsPending := o.adjustedPending() + uint64(len(o.rdq)) + msgsPending := o.numPending() + uint64(len(o.rdq)) // If no pending at all, decide what to do with request. // If no expires was set then fail. if msgsPending == 0 && expires.IsZero() { @@ -2438,7 +2662,7 @@ func (o *consumer) processNextMsgReq(_ *subscription, c *client, _ *Account, _, return } if msgsPending > 0 { - _, _, batchPending, _ := o.processWaiting() + _, _, batchPending, _ := o.processWaiting(false) if msgsPending < uint64(batchPending) { sendErr(408, "Requests Pending") return @@ -2449,18 +2673,12 @@ func (o *consumer) processNextMsgReq(_ *subscription, c *client, _ *Account, _, } // If we receive this request though an account export, we need to track that interest subject and account. - acc, interest := o.acc, reply - for strings.HasPrefix(interest, replyPrefix) && acc.exports.responses != nil { - if si := acc.exports.responses[interest]; si != nil { - acc, interest = si.acc, si.to - } else { - break - } - } + acc, interest := trackDownAccountAndInterest(o.acc, reply) - // In case we have to queue up this request. + // Create a waiting request. wr := wrPool.Get().(*waitingRequest) wr.acc, wr.interest, wr.reply, wr.n, wr.d, wr.noWait, wr.expires, wr.hb, wr.hbt = acc, interest, reply, batchSize, 0, noWait, expires, hb, hbt + wr.b = maxBytes wr.received = time.Now() if err := o.waiting.add(wr); err != nil { @@ -2474,6 +2692,25 @@ func (o *consumer) processNextMsgReq(_ *subscription, c *client, _ *Account, _, } } +func trackDownAccountAndInterest(acc *Account, interest string) (*Account, string) { + for strings.HasPrefix(interest, replyPrefix) { + oa := acc + oa.mu.RLock() + if oa.exports.responses == nil { + oa.mu.RUnlock() + break + } + si := oa.exports.responses[interest] + if si == nil { + oa.mu.RUnlock() + break + } + acc, interest = si.acc, si.to + oa.mu.RUnlock() + } + return acc, interest +} + // Increase the delivery count for this message. // ONLY used on redelivery semantics. // Lock should be held. @@ -2532,9 +2769,9 @@ var ( // Get next available message from underlying store. // Is partition aware and redeliver aware. // Lock should be held. 
-func (o *consumer) getNextMsg() (subj string, hdr, msg []byte, sseq uint64, dc uint64, ts int64, err error) { +func (o *consumer) getNextMsg() (*jsPubMsg, uint64, error) { if o.mset == nil || o.mset.store == nil { - return _EMPTY_, nil, nil, 0, 0, 0, errBadConsumer + return nil, 0, errBadConsumer } seq, dc := o.sseq, uint64(1) if o.hasSkipListPending() { @@ -2559,8 +2796,13 @@ func (o *consumer) getNextMsg() (subj string, hdr, msg []byte, sseq uint64, dc u continue } if seq > 0 { - subj, hdr, msg, ts, err = o.mset.store.LoadMsg(seq) - return subj, hdr, msg, seq, dc, ts, err + pmsg := getJSPubMsgFromPool() + sm, err := o.mset.store.LoadMsg(seq, &pmsg.StoreMsg) + if sm == nil || err != nil { + pmsg.returnToPool() + pmsg, dc = nil, 0 + } + return pmsg, dc, err } } // Fallback if all redeliveries are gone. @@ -2571,11 +2813,12 @@ func (o *consumer) getNextMsg() (subj string, hdr, msg []byte, sseq uint64, dc u if o.maxp > 0 && len(o.pending) >= o.maxp { // maxp only set when ack policy != AckNone and user set MaxAckPending // Stall if we have hit max pending. - return _EMPTY_, nil, nil, 0, 0, 0, errMaxAckPending + return nil, 0, errMaxAckPending } // Grab next message applicable to us. - subj, sseq, hdr, msg, ts, err = o.mset.store.LoadNextMsg(o.cfg.FilterSubject, o.filterWC, seq) + pmsg := getJSPubMsgFromPool() + sm, sseq, err := o.mset.store.LoadNextMsg(o.cfg.FilterSubject, o.filterWC, seq, &pmsg.StoreMsg) if sseq >= o.sseq { o.sseq = sseq + 1 @@ -2584,33 +2827,17 @@ func (o *consumer) getNextMsg() (subj string, hdr, msg []byte, sseq uint64, dc u } } - return subj, hdr, msg, sseq, dc, ts, err -} - -// forceExpireFirstWaiting will force expire the first waiting. -// Lock should be held. -func (o *consumer) forceExpireFirstWaiting() { - // FIXME(dlc) - Should we do advisory here as well? - wr := o.waiting.peek() - if wr == nil { - return - } - // If we are expiring this and we think there is still interest, alert. - if rr := wr.acc.sl.Match(wr.interest); len(rr.psubs)+len(rr.qsubs) > 0 && o.mset != nil { - // We still appear to have interest, so send alert as courtesy. - hdr := []byte("NATS/1.0 408 Request Canceled\r\n\r\n") - o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) - } - o.waiting.removeCurrent() - if o.node != nil { - o.removeClusterPendingRequest(wr.reply) + if sm == nil { + pmsg.returnToPool() + return nil, 0, err } - wr.recycle() + + return pmsg, dc, err } // Will check for expiration and lack of interest on waiting requests. // Will also do any heartbeats and return the next expiration or HB interval. -func (o *consumer) processWaiting() (int, int, int, time.Time) { +func (o *consumer) processWaiting(eos bool) (int, int, int, time.Time) { var fexp time.Time if o.srv == nil || o.waiting.isEmpty() { return 0, 0, 0, fexp @@ -2636,23 +2863,25 @@ func (o *consumer) processWaiting() (int, int, int, time.Time) { } wq := o.waiting - - for rp := o.waiting.rp; o.waiting.rp >= 0 && rp != wq.wp; rp = (rp + 1) % cap(wq.reqs) { + for i, rp, n := 0, wq.rp, wq.n; i < n; rp = (rp + 1) % cap(wq.reqs) { wr := wq.reqs[rp] // Check expiration. - if (wr.noWait && wr.d > 0) || (!wr.expires.IsZero() && now.After(wr.expires)) { + if (eos && wr.noWait && wr.d > 0) || (!wr.expires.IsZero() && now.After(wr.expires)) { hdr := []byte("NATS/1.0 408 Request Timeout\r\n\r\n") o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) remove(wr, rp) + i++ continue } // Now check interest. 
rr := wr.acc.sl.Match(wr.interest) interest := len(rr.psubs)+len(rr.qsubs) > 0 - if !interest && s.gateway.enabled { - // If we are here check on gateways. + if !interest && (s.leafNodeEnabled || s.gateway.enabled) { + // If we are here check on gateways and leaf nodes (as they can mask gateways on the other end). // If we have interest or the request is too young break and do not expire. - if s.hasGatewayInterest(wr.acc.Name, wr.interest) || time.Since(wr.received) < defaultGatewayRecentSubExpiration { + if time.Since(wr.received) < defaultGatewayRecentSubExpiration { + interest = true + } else if s.gateway.enabled && s.hasGatewayInterest(wr.acc.Name, wr.interest) { interest = true } } @@ -2673,10 +2902,12 @@ func (o *consumer) processWaiting() (int, int, int, time.Time) { if !wr.expires.IsZero() && (fexp.IsZero() || wr.expires.Before(fexp)) { fexp = wr.expires } + i++ continue } // No more interest here so go ahead and remove this one from our list. remove(wr, rp) + i++ } // If we have interior deletes from out of order invalidation, compact the waiting queue. @@ -2684,12 +2915,12 @@ func (o *consumer) processWaiting() (int, int, int, time.Time) { o.waiting.compact() } - return expired, o.waiting.len(), brp, fexp + return expired, wq.len(), brp, fexp } // Will check to make sure those waiting still have registered interest. func (o *consumer) checkWaitingForInterest() bool { - o.processWaiting() + o.processWaiting(true) return o.waiting.len() > 0 } @@ -2701,6 +2932,30 @@ func (o *consumer) hbTimer() (time.Duration, *time.Timer) { return o.cfg.Heartbeat, time.NewTimer(o.cfg.Heartbeat) } +func (o *consumer) processInboundAcks(qch chan struct{}) { + // Grab the server lock to watch for server quit. + o.mu.RLock() + s := o.srv + o.mu.RUnlock() + + for { + select { + case <-o.ackMsgs.ch: + acks := o.ackMsgs.pop() + for _, acki := range acks { + ack := acki.(*jsAckMsg) + o.processAck(ack.subject, ack.reply, ack.hdr, ack.msg) + ack.returnToPool() + } + o.ackMsgs.recycle(&acks) + case <-qch: + return + case <-s.quitCh: + return + } + } +} + func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { // On startup check to see if we are in a a reply situation where replay policy is not instant. var ( @@ -2708,15 +2963,24 @@ func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { lseq uint64 ) + o.mu.RLock() + mset := o.mset + getLSeq := o.replay + o.mu.RUnlock() + // consumer is closed when mset is set to nil. + if mset == nil { + return + } + if getLSeq { + lseq = mset.state().LastSeq + } + o.mu.Lock() s := o.srv - if o.replay { - // consumer is closed when mset is set to nil. - if o.mset == nil { - o.mu.Unlock() - return - } - lseq = o.mset.state().LastSeq + // need to check again if consumer is closed + if o.mset == nil { + o.mu.Unlock() + return } // For idle heartbeat support. var hbc <-chan time.Time @@ -2728,18 +2992,21 @@ func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { inch := o.inch o.mu.Unlock() + // Grab the stream's retention policy + mset.mu.RLock() + rp := mset.cfg.Retention + mset.mu.RUnlock() + + var err error + // Deliver all the msgs we have now, once done or on a condition, we wait for new ones. for { var ( - seq, dc uint64 - subj, dsubj string - hdr []byte - msg []byte - err error - ts int64 - delay time.Duration + pmsg *jsPubMsg + dc uint64 + dsubj string + delay time.Duration ) - o.mu.Lock() // consumer is closed when mset is set to nil. 
if o.mset == nil { @@ -2747,6 +3014,9 @@ func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { return } + // Clear last error. + err = nil + // If we are in push mode and not active or under flowcontrol let's stop sending. if o.isPushMode() { if !o.active || (o.maxpb > 0 && o.pbytes > o.maxpb) { @@ -2757,10 +3027,11 @@ func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { goto waitForMsgs } - subj, hdr, msg, seq, dc, ts, err = o.getNextMsg() + // Grab our next msg. + pmsg, dc, err = o.getNextMsg() // On error either wait or return. - if err != nil { + if err != nil || pmsg == nil { if err == ErrStoreMsgNotFound || err == ErrStoreEOF || err == errMaxAckPending || err == errPartialCache { goto waitForMsgs } else { @@ -2771,7 +3042,7 @@ func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { if o.isPushMode() { dsubj = o.dsubj - } else if wr := o.nextWaiting(); wr != nil { + } else if wr := o.nextWaiting(len(pmsg.hdr) + len(pmsg.msg)); wr != nil { dsubj = wr.reply if done := wr.recycleIfDone(); done && o.node != nil { o.removeClusterPendingRequest(dsubj) @@ -2781,15 +3052,17 @@ func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { } else { // We will redo this one. o.sseq-- + pmsg.returnToPool() goto waitForMsgs } // If we are in a replay scenario and have not caught up check if we need to delay here. if o.replay && lts > 0 { - if delay = time.Duration(ts - lts); delay > time.Millisecond { + if delay = time.Duration(pmsg.ts - lts); delay > time.Millisecond { o.mu.Unlock() select { case <-qch: + pmsg.returnToPool() return case <-time.After(delay): } @@ -2798,17 +3071,18 @@ func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { } // Track this regardless. - lts = ts + lts = pmsg.ts // If we have a rate limit set make sure we check that here. if o.rlimit != nil { - now := time.Now() - r := o.rlimit.ReserveN(now, len(msg)+len(hdr)+len(subj)+len(dsubj)+len(o.ackReplyT)) + now, sm := time.Now(), &pmsg.StoreMsg + r := o.rlimit.ReserveN(now, len(sm.msg)+len(sm.hdr)+len(sm.subj)+len(dsubj)+len(o.ackReplyT)) delay := r.DelayFrom(now) if delay > 0 { o.mu.Unlock() select { case <-qch: + pmsg.returnToPool() return case <-time.After(delay): } @@ -2817,7 +3091,7 @@ func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { } // Do actual delivery. - o.deliverMsg(dsubj, subj, hdr, msg, seq, dc, ts) + o.deliverMsg(dsubj, pmsg, dc, rp) // Reset our idle heartbeat timer if set. if hb != nil { @@ -2836,7 +3110,8 @@ func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { // Make sure to process any expired requests that are pending. var wrExp <-chan time.Time if o.isPullMode() { - _, _, _, fexp := o.processWaiting() + // Dont expire oneshots if we are here because of max ack pending limit. + _, _, _, fexp := o.processWaiting(err != errMaxAckPending) if !fexp.IsZero() { expires := time.Until(fexp) if expires <= 0 { @@ -2851,17 +3126,17 @@ func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { o.mu.Unlock() select { + case <-mch: + // Messages are waiting. case interest := <-inch: // inch can be nil on pull-based, but then this will // just block and not fire. o.updateDeliveryInterest(interest) case <-qch: return - case <-mch: - // Messages are waiting. case <-wrExp: o.mu.Lock() - o.processWaiting() + o.processWaiting(true) o.mu.Unlock() case <-hbc: if o.isActive() { @@ -2901,27 +3176,52 @@ func (o *consumer) setMaxPendingBytes(limit int) { } } -// We have the case where a consumer can become greedy and pick up a messages before the stream has incremented our pending(sgap). 
-// Instead of trying to slow things down and synchronize we will allow this to wrap and go negative (biggest uint64) for a short time. -// This functions checks for that and returns 0. // Lock should be held. -func (o *consumer) adjustedPending() uint64 { - if o.sgap&(1<<63) != 0 { +func (o *consumer) numPending() uint64 { + if o.npcm == 0 { + o.streamNumPending() + } + // This can wrap based on possibly having a dec before the inc. Account for that here. + if o.npc&(1<<63) != 0 { return 0 } - return o.sgap + return o.npc +} + +// Will force a set from the stream store of num pending. +// Depends on delivery policy, for last per subject we calculate differently. +// Lock should be held. +func (o *consumer) streamNumPending() uint64 { + if o.mset == nil || o.mset.store == nil { + o.npc, o.npcm = 0, 0 + } else if o.cfg.DeliverPolicy == DeliverLastPerSubject { + o.npc, o.npcm = 0, 0 + for _, ss := range o.mset.store.SubjectsState(o.cfg.FilterSubject) { + if o.sseq <= ss.Last { + o.npc++ + if ss.Last > o.npcm { + o.npcm = ss.Last + } + } + } + } else { + ss := o.mset.store.FilteredState(o.sseq, o.cfg.FilterSubject) + o.npc, o.npcm = ss.Msgs, ss.Last + } + return o.npc } // Deliver a msg to the consumer. // Lock should be held and o.mset validated to be non-nil. -func (o *consumer) deliverMsg(dsubj, subj string, hdr, msg []byte, seq, dc uint64, ts int64) { +func (o *consumer) deliverMsg(dsubj string, pmsg *jsPubMsg, dc uint64, rp RetentionPolicy) { if o.mset == nil { + pmsg.returnToPool() return } - // Update pending on first attempt. This can go upside down for a short bit, that is ok. - // See adjustedPending(). - if dc == 1 { - o.sgap-- + + // Update our cached num pending. + if dc == 1 && o.npcm > 0 { + o.npc-- } dseq := o.dseq @@ -2930,6 +3230,7 @@ func (o *consumer) deliverMsg(dsubj, subj string, hdr, msg []byte, seq, dc uint6 // If headers only do not send msg payload. // Add in msg size itself as header. if o.cfg.HeadersOnly { + hdr, msg := pmsg.hdr, pmsg.msg var bb bytes.Buffer if len(hdr) == 0 { bb.WriteString(hdrLine) @@ -2942,12 +3243,17 @@ func (o *consumer) deliverMsg(dsubj, subj string, hdr, msg []byte, seq, dc uint6 bb.WriteString(strconv.FormatInt(int64(len(msg)), 10)) bb.WriteString(CR_LF) bb.WriteString(CR_LF) - hdr = bb.Bytes() + // Replace underlying buf which we can use directly when we send. + // TODO(dlc) - Probably just use directly when forming bytes.Buffer? + pmsg.buf = pmsg.buf[:0] + pmsg.buf = append(pmsg.buf, bb.Bytes()...) + // Replace with new header. + pmsg.hdr = pmsg.buf // Cancel msg payload - msg = nil + pmsg.msg = nil } - pmsg := newJSPubMsg(dsubj, subj, o.ackReply(seq, dseq, dc, ts, o.adjustedPending()), hdr, msg, o, seq) + pmsg.dsubj, pmsg.reply, pmsg.o = dsubj, o.ackReply(pmsg.seq, dseq, dc, pmsg.ts, o.numPending()), o psz := pmsg.size() if o.maxpb > 0 { @@ -2957,6 +3263,8 @@ func (o *consumer) deliverMsg(dsubj, subj string, hdr, msg []byte, seq, dc uint6 mset := o.mset ap := o.cfg.AckPolicy + // Cant touch pmsg after this sending so capture what we need. + seq, ts := pmsg.seq, pmsg.ts // Send message. o.outq.send(pmsg) @@ -2976,7 +3284,7 @@ func (o *consumer) deliverMsg(dsubj, subj string, hdr, msg []byte, seq, dc uint6 o.updateDelivered(dseq, seq, dc, ts) // If we are ack none and mset is interest only we should make sure stream removes interest. 
- if ap == AckNone && mset.cfg.Retention != LimitsPolicy { + if ap == AckNone && rp != LimitsPolicy { if o.node == nil || o.cfg.Direct { mset.ackq.push(seq) } else { @@ -3174,13 +3482,19 @@ func (o *consumer) checkPending() { defer o.mu.Unlock() mset := o.mset - if mset == nil { + // On stop, mset and timer will be nil. + if mset == nil || o.ptmr == nil { return } now := time.Now().UnixNano() ttl := int64(o.cfg.AckWait) next := int64(o.ackWait(0)) + // However, if there is backoff, initializes with the largest backoff. + // It will be adjusted as needed. + if l := len(o.cfg.BackOff); l > 0 { + next = int64(o.cfg.BackOff[l-1]) + } var shouldUpdateState bool var state StreamState @@ -3190,7 +3504,12 @@ func (o *consumer) checkPending() { // Since we can update timestamps, we have to review all pending. // We may want to unlock here or warn if list is big. var expired []uint64 + check := len(o.pending) > 1024 for seq, p := range o.pending { + if check && atomic.LoadInt64(&o.awl) > 0 { + o.ptmr.Reset(100 * time.Millisecond) + return + } // Check if these are no longer valid. if seq < fseq { delete(o.pending, seq) @@ -3200,12 +3519,20 @@ func (o *consumer) checkPending() { continue } elapsed, deadline := now-p.Timestamp, ttl - if len(o.cfg.BackOff) > 0 && o.rdc != nil { + if len(o.cfg.BackOff) > 0 { + // This is ok even if o.rdc is nil, we would get dc == 0, which is what we want. dc := int(o.rdc[seq]) - if dc >= len(o.cfg.BackOff) { + // This will be the index for the next backoff, will set to last element if needed. + nbi := dc + 1 + if dc+1 >= len(o.cfg.BackOff) { dc = len(o.cfg.BackOff) - 1 + nbi = dc } deadline = int64(o.cfg.BackOff[dc]) + // Set `next` to the next backoff (if smaller than current `next` value). + if nextBackoff := int64(o.cfg.BackOff[nbi]); nextBackoff < next { + next = nextBackoff + } } if elapsed >= deadline { if !o.onRedeliverQueue(seq) { @@ -3379,6 +3706,7 @@ func (o *consumer) selectStartingSeqNo() { // TODO(dlc) - Once clustered can't rely on this. o.sseq = o.mset.store.GetSeqFromTime(*o.cfg.OptStartTime) } else { + // DeliverNew o.sseq = state.LastSeq + 1 } } else { @@ -3400,6 +3728,11 @@ func (o *consumer) selectStartingSeqNo() { o.adflr = o.dseq - 1 // Set ack store floor to store-1 o.asflr = o.sseq - 1 + + // Set our starting sequence state. + if o.store != nil && o.sseq > 0 { + o.store.SetStarting(o.sseq - 1) + } } // Test whether a config represents a durable subscriber. @@ -3466,14 +3799,14 @@ func (o *consumer) hasNoLocalInterest() bool { // This is when the underlying stream has been purged. // sseq is the new first seq for the stream after purge. // Lock should be held. -func (o *consumer) purge(sseq uint64) { +func (o *consumer) purge(sseq uint64, slseq uint64) { // Do not update our state unless we know we are the leader. if !o.isLeader() { return } // Signals all have been purged for this consumer. 
if sseq == 0 { - sseq = o.mset.lastSeq() + 1 + sseq = slseq + 1 } o.mu.Lock() @@ -3487,7 +3820,6 @@ func (o *consumer) purge(sseq uint64) { o.adflr = o.dseq - 1 } } - o.sgap = 0 o.pending = nil // We need to remove all those being queued for redelivery under o.rdq @@ -3583,6 +3915,7 @@ func (o *consumer) stopWithFlags(dflag, sdflag, doSignal, advisory bool) error { } n := o.node qgroup := o.cfg.DeliverGroup + o.ackMsgs.unregister() o.mu.Unlock() if c != nil { @@ -3608,7 +3941,7 @@ func (o *consumer) stopWithFlags(dflag, sdflag, doSignal, advisory bool) error { stop := mset.lastSeq() o.mu.Lock() if !o.isLeader() { - o.readStoredState() + o.readStoredState(stop) } start := o.asflr o.mu.Unlock() @@ -3655,11 +3988,8 @@ func (o *consumer) stopWithFlags(dflag, sdflag, doSignal, advisory bool) error { // Check that we do not form a cycle by delivering to a delivery subject // that is part of the interest group. -func (mset *stream) deliveryFormsCycle(deliverySubject string) bool { - mset.mu.RLock() - defer mset.mu.RUnlock() - - for _, subject := range mset.cfg.Subjects { +func deliveryFormsCycle(cfg *StreamConfig, deliverySubject string) bool { + for _, subject := range cfg.Subjects { if subjectIsSubsetMatch(deliverySubject, subject) { return true } @@ -3737,65 +4067,11 @@ func (o *consumer) requestNextMsgSubject() string { return o.nextMsgSubj } -// Will set the initial pending and start sequence. -// mset lock should be held. -func (o *consumer) setInitialPendingAndStart() { - mset := o.mset - if mset == nil || mset.store == nil { - return - } - - // !filtered means we want all messages. - filtered, dp := o.cfg.FilterSubject != _EMPTY_, o.cfg.DeliverPolicy - if filtered { - // Check to see if we directly match the configured stream. - // Many clients will always send a filtered subject. - cfg := &mset.cfg - if len(cfg.Subjects) == 1 && cfg.Subjects[0] == o.cfg.FilterSubject { - filtered = false - } - } - - if !filtered && dp != DeliverLastPerSubject { - var state StreamState - mset.store.FastState(&state) - if state.Msgs > 0 { - o.sgap = state.Msgs - (o.sseq - state.FirstSeq) - o.lsgap = state.LastSeq - } - } else { - // Here we are filtered. - if dp == DeliverLastPerSubject && o.hasSkipListPending() && o.sseq < o.lss.resume { - ss := mset.store.FilteredState(o.lss.resume+1, o.cfg.FilterSubject) - o.sseq = o.lss.seqs[0] - o.sgap = ss.Msgs + uint64(len(o.lss.seqs)) - o.lsgap = ss.Last - } else if ss := mset.store.FilteredState(o.sseq, o.cfg.FilterSubject); ss.Msgs > 0 { - o.sgap = ss.Msgs - o.lsgap = ss.Last - // See if we should update our starting sequence. - if dp == DeliverLast || dp == DeliverLastPerSubject { - o.sseq = ss.Last - } else if dp == DeliverNew { - o.sseq = ss.Last + 1 - } else { - // DeliverAll, DeliverByStartSequence, DeliverByStartTime - o.sseq = ss.First - } - // Cleanup lss when we take over in clustered mode. - if dp == DeliverLastPerSubject && o.hasSkipListPending() && o.sseq >= o.lss.resume { - o.lss = nil - } - } - o.updateSkipped() - } -} - func (o *consumer) decStreamPending(sseq uint64, subj string) { o.mu.Lock() - // Ignore if we have already seen this one. - if sseq >= o.sseq && o.sgap > 0 && o.isFilteredMatch(subj) { - o.sgap-- + // Update our cached num pending. Only do so if we think deliverMsg has not done so. + if sseq > o.npcm && sseq >= o.sseq && o.isFilteredMatch(subj) { + o.npc-- } // Check if this message was pending. 
p, wasPending := o.pending[sseq] diff --git a/vendor/github.com/nats-io/nats-server/v2/server/dirstore.go b/vendor/github.com/nats-io/nats-server/v2/server/dirstore.go index b0d82ea6..9bb499fd 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/dirstore.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/dirstore.go @@ -28,6 +28,8 @@ import ( "sync" "time" + "github.com/nats-io/nkeys" + "github.com/nats-io/jwt/v2" // only used to decode, not for storage ) @@ -321,6 +323,9 @@ func (store *DirJWTStore) Merge(pack string) error { return fmt.Errorf("line in package didn't contain 2 entries: %q", line) } pubKey := split[0] + if !nkeys.IsValidPublicAccountKey(pubKey) { + return fmt.Errorf("key to merge is not a valid public account key") + } if err := store.saveIfNewer(pubKey, split[1]); err != nil { return err } @@ -370,6 +375,9 @@ func (store *DirJWTStore) pathForKey(publicKey string) string { if len(publicKey) < 2 { return _EMPTY_ } + if !nkeys.IsValidPublicKey(publicKey) { + return _EMPTY_ + } fileName := fmt.Sprintf("%s%s", publicKey, fileExtension) if store.shard { last := publicKey[len(publicKey)-2:] @@ -488,7 +496,7 @@ func (store *DirJWTStore) save(publicKey string, theJWT string) error { } // Assumes the lock is NOT held, and only updates if the jwt is new, or the one on disk is older -// returns true when the jwt changed +// When changed, invokes jwt changed callback func (store *DirJWTStore) saveIfNewer(publicKey string, theJWT string) error { if store.readonly { return fmt.Errorf("store is read-only") @@ -505,7 +513,7 @@ func (store *DirJWTStore) saveIfNewer(publicKey string, theJWT string) error { } if _, err := os.Stat(path); err == nil { if newJWT, err := jwt.DecodeGeneric(theJWT); err != nil { - // skip if it can't be decoded + return err } else if existing, err := ioutil.ReadFile(path); err != nil { return err } else if existingJWT, err := jwt.DecodeGeneric(string(existing)); err != nil { @@ -514,6 +522,10 @@ func (store *DirJWTStore) saveIfNewer(publicKey string, theJWT string) error { return nil } else if existingJWT.IssuedAt > newJWT.IssuedAt { return nil + } else if newJWT.Subject != publicKey { + return fmt.Errorf("jwt subject nkey and provided nkey do not match") + } else if existingJWT.Subject != newJWT.Subject { + return fmt.Errorf("subject of existing and new jwt do not match") } } store.Lock() diff --git a/vendor/github.com/nats-io/nats-server/v2/server/errors.go b/vendor/github.com/nats-io/nats-server/v2/server/errors.go index 1dcbe8d9..56b5af3f 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/errors.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/errors.go @@ -46,6 +46,9 @@ var ( // ErrBadSubject represents an error condition for an invalid subject. ErrBadSubject = errors.New("invalid subject") + // ErrBadSubjectMappingDestination is used to error on a bad transform destination mapping + ErrBadSubjectMappingDestination = errors.New("invalid subject mapping destination") + // ErrBadQualifier is used to error on a bad qualifier for a transform. ErrBadQualifier = errors.New("bad qualifier") @@ -181,6 +184,13 @@ var ( // ErrCertNotPinned is returned when pinned certs are set and the certificate is not in it ErrCertNotPinned = errors.New("certificate not pinned") + + // ErrDuplicateServerName is returned when processing a server remote connection and + // the server reports that this server name is already used in the cluster. 
+ ErrDuplicateServerName = errors.New("duplicate server name") + + // ErrMinimumVersionRequired is returned when a connection is not at the minimum version required. + ErrMinimumVersionRequired = errors.New("minimum version required") ) // configErr is a configuration error. diff --git a/vendor/github.com/nats-io/nats-server/v2/server/errors.json b/vendor/github.com/nats-io/nats-server/v2/server/errors.json index b520415c..1192377b 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/errors.json +++ b/vendor/github.com/nats-io/nats-server/v2/server/errors.json @@ -51,7 +51,7 @@ }, { "constant": "JSStreamSubjectOverlapErr", - "code": 500, + "code": 400, "error_code": 10065, "description": "subjects overlap with an existing stream", "comment": "", @@ -293,7 +293,7 @@ "constant": "JSMirrorWithSubjectsErr", "code": 400, "error_code": 10034, - "description": "stream mirrors can not also contain subjects", + "description": "stream mirrors can not contain subjects", "comment": "", "help": "", "url": "", @@ -333,7 +333,7 @@ "constant": "JSStreamMirrorNotUpdatableErr", "code": 400, "error_code": 10055, - "description": "Mirror configuration can not be updated", + "description": "stream mirror configuration can not be updated", "comment": "", "help": "", "url": "", @@ -404,7 +404,7 @@ "code": 500, "error_code": 10029, "description": "{err}", - "comment": "Generic mirror consumer setup failure string", + "comment": "generic mirror consumer setup failure string", "help": "", "url": "", "deprecates": "" @@ -1158,5 +1158,95 @@ "help": "", "url": "", "deprecates": "" + }, + { + "constant": "JSStreamOfflineErr", + "code": 500, + "error_code": 10118, + "description": "stream is offline", + "comment": "", + "help": "", + "url": "", + "deprecates": "" + }, + { + "constant": "JSConsumerOfflineErr", + "code": 500, + "error_code": 10119, + "description": "consumer is offline", + "comment": "", + "help": "", + "url": "", + "deprecates": "" + }, + { + "constant": "JSNoLimitsErr", + "code": 400, + "error_code": 10120, + "description": "no JetStream default or applicable tiered limit present", + "comment": "", + "help": "", + "url": "", + "deprecates": "" + }, + { + "constant": "JSConsumerMaxPendingAckExcessErrF", + "code": 400, + "error_code": 10121, + "description": "consumer max ack pending exceeds system limit of {limit}", + "comment": "", + "help": "", + "url": "", + "deprecates": "" + }, + { + "constant": "JSStreamMaxStreamBytesExceeded", + "code": 400, + "error_code": 10122, + "description": "stream max bytes exceeds account limit max stream bytes", + "comment": "", + "help": "", + "url": "", + "deprecates": "" + }, + { + "constant": "JSStreamMoveAndScaleErr", + "code": 400, + "error_code": 10123, + "description": "can not move and scale a stream in a single update", + "comment": "", + "help": "", + "url": "", + "deprecates": "" + }, + { + "constant": "JSStreamMoveInProgress", + "code": 400, + "error_code": 10124, + "description": "stream move already in progress", + "comment": "", + "help": "", + "url": "", + "deprecates": "" + }, + { + "constant": "JSConsumerMaxRequestBatchExceededF", + "code": 400, + "error_code": 10125, + "description": "consumer max request batch exceeds server limit of {limit}", + "comment": "", + "help": "", + "url": "", + "deprecates": "" + }, + { + "constant": "JSConsumerReplicasExceedsStream", + "code": 400, + "error_code": 10126, + "description": "consumer config replica count exceeds parent stream", + "comment": "", + "help": "", + "url": "", + "deprecates": "" } -] +] \ 
No newline at end of file diff --git a/vendor/github.com/nats-io/nats-server/v2/server/events.go b/vendor/github.com/nats-io/nats-server/v2/server/events.go index e519b1e7..bec38180 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/events.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/events.go @@ -433,6 +433,7 @@ RESET: // there is a chance that the process will exit before the // writeLoop has a chance to send it. c.flushClients(time.Second) + sendq.recycle(&msgs) return } pm.returnToPool() @@ -690,6 +691,9 @@ func (s *Server) sendStatsz(subj string) { ni := v.(nodeInfo) ni.stats = jStat.Stats ni.cfg = jStat.Config + s.optsMu.RLock() + ni.tags = copyStrings(s.opts.Tags) + s.optsMu.RUnlock() s.nodeToInfo.Store(ourNode, ni) } // Metagroup info. @@ -1009,8 +1013,8 @@ func (s *Server) addSystemAccountExports(sacc *Account) { if err := sacc.AddServiceExport(accSubsSubj, nil); err != nil { s.Errorf("Error adding system service export for %q: %v", accSubsSubj, err) } - - if s.JetStreamEnabled() { + // in case of a mixed mode setup, enable js exports anyway + if s.JetStreamEnabled() || !s.standAloneMode() { s.checkJetStreamExports() } } @@ -1573,12 +1577,12 @@ func (s *Server) registerSystemImports(a *Account) { // Add in this to the account in 2 places. // "$SYS.REQ.SERVER.PING.CONNZ" and "$SYS.REQ.ACCOUNT.PING.CONNZ" - if _, ok := a.imports.services[connzSubj]; !ok { + if !a.serviceImportExists(connzSubj) { if err := a.AddServiceImport(sacc, connzSubj, mappedSubj); err != nil { s.Errorf("Error setting up system service imports for account: %v", err) } } - if _, ok := a.imports.services[accConnzReqSubj]; !ok { + if !a.serviceImportExists(accConnzReqSubj) { if err := a.AddServiceImport(sacc, accConnzReqSubj, mappedSubj); err != nil { s.Errorf("Error setting up system service imports for account: %v", err) } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/filestore.go b/vendor/github.com/nats-io/nats-server/v2/server/filestore.go index d23cdf59..f3a30662 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/filestore.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/filestore.go @@ -32,6 +32,7 @@ import ( "path/filepath" "runtime" "sort" + "strings" "sync" "sync/atomic" "time" @@ -92,7 +93,7 @@ type fileStore struct { psmc map[string]uint64 hh hash.Hash64 qch chan struct{} - cfs []*consumerFileStore + cfs []ConsumerStore sips int closed bool fip bool @@ -120,6 +121,7 @@ type msgBlock struct { rbytes uint64 // Total bytes (raw) including deleted. Used for rolling to new blk. msgs uint64 // User visible message count. fss map[string]*SimpleState + sfilter string // Single subject filter sfn string kfn string lwits int64 @@ -150,6 +152,7 @@ type cache struct { idx []uint32 lrl uint32 fseq uint64 + nra bool } type msgId struct { @@ -157,16 +160,6 @@ type msgId struct { ts int64 } -type fileStoredMsg struct { - subj string - hdr []byte - msg []byte - seq uint64 - ts int64 // nanoseconds - mb *msgBlock - off int64 // offset into block file -} - const ( // Magic is used to identify the file store files. magic = uint8(22) @@ -225,22 +218,28 @@ const ( blkKeySize = 72 // Default stream block size. - defaultStreamBlockSize = 16 * 1024 * 1024 // 16MB + defaultLargeBlockSize = 8 * 1024 * 1024 // 8MB // Default for workqueue or interest based. - defaultOtherBlockSize = 8 * 1024 * 1024 // 8MB + defaultMediumBlockSize = 4 * 1024 * 1024 // 4MB + // For smaller reuse buffers. Usually being generated during contention on the lead write buffer. + // E.g. 
mirrors/sources etc. + defaultSmallBlockSize = 1 * 1024 * 1024 // 1MB // Default for KV based - defaultKVBlockSize = 8 * 1024 * 1024 // 8MB + defaultKVBlockSize = defaultMediumBlockSize // max block size for now. - maxBlockSize = defaultStreamBlockSize + maxBlockSize = defaultLargeBlockSize // Compact minimum threshold. compactMinimum = 2 * 1024 * 1024 // 2MB // FileStoreMinBlkSize is minimum size we will do for a blk size. FileStoreMinBlkSize = 32 * 1000 // 32kib // FileStoreMaxBlkSize is maximum size we will do for a blk size. FileStoreMaxBlkSize = maxBlockSize - // Check for bad record length value due to corrupt data. rlBadThresh = 32 * 1024 * 1024 + // Time threshold to write index info. + wiThresh = int64(2 * time.Second) + // Time threshold to write index info for non FIFO cases + winfThresh = int64(500 * time.Millisecond) ) func newFileStore(fcfg FileStoreConfig, cfg StreamConfig) (*fileStore, error) { @@ -376,6 +375,18 @@ func (fs *fileStore) UpdateConfig(cfg *StreamConfig) error { fs.ageChk.Stop() fs.ageChk = nil } + + // Update our sfilter for the last block. + if lmb := fs.lmb; lmb != nil { + lmb.mu.Lock() + if len(fs.cfg.Subjects) == 1 { + lmb.sfilter = fs.cfg.Subjects[0] + } else { + lmb.sfilter = _EMPTY_ + } + lmb.mu.Unlock() + } + fs.mu.Unlock() if cfg.MaxAge != 0 { @@ -391,21 +402,22 @@ func dynBlkSize(retention RetentionPolicy, maxBytes int64) uint64 { if m := blkSize % 100; m != 0 { blkSize += 100 - m } - if blkSize < FileStoreMinBlkSize { + if blkSize <= FileStoreMinBlkSize { blkSize = FileStoreMinBlkSize - } - if blkSize > FileStoreMaxBlkSize { + } else if blkSize >= FileStoreMaxBlkSize { blkSize = FileStoreMaxBlkSize + } else { + blkSize = defaultMediumBlockSize } return uint64(blkSize) } if retention == LimitsPolicy { // TODO(dlc) - Make the blocksize relative to this if set. - return defaultStreamBlockSize + return defaultLargeBlockSize } else { // TODO(dlc) - Make the blocksize relative to this if set. - return defaultOtherBlockSize + return defaultMediumBlockSize } } @@ -490,6 +502,57 @@ func (fs *fileStore) writeStreamMeta() error { return nil } +// Pools to recycle the blocks to help with memory pressure. +var blkPoolBig sync.Pool // 16MB +var blkPoolMedium sync.Pool // 8MB +var blkPoolSmall sync.Pool // 2MB + +// Get a new msg block based on sz estimate. +func getMsgBlockBuf(sz int) (buf []byte) { + var pb interface{} + if sz <= defaultSmallBlockSize { + pb = blkPoolSmall.Get() + } else if sz <= defaultMediumBlockSize { + pb = blkPoolMedium.Get() + } else { + pb = blkPoolBig.Get() + } + if pb != nil { + buf = *(pb.(*[]byte)) + } else { + // Here we need to make a new blk. + // If small leave as is.. + if sz > defaultSmallBlockSize && sz <= defaultMediumBlockSize { + sz = defaultMediumBlockSize + } else if sz > defaultMediumBlockSize { + sz = defaultLargeBlockSize + } + buf = make([]byte, sz) + } + return buf[:0] +} + +// Recycle the msg block. +func recycleMsgBlockBuf(buf []byte) { + if buf == nil || cap(buf) < defaultSmallBlockSize { + return + } + // Make sure to reset before placing back into pool. + buf = buf[:0] + + // We need to make sure the load code gets a block that can fit the maximum for a size block. + // E.g. 8, 16 etc. otherwise we thrash and actually make things worse by pulling it out, and putting + // it right back in and making a new []byte. 
+ // From above we know its already >= defaultSmallBlockSize + if sz := cap(buf); sz < defaultMediumBlockSize { + blkPoolSmall.Put(&buf) + } else if sz < defaultLargeBlockSize { + blkPoolMedium.Put(&buf) + } else { + blkPoolBig.Put(&buf) + } +} + const ( msgHdrSize = 22 checksumSize = 8 @@ -499,6 +562,7 @@ const ( // This is the max room needed for index header. const indexHdrSize = 7*binary.MaxVarintLen64 + hdrLen + checksumSize +// Lock held on entry func (fs *fileStore) recoverMsgBlock(fi os.FileInfo, index uint64) (*msgBlock, error) { mb := &msgBlock{fs: fs, index: index, cexp: fs.fcfg.CacheExpire} @@ -599,20 +663,28 @@ func (fs *fileStore) recoverMsgBlock(fi os.FileInfo, index uint64) (*msgBlock, e } // Grab last checksum from main block file. var lchk [8]byte - file.ReadAt(lchk[:], fi.Size()-8) + if mb.rbytes >= checksumSize { + file.ReadAt(lchk[:], fi.Size()-checksumSize) + } file.Close() // Read our index file. Use this as source of truth if possible. if err := mb.readIndexInfo(); err == nil { // Quick sanity check here. - // Note this only checks that the message blk file is not newer then this file. - if bytes.Equal(lchk[:], mb.lchk[:]) { + // Note this only checks that the message blk file is not newer then this file, or is empty and we expect empty. + if (mb.rbytes == 0 && mb.msgs == 0) || bytes.Equal(lchk[:], mb.lchk[:]) { if fs.tms { if err = mb.readPerSubjectInfo(); err != nil { return nil, err } } fs.blks = append(fs.blks, mb) + // If we only have one subject registered we can optimize filtered lookups here. + if len(mb.fss) == 1 { + for sfilter := range mb.fss { + mb.sfilter = sfilter + } + } return mb, nil } } @@ -784,7 +856,10 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { // This is an old erased message, or a new one that we can track. if seq == 0 || seq&ebit != 0 || seq < mb.first.seq { seq = seq &^ ebit - addToDmap(seq) + // Only add to dmap if past recorded first seq and non-zero. + if seq != 0 && seq >= mb.first.seq { + addToDmap(seq) + } index += rl mb.last.seq = seq mb.last.ts = ts @@ -834,14 +909,16 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { mb.bytes += uint64(rl) // Do per subject info. - if mb.fss != nil { - if subj := string(data[:slen]); len(subj) > 0 { - if ss := mb.fss[subj]; ss != nil { - ss.Msgs++ - ss.Last = seq - } else { - mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq} - } + if slen > 0 && mb.fss != nil { + // For the lookup, we cast the byte slice and there won't be any copy + if ss := mb.fss[string(data[:slen])]; ss != nil { + ss.Msgs++ + ss.Last = seq + } else { + // This will either use a subject from the config, or make a copy + // so we don't reference the underlying buffer. + subj := mb.subjString(data[:slen]) + mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq} } } } @@ -854,6 +931,13 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, error) { mb.last.seq = mb.first.seq - 1 } + // If we only have one subject registered we can optimize filtered lookups here. + if len(mb.fss) == 1 { + for sfilter := range mb.fss { + mb.sfilter = sfilter + } + } + return nil, nil } @@ -905,6 +989,12 @@ func (fs *fileStore) recoverMsgs() error { if len(fs.blks) > 0 { sort.Slice(fs.blks, func(i, j int) bool { return fs.blks[i].index < fs.blks[j].index }) fs.lmb = fs.blks[len(fs.blks)-1] + // Update our sfilter for the last block since we could have only see one subject during recovery. 
+ if len(fs.cfg.Subjects) == 1 { + fs.lmb.sfilter = fs.cfg.Subjects[0] + } else { + fs.lmb.sfilter = _EMPTY_ + } } else { _, err = fs.newMsgBlockForWrite() } @@ -993,9 +1083,11 @@ func (fs *fileStore) expireMsgsOnRecover() { break } + var smv StoreMsg + // Walk messages and remove if expired. for seq := mb.first.seq; seq <= mb.last.seq; seq++ { - sm, err := mb.cacheLookup(seq) + sm, err := mb.cacheLookup(seq, &smv) // Process interior deleted msgs. if err == errDeletedMsg { // Update dmap. @@ -1005,10 +1097,14 @@ func (fs *fileStore) expireMsgsOnRecover() { mb.dmap = nil } } + // Keep this update just in case since we are removing dmap entries. + mb.first.seq = seq continue } // Break on other errors. if err != nil || sm == nil { + // Keep this update just in case since we could have removed dmap entries. + mb.first.seq = seq break } @@ -1023,14 +1119,16 @@ func (fs *fileStore) expireMsgsOnRecover() { } // Delete the message here. - sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) - mb.bytes -= sz - bytes += sz - mb.msgs-- - purged++ + if mb.msgs > 0 { + sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) + mb.bytes -= sz + bytes += sz + mb.msgs-- + purged++ + } // Update fss fs.removePerSubject(sm.subj) - mb.removeSeqPerSubject(sm.subj, seq) + mb.removeSeqPerSubject(sm.subj, seq, &smv) } // Check if empty after processing, could happen if tail of messages are all deleted. @@ -1106,10 +1204,12 @@ func (fs *fileStore) GetSeqFromTime(t time.Time) uint64 { lseq := mb.last.seq mb.mu.RUnlock() + var smv StoreMsg + // Linear search, hence the dumb part.. ts := t.UnixNano() for seq := fseq; seq <= lseq; seq++ { - sm, _, _ := mb.fetchMsg(seq) + sm, _, _ := mb.fetchMsg(seq, &smv) if sm != nil && sm.ts >= ts { return sm.seq } @@ -1118,32 +1218,40 @@ func (fs *fileStore) GetSeqFromTime(t time.Time) uint64 { } // Find the first matching message. -func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64) (*fileStoredMsg, bool, error) { +func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, bool, error) { mb.mu.Lock() defer mb.mu.Unlock() - isAll, subs := filter == _EMPTY_ || filter == fwcs, []string{filter} - // If we have a wildcard match against all tracked subjects we know about. - if wc || isAll { - subs = subs[:0] - for subj := range mb.fss { - if isAll || subjectIsSubsetMatch(subj, filter) { - subs = append(subs, subj) + fseq, isAll, subs := start, filter == _EMPTY_ || filter == mb.sfilter || filter == fwcs, []string{filter} + + // Skip scan of mb.fss is number of messages in the block are less than + // 1/2 the number of subjects in mb.fss. + doLinearScan := isAll || 2*int(mb.last.seq-start) < len(mb.fss) + + if !doLinearScan { + // If we have a wildcard match against all tracked subjects we know about. 
+ if wc { + subs = subs[:0] + for subj := range mb.fss { + if subjectIsSubsetMatch(subj, filter) { + subs = append(subs, subj) + } } } - } - fseq := mb.last.seq + 1 - for _, subj := range subs { - ss := mb.fss[subj] - if ss == nil || start > ss.Last || ss.First >= fseq { - continue - } - if ss.First < start { - fseq = start - } else { - fseq = ss.First + fseq = mb.last.seq + 1 + for _, subj := range subs { + ss := mb.fss[subj] + if ss == nil || start > ss.Last || ss.First >= fseq { + continue + } + if ss.First < start { + fseq = start + } else { + fseq = ss.First + } } } + if fseq > mb.last.seq { return nil, false, ErrStoreMsgNotFound } @@ -1154,19 +1262,31 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64) (*fileSt } } + if sm == nil { + sm = new(StoreMsg) + } + for seq := fseq; seq <= mb.last.seq; seq++ { llseq := mb.llseq - sm, err := mb.cacheLookup(seq) + fsm, err := mb.cacheLookup(seq, sm) if err != nil { continue } - expireOk := seq == mb.last.seq && mb.llseq == seq-1 - if len(subs) == 1 && sm.subj == subs[0] { - return sm, expireOk, nil - } - for _, subj := range subs { - if sm.subj == subj { - return sm, expireOk, nil + expireOk := seq == mb.last.seq && mb.llseq == seq + if doLinearScan { + if isAll { + return fsm, expireOk, nil + } + if wc && subjectIsSubsetMatch(fsm.subj, filter) { + return fsm, expireOk, nil + } else if !wc && fsm.subj == filter { + return fsm, expireOk, nil + } + } else { + for _, subj := range subs { + if fsm.subj == subj { + return fsm, expireOk, nil + } } } // If we are here we did not match, so put the llseq back. @@ -1258,8 +1378,10 @@ func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, seq uint64) (t } numScanIn, numScanOut := lseq-seq, seq-mb.first.seq + var smv StoreMsg + isMatch := func(seq uint64) bool { - if sm, _ := mb.cacheLookup(seq); sm != nil { + if sm, _ := mb.cacheLookup(seq, &smv); sm != nil { if len(subs) == 1 && sm.subj == subs[0] { return true } @@ -1361,9 +1483,10 @@ func (fs *fileStore) FilteredState(sseq uint64, subj string) SimpleState { } } else { // Fallback to linear scan. + var smv StoreMsg eq := compareFn(subj) for seq := sseq; seq <= lseq; seq++ { - if sm, _ := fs.msgForSeq(seq); sm != nil && eq(sm.subj, subj) { + if sm, _ := fs.msgForSeq(seq, &smv); sm != nil && eq(sm.subj, subj) { ss.Msgs++ if ss.First == 0 { ss.First = seq @@ -1449,6 +1572,7 @@ func (mb *msgBlock) setupWriteCache(buf []byte) { if mb.cache != nil { return } + // Setup simple cache. mb.cache = &cache{buf: buf} // Make sure we set the proper cache offset if we have existing data. @@ -1484,14 +1608,7 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { lmb.closeFDsLocked() if lmb.cache != nil { // Reset write timestamp and see if we can expire this cache. - lwts, buf, llts := lmb.lwts, lmb.cache.buf, lmb.llts - lmb.lwts = 0 - lmb.expireCacheLocked() - lmb.lwts = lwts - // We could check for a certain time since last load, but to be safe just reuse if no loads at all. - if llts == 0 && (lmb.cache == nil || lmb.cache.buf == nil) { - rbuf = buf[:0] - } + rbuf = lmb.tryExpireWriteCache() } lmb.mu.Unlock() } @@ -1499,6 +1616,11 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { mb := &msgBlock{fs: fs, index: index, cexp: fs.fcfg.CacheExpire} + // If we only have one subject registered we can optimize filtered lookups here. + if len(fs.cfg.Subjects) == 1 { + mb.sfilter = fs.cfg.Subjects[0] + } + // Lock should be held to quiet race detector. 
mb.mu.Lock() mb.setupWriteCache(rbuf) @@ -1594,11 +1716,13 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts in return ErrStoreClosed } + var pscheck bool + var asl bool // Check if we are discarding new messages when we reach the limit. if fs.cfg.Discard == DiscardNew { - var asl bool var fseq uint64 if fs.cfg.MaxMsgsPer > 0 && len(subj) > 0 { + pscheck = true var msgs uint64 if msgs, fseq, _ = fs.perSubjectState(subj); msgs >= uint64(fs.cfg.MaxMsgsPer) { asl = true @@ -1649,7 +1773,9 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts in // Enforce per message limits. if fs.cfg.MaxMsgsPer > 0 && len(subj) > 0 { - fs.enforcePerSubjectLimit(subj) + if !pscheck || asl { + fs.enforcePerSubjectLimit(subj) + } } // Limits checks and enforcement. @@ -1708,26 +1834,30 @@ func (mb *msgBlock) skipMsg(seq uint64, now time.Time) { } var needsRecord bool + nowts := now.UnixNano() + mb.mu.Lock() // If we are empty can just do meta. if mb.msgs == 0 { mb.last.seq = seq - mb.last.ts = now.UnixNano() + mb.last.ts = nowts mb.first.seq = seq + 1 - mb.first.ts = now.UnixNano() + mb.first.ts = nowts + // Take care of index if needed. + if nowts-mb.lwits > wiThresh { + mb.writeIndexInfoLocked() + } } else { needsRecord = true if mb.dmap == nil { mb.dmap = make(map[uint64]struct{}) } mb.dmap[seq] = struct{}{} - mb.msgs-- - mb.bytes -= emptyRecordLen } mb.mu.Unlock() if needsRecord { - mb.writeMsgRecord(emptyRecordLen, seq|ebit, _EMPTY_, nil, nil, now.UnixNano(), true) + mb.writeMsgRecord(emptyRecordLen, seq|ebit, _EMPTY_, nil, nil, nowts, true) } else { mb.kickFlusher() } @@ -1738,18 +1868,14 @@ func (fs *fileStore) SkipMsg() uint64 { fs.mu.Lock() defer fs.mu.Unlock() - // Grab time. - now := time.Now().UTC() - seq := fs.state.LastSeq + 1 - fs.state.LastSeq = seq - fs.state.LastTime = now + // Grab time and last seq. + now, seq := time.Now().UTC(), fs.state.LastSeq+1 + fs.state.LastSeq, fs.state.LastTime = seq, now if fs.state.Msgs == 0 { - fs.state.FirstSeq = seq - fs.state.FirstTime = now + fs.state.FirstSeq, fs.state.FirstTime = seq, now } if seq == fs.state.FirstSeq { - fs.state.FirstSeq = seq + 1 - fs.state.FirstTime = now + fs.state.FirstSeq, fs.state.FirstTime = seq+1, now } fs.lmb.skipMsg(seq, now) @@ -1881,7 +2007,7 @@ func (fs *fileStore) removeMsg(seq uint64, secure, needFSLock bool) (bool, error mb.mu.Lock() - // See if the sequence numbers is still relevant. + // See if the sequence number is still relevant. if seq < mb.first.seq { mb.mu.Unlock() fsUnlock() @@ -1908,7 +2034,8 @@ func (fs *fileStore) removeMsg(seq uint64, secure, needFSLock bool) (bool, error } } - sm, err := mb.cacheLookup(seq) + var smv StoreMsg + sm, err := mb.cacheLookup(seq, &smv) if err != nil { mb.mu.Unlock() fsUnlock() @@ -1930,9 +2057,7 @@ func (fs *fileStore) removeMsg(seq uint64, secure, needFSLock bool) (bool, error // If we are tracking multiple subjects here make sure we update that accounting. fs.removePerSubject(sm.subj) - mb.removeSeqPerSubject(sm.subj, seq) - - var shouldWriteIndex, firstSeqNeedsUpdate bool + mb.removeSeqPerSubject(sm.subj, seq, &smv) if secure { // Grab record info. @@ -1940,39 +2065,48 @@ func (fs *fileStore) removeMsg(seq uint64, secure, needFSLock bool) (bool, error mb.eraseMsg(seq, int(ri), int(rl)) } - // Optimize for FIFO case. 
fifo := seq == mb.first.seq + isLastBlock := mb == fs.lmb + isEmpty := mb.msgs == 0 + shouldWriteIndex := !isEmpty + if fifo { mb.selectNextFirst() - if mb.isEmpty() { - fs.removeMsgBlock(mb) - firstSeqNeedsUpdate = seq == fs.state.FirstSeq - } else { - shouldWriteIndex = true + if !isEmpty { + // Can update this one in place. if seq == fs.state.FirstSeq { fs.state.FirstSeq = mb.first.seq // new one. fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() } } - } else { - // Check if we are empty first, as long as not the last message block. - if notLast := mb != fs.lmb; notLast && mb.msgs == 0 { - fs.removeMsgBlock(mb) - firstSeqNeedsUpdate = seq == fs.state.FirstSeq - } else { - // Out of order delete. - shouldWriteIndex = true - if mb.dmap == nil { - mb.dmap = make(map[uint64]struct{}) - } - mb.dmap[seq] = struct{}{} - // Check if <25% utilization and minimum size met. - if notLast && mb.rbytes > compactMinimum && mb.rbytes>>2 > mb.bytes { + } else if !isEmpty { + // Out of order delete. + if mb.dmap == nil { + mb.dmap = make(map[uint64]struct{}) + } + mb.dmap[seq] = struct{}{} + // Check if <25% utilization and minimum size met. + if mb.rbytes > compactMinimum && !isLastBlock { + // Remove the interior delete records + rbytes := mb.rbytes - uint64(len(mb.dmap)*emptyRecordLen) + if rbytes>>2 > mb.bytes { mb.compact() } } } + var firstSeqNeedsUpdate bool + + // Decide how we want to clean this up. If last block we will hold into index. + if isEmpty { + if isLastBlock { + mb.closeAndKeepIndex() + } else { + fs.removeMsgBlock(mb) + } + firstSeqNeedsUpdate = seq == fs.state.FirstSeq + } + var qch, fch chan struct{} if shouldWriteIndex { qch, fch = mb.qch, mb.fch @@ -1987,22 +2121,26 @@ func (fs *fileStore) removeMsg(seq uint64, secure, needFSLock bool) (bool, error // Check if we need to write the index file and we are flush in place (fip). if shouldWriteIndex && fs.fip { // Check if this is the first message, common during expirations etc. - if !fifo || time.Now().UnixNano()-mb.lwits > int64(2*time.Second) { + threshold := wiThresh + if !fifo { + // For out-of-order deletes, we will have a shorter threshold, but + // still won't write the index for every single delete. + threshold = winfThresh + } + if time.Now().UnixNano()-mb.lwits > threshold { mb.writeIndexInfoLocked() } } mb.mu.Unlock() // Kick outside of lock. - if shouldWriteIndex { - if !fs.fip { - if qch == nil { - mb.spinUpFlushLoop() - } - select { - case fch <- struct{}{}: - default: - } + if !fs.fip && shouldWriteIndex { + if qch == nil { + mb.spinUpFlushLoop() + } + select { + case fch <- struct{}{}: + default: } } @@ -2011,7 +2149,11 @@ func (fs *fileStore) removeMsg(seq uint64, secure, needFSLock bool) (bool, error // we don't lose track of the first sequence. if firstSeqNeedsUpdate { fs.selectNextFirst() - fs.lmb.writeIndexInfo() + // Write out the new first message block if we have one. + if len(fs.blks) > 0 { + fmb := fs.blks[0] + fmb.writeIndexInfo() + } } fs.mu.Unlock() @@ -2037,7 +2179,8 @@ func (fs *fileStore) removeMsg(seq uint64, secure, needFSLock bool) (bool, error // writing new messages. We will silently bail on any issues with the underlying block and let someone else detect. // Write lock needs to be held. func (mb *msgBlock) compact() { - if mb.cacheNotLoaded() { + wasLoaded := mb.cacheAlreadyLoaded() + if !wasLoaded { if err := mb.loadMsgsWithLock(); err != nil { return } @@ -2119,11 +2262,12 @@ func (mb *msgBlock) compact() { // We will write to a new file and mv/rename it in case of failure. 
mfn := filepath.Join(filepath.Join(mb.fs.fcfg.StoreDir, msgDir), fmt.Sprintf(newScan, mb.index)) - defer os.Remove(mfn) if err := ioutil.WriteFile(mfn, nbuf, defaultFilePerms); err != nil { + os.Remove(mfn) return } if err := os.Rename(mfn, mb.mfn); err != nil { + os.Remove(mfn) return } @@ -2132,7 +2276,11 @@ func (mb *msgBlock) compact() { mb.removeIndexFileLocked() mb.deleteDmap() mb.rebuildStateLocked() - mb.loadMsgsWithLock() + + // If we entered with the msgs loaded make sure to reload them. + if wasLoaded { + mb.loadMsgsWithLock() + } } // Nil out our dmap. @@ -2335,7 +2483,7 @@ func (mb *msgBlock) eraseMsg(seq uint64, ri, rl int) error { } // Truncate this message block to the storedMsg. -func (mb *msgBlock) truncate(sm *fileStoredMsg) (nmsgs, nbytes uint64, err error) { +func (mb *msgBlock) truncate(sm *StoreMsg) (nmsgs, nbytes uint64, err error) { // Make sure we are loaded to process messages etc. if err := mb.loadMsgs(); err != nil { return 0, 0, err @@ -2352,23 +2500,34 @@ func (mb *msgBlock) truncate(sm *fileStoredMsg) (nmsgs, nbytes uint64, err error var purged, bytes uint64 mb.mu.Lock() + checkDmap := len(mb.dmap) > 0 + var smv StoreMsg + for seq := mb.last.seq; seq > sm.seq; seq-- { if checkDmap { if _, ok := mb.dmap[seq]; ok { // Delete and skip to next. delete(mb.dmap, seq) + if len(mb.dmap) == 0 { + mb.dmap = nil + checkDmap = false + } continue } } // We should have a valid msg to calculate removal stats. - _, rl, _, err := mb.slotInfo(int(seq - mb.cache.fseq)) - if err != nil { - mb.mu.Unlock() - return 0, 0, err + if m, err := mb.cacheLookup(seq, &smv); err == nil { + if mb.msgs > 0 { + rl := fileStoreMsgSize(m.subj, m.hdr, m.msg) + mb.msgs-- + mb.bytes -= rl + mb.rbytes -= rl + // For return accounting. + purged++ + bytes += uint64(rl) + } } - purged++ - bytes += uint64(rl) } // Truncate our msgs and close file. @@ -2384,11 +2543,6 @@ func (mb *msgBlock) truncate(sm *fileStoredMsg) (nmsgs, nbytes uint64, err error return 0, 0, fmt.Errorf("failed to truncate msg block %d, file not open", mb.index) } - // Do local mb stat updates. - mb.msgs -= purged - mb.bytes -= bytes - mb.rbytes -= bytes - // Update our last msg. mb.last.seq = sm.seq mb.last.ts = sm.ts @@ -2432,11 +2586,12 @@ func (mb *msgBlock) selectNextFirst() { // Need to get the timestamp. // We will try the cache direct and fallback if needed. - sm, _ := mb.cacheLookup(seq) + var smv StoreMsg + sm, _ := mb.cacheLookup(seq, &smv) if sm == nil { // Slow path, need to unlock. mb.mu.Unlock() - sm, _, _ = mb.fetchMsg(seq) + sm, _, _ = mb.fetchMsg(seq, &smv) mb.mu.Lock() } if sm != nil { @@ -2500,6 +2655,7 @@ func (mb *msgBlock) clearCache() { return } + buf := mb.cache.buf if mb.cache.off == 0 { mb.cache = nil } else { @@ -2508,6 +2664,7 @@ func (mb *msgBlock) clearCache() { mb.cache.idx = nil mb.cache.wp = 0 } + recycleMsgBlockBuf(buf) } // Called to possibly expire a message block cache. @@ -2531,6 +2688,29 @@ func (mb *msgBlock) tryForceExpireCacheLocked() { mb.llts = llts } +// This is for expiration of the write cache, which will be partial with fip. +// So we want to bypass the Pools here. +// Lock should be held. +func (mb *msgBlock) tryExpireWriteCache() []byte { + if mb.cache == nil { + return nil + } + lwts, buf, llts, nra := mb.lwts, mb.cache.buf, mb.llts, mb.cache.nra + mb.lwts, mb.cache.nra = 0, true + mb.expireCacheLocked() + mb.lwts = lwts + if mb.cache != nil { + mb.cache.nra = nra + } + // We could check for a certain time since last load, but to be safe just reuse if no loads at all. 
+ if llts == 0 && (mb.cache == nil || mb.cache.buf == nil) { + // Clear last write time since we now are about to move on to a new lmb. + mb.lwts = 0 + return buf[:0] + } + return nil +} + // Lock should be held. func (mb *msgBlock) expireCacheLocked() { if mb.cache == nil { @@ -2565,6 +2745,9 @@ func (mb *msgBlock) expireCacheLocked() { // If we are here we will at least expire the core msg buffer. // We need to capture offset in case we do a write next before a full load. mb.cache.off += len(mb.cache.buf) + if !mb.cache.nra { + recycleMsgBlockBuf(mb.cache.buf) + } mb.cache.buf = nil mb.cache.wp = 0 @@ -2608,9 +2791,12 @@ func (fs *fileStore) cancelAgeChk() { func (fs *fileStore) expireMsgs() { // We need to delete one by one here and can not optimize for the time being. // Reason is that we need more information to adjust ack pending in consumers. - var sm *fileStoredMsg + var smv StoreMsg + var sm *StoreMsg + fs.mu.RLock() minAge := time.Now().UnixNano() - int64(fs.cfg.MaxAge) - for sm, _ = fs.msgForSeq(0); sm != nil && sm.ts <= minAge; sm, _ = fs.msgForSeq(0) { + fs.mu.RUnlock() + for sm, _ = fs.msgForSeq(0, &smv); sm != nil && sm.ts <= minAge; sm, _ = fs.msgForSeq(0, &smv) { fs.removeMsg(sm.seq, false, true) } @@ -2758,10 +2944,10 @@ func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte // Set cache timestamp for last store. mb.lwts = ts // Decide if we write index info if flushing in place. - writeIndex := ts-mb.lwits > int64(2*time.Second) + writeIndex := ts-mb.lwits > wiThresh // Accounting - mb.updateAccounting(seq&^ebit, ts, rl) + mb.updateAccounting(seq, ts, rl) // Check if we are tracking per subject for our simple state. if len(subj) > 0 && mb.fss != nil { @@ -2822,6 +3008,11 @@ func (mb *msgBlock) closeFDsLocked() error { if buf, _ := mb.bytesPending(); len(buf) > 0 { return errPendingData } + mb.closeFDsLockedNoCheck() + return nil +} + +func (mb *msgBlock) closeFDsLockedNoCheck() { if mb.mfd != nil { mb.mfd.Close() mb.mfd = nil @@ -2830,7 +3021,6 @@ func (mb *msgBlock) closeFDsLocked() error { mb.ifd.Close() mb.ifd = nil } - return nil } // bytesPending returns the buffer to be used for writing to the underlying file. @@ -2864,6 +3054,11 @@ func (mb *msgBlock) blkSize() uint64 { // Update accounting on a write msg. // Lock should be held. func (mb *msgBlock) updateAccounting(seq uint64, ts int64, rl uint64) { + isDeleted := seq&ebit != 0 + if isDeleted { + seq = seq &^ ebit + } + if mb.first.seq == 0 || mb.first.ts == 0 { mb.first.seq = seq mb.first.ts = ts @@ -2871,9 +3066,12 @@ func (mb *msgBlock) updateAccounting(seq uint64, ts int64, rl uint64) { // Need atomics here for selectMsgBlock speed. atomic.StoreUint64(&mb.last.seq, seq) mb.last.ts = ts - mb.bytes += rl mb.rbytes += rl - mb.msgs++ + // Only update this accounting if message is not a deleted message. + if !isDeleted { + mb.bytes += rl + mb.msgs++ + } } // Lock should be held. @@ -2887,7 +3085,7 @@ func (fs *fileStore) writeMsgRecord(seq uint64, ts int64, subj string, hdr, msg } // Grab our current last message block. 
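updateAccounting above now recognizes the ebit marker in the sequence and skips msgs/bytes accounting for deleted records while still advancing first/last. A tiny sketch of that flag-in-the-top-bit technique.

package sketch

// ebit marks a record as a delete/tombstone by using the top bit of the
// 64-bit sequence, which real sequences never reach in practice.
const ebit = 1 << 63

// splitSeq strips the marker and reports whether it was set, mirroring the
// isDeleted / seq &^ ebit handling in the patched updateAccounting.
func splitSeq(raw uint64) (seq uint64, deleted bool) {
    return raw &^ ebit, raw&ebit != 0
}

Only records where deleted is false would then bump the msgs and bytes counters, while first/last sequence bookkeeping still moves forward for both kinds.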
mb := fs.lmb - if mb == nil || mb.blkSize()+rl > fs.fcfg.BlockSize { + if mb == nil || mb.msgs > 0 && mb.blkSize()+rl > fs.fcfg.BlockSize { if mb, err = fs.newMsgBlockForWrite(); err != nil { return 0, err } @@ -3158,6 +3356,7 @@ func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) { if cap(mb.cache.buf) <= maxBufReuse { buf = mb.cache.buf[:0] } else { + recycleMsgBlockBuf(mb.cache.buf) buf = nil } if moreBytes > 0 { @@ -3227,6 +3426,15 @@ func (mb *msgBlock) loadBlock(buf []byte) ([]byte, error) { } } + if buf == nil { + buf = getMsgBlockBuf(sz) + if sz > cap(buf) { + // We know we will make a new one so just recycle for now. + recycleMsgBlockBuf(buf) + buf = nil + } + } + if sz > cap(buf) { buf = make([]byte, sz) } else { @@ -3325,7 +3533,7 @@ checkCache: // Fetch a message from this block, possibly reading in and caching the messages. // We assume the block was selected and is correct, so we do not do range checks. -func (mb *msgBlock) fetchMsg(seq uint64) (*fileStoredMsg, bool, error) { +func (mb *msgBlock) fetchMsg(seq uint64, sm *StoreMsg) (*StoreMsg, bool, error) { mb.mu.Lock() defer mb.mu.Unlock() @@ -3334,12 +3542,12 @@ func (mb *msgBlock) fetchMsg(seq uint64) (*fileStoredMsg, bool, error) { return nil, false, err } } - sm, err := mb.cacheLookup(seq) + fsm, err := mb.cacheLookup(seq, sm) if err != nil { return nil, false, err } - expireOk := seq == mb.last.seq && mb.llseq == seq-1 - return sm, expireOk, err + expireOk := seq == mb.last.seq && mb.llseq == seq + return fsm, expireOk, err } var ( @@ -3366,7 +3574,7 @@ const ebit = 1 << 63 // Will do a lookup from cache. // Lock should be held. -func (mb *msgBlock) cacheLookup(seq uint64) (*fileStoredMsg, error) { +func (mb *msgBlock) cacheLookup(seq uint64, sm *StoreMsg) (*StoreMsg, error) { if seq < mb.first.seq || seq > mb.last.seq { return nil, ErrStoreMsgNotFound } @@ -3412,13 +3620,20 @@ func (mb *msgBlock) cacheLookup(seq uint64) (*fileStoredMsg, error) { } // Parse from the raw buffer. - subj, hdr, msg, mseq, ts, err := msgFromBuf(buf, hh) - if err != nil { + fsm, err := mb.msgFromBuf(buf, sm, hh) + if err != nil || fsm == nil { return nil, err } - if seq != mseq { + + // Deleted messages that are decoded return a 0 for seqeunce. + if fsm.seq == 0 { + return nil, errDeletedMsg + } + + if seq != fsm.seq { + recycleMsgBlockBuf(mb.cache.buf) mb.cache.buf = nil - return nil, fmt.Errorf("sequence numbers for cache load did not match, %d vs %d", seq, mseq) + return nil, fmt.Errorf("sequence numbers for cache load did not match, %d vs %d", seq, fsm.seq) } // Clear the check bit here after we know all is good. @@ -3426,15 +3641,16 @@ func (mb *msgBlock) cacheLookup(seq uint64) (*fileStoredMsg, error) { mb.cache.idx[seq-mb.cache.fseq] = (bi | hbit) } - return &fileStoredMsg{subj, hdr, msg, seq, ts, mb, int64(bi)}, nil + return fsm, nil } // Used when we are checking if discarding a message due to max msgs per subject will give us // enough room for a max bytes condition. // Lock should be already held. func (fs *fileStore) sizeForSeq(seq uint64) int { + var smv StoreMsg if mb := fs.selectMsgBlock(seq); mb != nil { - if sm, _, _ := mb.fetchMsg(seq); sm != nil { + if sm, _, _ := mb.fetchMsg(seq, &smv); sm != nil { return int(fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)) } } @@ -3442,7 +3658,7 @@ func (fs *fileStore) sizeForSeq(seq uint64) int { } // Will return message for the given sequence number. 
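loadBlock above now asks getMsgBlockBuf for a pooled read buffer and recycles undersized ones; those helpers are not part of this hunk, so the following is only a guess at their shape, a single sync.Pool with no size bucketing.

package sketch

import "sync"

var blkBufPool = sync.Pool{
    New: func() interface{} { return make([]byte, 0, 64*1024) }, // assumed default capacity
}

// getBlockBuf returns a zero-length buffer, allocating a fresh one only when
// the pooled buffer cannot hold sz bytes.
func getBlockBuf(sz int) []byte {
    b := blkBufPool.Get().([]byte)
    if cap(b) < sz {
        return make([]byte, 0, sz)
    }
    return b[:0]
}

// recycleBlockBuf hands the buffer back for reuse by a later load.
func recycleBlockBuf(b []byte) {
    if cap(b) == 0 {
        return
    }
    blkBufPool.Put(b[:0])
}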
-func (fs *fileStore) msgForSeq(seq uint64) (*fileStoredMsg, error) { +func (fs *fileStore) msgForSeq(seq uint64, sm *StoreMsg) (*StoreMsg, error) { // TODO(dlc) - Since Store, Remove, Skip all hold the write lock on fs this will // be stalled. Need another lock if want to happen in parallel. fs.mu.RLock() @@ -3466,7 +3682,7 @@ func (fs *fileStore) msgForSeq(seq uint64) (*fileStoredMsg, error) { return nil, err } - fsm, expireOk, err := mb.fetchMsg(seq) + fsm, expireOk, err := mb.fetchMsg(seq, sm) if err != nil { return nil, err } @@ -3481,9 +3697,10 @@ func (fs *fileStore) msgForSeq(seq uint64) (*fileStoredMsg, error) { } // Internal function to return msg parts from a raw buffer. -func msgFromBuf(buf []byte, hh hash.Hash64) (string, []byte, []byte, uint64, int64, error) { +// Lock should be held. +func (mb *msgBlock) msgFromBuf(buf []byte, sm *StoreMsg, hh hash.Hash64) (*StoreMsg, error) { if len(buf) < emptyRecordLen { - return _EMPTY_, nil, nil, 0, 0, errBadMsg + return nil, errBadMsg } var le = binary.LittleEndian @@ -3495,7 +3712,7 @@ func msgFromBuf(buf []byte, hh hash.Hash64) (string, []byte, []byte, uint64, int slen := int(le.Uint16(hdr[20:])) // Simple sanity check. if dlen < 0 || slen > dlen || int(rl) > len(buf) { - return _EMPTY_, nil, nil, 0, 0, errBadMsg + return nil, errBadMsg } data := buf[msgHdrSize : msgHdrSize+dlen] // Do checksum tests here if requested. @@ -3509,7 +3726,7 @@ func msgFromBuf(buf []byte, hh hash.Hash64) (string, []byte, []byte, uint64, int hh.Write(data[slen : dlen-8]) } if !bytes.Equal(hh.Sum(nil), data[len(data)-8:]) { - return _EMPTY_, nil, nil, 0, 0, errBadMsg + return nil, errBadMsg } } seq := le.Uint64(hdr[4:]) @@ -3517,55 +3734,93 @@ func msgFromBuf(buf []byte, hh hash.Hash64) (string, []byte, []byte, uint64, int seq = 0 } ts := int64(le.Uint64(hdr[12:])) - // FIXME(dlc) - We need to not allow appends to the underlying buffer, so we will - // fix the capacity. This will cause a copy though in stream:internalSendLoop when - // we append CRLF but this was causing a race. Need to rethink more to avoid this copy. + + // Create a StoreMsg if needed. + if sm == nil { + sm = new(StoreMsg) + } else { + sm.clear() + } + // To recycle the large blocks we can never pass back a reference, so need to copy for the upper + // layers and for us to be safe to expire, and recycle, the large msgBlocks. end := dlen - 8 - var mhdr, msg []byte + if hasHeaders { hl := le.Uint32(data[slen:]) bi := slen + 4 li := bi + int(hl) - mhdr = data[bi:li:li] - msg = data[li:end:end] + sm.buf = append(sm.buf, data[bi:end]...) + li, end = li-bi, end-bi + sm.hdr = sm.buf[0:li:li] + sm.msg = sm.buf[li:end] } else { - msg = data[slen:end:end] + sm.buf = append(sm.buf, data[slen:end]...) + sm.msg = sm.buf[0 : end-slen] + } + sm.seq, sm.ts = seq, ts + // Treat subject a bit different to not reference underlying buf. + if slen > 0 { + sm.subj = mb.subjString(data[:slen]) } - return string(data[:slen]), mhdr, msg, seq, ts, nil + + return sm, nil } -// LoadMsg will lookup the message by sequence number and return it if found. -func (fs *fileStore) LoadMsg(seq uint64) (string, []byte, []byte, int64, error) { - sm, err := fs.msgForSeq(seq) - if sm != nil { - return sm.subj, sm.hdr, sm.msg, sm.ts, nil +// Given the `key` byte slice, this function will return the subject +// as a copy of `key` or a configured subject as to minimize memory allocations. +// Lock should be held. 
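msgFromBuf above copies each record into a caller-supplied StoreMsg so the large block buffers can be expired and recycled without the returned message aliasing them. A reduced sketch of that copy-into-scratch pattern; the field names mirror the struct used above, but the helper itself is hypothetical.

package sketch

// StoreMsg is the reusable scratch value the lookup APIs now accept.
type StoreMsg struct {
    subj     string
    hdr, msg []byte
    buf      []byte // single backing allocation reused across lookups
    seq      uint64
    ts       int64
}

func (sm *StoreMsg) clear() {
    *sm = StoreMsg{buf: sm.buf[:0]} // keep capacity, drop everything else
}

// fillMsg copies the record into sm.buf and re-slices hdr/msg from that copy,
// so the caller never retains a reference into the recyclable source buffer.
func fillMsg(sm *StoreMsg, subj string, hdr, msg []byte, seq uint64, ts int64) *StoreMsg {
    if sm == nil {
        sm = new(StoreMsg)
    } else {
        sm.clear()
    }
    sm.buf = append(sm.buf, hdr...)
    sm.buf = append(sm.buf, msg...)
    sm.hdr = sm.buf[:len(hdr):len(hdr)]
    sm.msg = sm.buf[len(hdr):]
    sm.subj, sm.seq, sm.ts = subj, seq, ts
    return sm
}

Hot paths can then declare `var smv StoreMsg` once and pass &smv to every lookup, which is exactly what the patched call sites do.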
+func (mb *msgBlock) subjString(key []byte) string { + if len(key) == 0 { + return _EMPTY_ } - return _EMPTY_, nil, nil, 0, err + + if lsubjs := len(mb.fs.cfg.Subjects); lsubjs > 0 { + if lsubjs == 1 { + // The cast for the comparison does not make a copy + if string(key) == mb.fs.cfg.Subjects[0] { + return mb.fs.cfg.Subjects[0] + } + } else { + for _, subj := range mb.fs.cfg.Subjects { + if string(key) == subj { + return subj + } + } + } + } + // Copy here to not reference underlying buffer. + var sb strings.Builder + sb.Write(key) + return sb.String() +} + +// LoadMsg will lookup the message by sequence number and return it if found. +func (fs *fileStore) LoadMsg(seq uint64, sm *StoreMsg) (*StoreMsg, error) { + return fs.msgForSeq(seq, sm) } // LoadLastMsg will return the last message we have that matches a given subject. // The subject can be a wildcard. -func (fs *fileStore) LoadLastMsg(subject string) (subj string, seq uint64, hdr, msg []byte, ts int64, err error) { - var sm *fileStoredMsg +func (fs *fileStore) LoadLastMsg(subject string, sm *StoreMsg) (*StoreMsg, error) { if subject == _EMPTY_ || subject == fwcs { - sm, _ = fs.msgForSeq(fs.lastSeq()) + sm, _ = fs.msgForSeq(fs.lastSeq(), sm) } else if ss := fs.FilteredState(1, subject); ss.Msgs > 0 { - sm, _ = fs.msgForSeq(ss.Last) + sm, _ = fs.msgForSeq(ss.Last, sm) + } else { + sm = nil } if sm == nil { - return _EMPTY_, 0, nil, nil, 0, ErrStoreMsgNotFound + return nil, ErrStoreMsgNotFound } - return sm.subj, sm.seq, sm.hdr, sm.msg, sm.ts, nil + return sm, nil } -// LoadNextMsg will find the next message matching the filter subject starting at the start sequence. -// The filter subject can be a wildcard. -func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64) (subj string, seq uint64, hdr, msg []byte, ts int64, err error) { +func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, uint64, error) { fs.mu.RLock() defer fs.mu.RUnlock() if fs.closed { - return _EMPTY_, 0, nil, nil, 0, ErrStoreClosed + return nil, 0, ErrStoreClosed } if start < fs.state.FirstSeq { start = fs.state.FirstSeq @@ -3576,17 +3831,17 @@ func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64) (subj str if start > atomic.LoadUint64(&mb.last.seq) { continue } - if sm, expireOk, err := mb.firstMatching(filter, wc, start); err == nil { + if sm, expireOk, err := mb.firstMatching(filter, wc, start, sm); err == nil { if expireOk && mb != fs.lmb { mb.tryForceExpireCache() } - return sm.subj, sm.seq, sm.hdr, sm.msg, sm.ts, nil + return sm, sm.seq, nil } else if err != ErrStoreMsgNotFound { - return _EMPTY_, 0, nil, nil, 0, err + return nil, 0, err } } - return _EMPTY_, fs.state.LastSeq, nil, nil, 0, ErrStoreEOF + return nil, fs.state.LastSeq, ErrStoreEOF } // Type returns the type of the underlying store. @@ -3812,10 +4067,16 @@ func (mb *msgBlock) readIndexInfo() error { // Check if this is a short write index file. if bi < 0 || bi+checksumSize > len(buf) { - defer os.Remove(mb.ifn) + os.Remove(mb.ifn) return fmt.Errorf("short index file") } + // Check for consistency if accounting. If something is off bail and we will rebuild. 
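subjString above avoids an allocation per message by returning the already-configured subject string when the raw bytes match it, and only copies otherwise. The same interning idea reduced to a free function; the configured slice stands in for mb.fs.cfg.Subjects.

package sketch

import "strings"

// internSubject returns a shared string when key equals one of the configured
// subjects (the string(key) == s comparison does not allocate), and otherwise
// returns an owned copy so callers never retain the source buffer.
func internSubject(key []byte, configured []string) string {
    if len(key) == 0 {
        return ""
    }
    for _, s := range configured {
        if string(key) == s {
            return s
        }
    }
    var sb strings.Builder
    sb.Write(key)
    return sb.String()
}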
+ if mb.msgs != (mb.last.seq-mb.first.seq+1)-dmapLen { + os.Remove(mb.ifn) + return fmt.Errorf("accounting inconsistent") + } + // Checksum copy(mb.lchk[0:], buf[bi:bi+checksumSize]) bi += checksumSize @@ -3923,6 +4184,10 @@ func compareFn(subject string) func(string, string) bool { // PurgeEx will remove messages based on subject filters, sequence and number of messages to keep. // Will return the number of purged messages. func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint64, err error) { + if sequence > 1 && keep > 0 { + return 0, ErrPurgeArgMismatch + } + if subject == _EMPTY_ || subject == fwcs { if keep == 0 && (sequence == 0 || sequence == 1) { return fs.Purge() @@ -3954,6 +4219,8 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint maxp = ss.Msgs - keep } + var smv StoreMsg + fs.mu.Lock() for _, mb := range fs.blks { mb.mu.Lock() @@ -3968,22 +4235,25 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint mb.loadMsgsWithLock() shouldExpire = true } - if sequence > 0 && sequence <= l { + if sequence > 1 && sequence <= l { l = sequence - 1 } for seq := f; seq <= l; seq++ { - if sm, _ := mb.cacheLookup(seq); sm != nil && eq(sm.subj, subject) { + if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && eq(sm.subj, subject) { rl := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) // Do fast in place remove. // Stats - fs.state.Msgs-- - fs.state.Bytes -= rl - mb.msgs-- - mb.bytes -= rl + if mb.msgs > 0 { + fs.state.Msgs-- + fs.state.Bytes -= rl + mb.msgs-- + mb.bytes -= rl + purged++ + } // FSS updates. fs.removePerSubject(sm.subj) - mb.removeSeqPerSubject(sm.subj, seq) + mb.removeSeqPerSubject(sm.subj, seq, &smv) // Check for first message. if seq == mb.first.seq { mb.selectNextFirst() @@ -4001,7 +4271,7 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint } mb.dmap[seq] = struct{}{} } - purged++ + if maxp > 0 && purged >= maxp { break } @@ -4085,6 +4355,7 @@ func (fs *fileStore) purge(fseq uint64) (uint64, error) { } fs.lmb.first.seq = fs.state.FirstSeq fs.lmb.last.seq = fs.state.LastSeq + fs.lmb.last.ts = fs.state.LastTime.UnixNano() fs.lmb.writeIndexInfo() @@ -4137,9 +4408,19 @@ func (fs *fileStore) Compact(seq uint64) (uint64, error) { deleted++ } + var smv StoreMsg + var err error + var isEmpty bool + smb.mu.Lock() + // Since we loaded before we acquired our lock, double check here under lock that we have the messages loaded. + if smb.cacheNotLoaded() { + if err = smb.loadMsgsWithLock(); err != nil { + goto SKIP + } + } for mseq := smb.first.seq; mseq < seq; mseq++ { - sm, err := smb.cacheLookup(mseq) + sm, err := smb.cacheLookup(mseq, &smv) if err == errDeletedMsg { // Update dmap. if len(smb.dmap) > 0 { @@ -4150,18 +4431,20 @@ func (fs *fileStore) Compact(seq uint64) (uint64, error) { } } else if sm != nil { sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) - smb.bytes -= sz - bytes += sz - smb.msgs-- - purged++ + if smb.msgs > 0 { + smb.bytes -= sz + bytes += sz + smb.msgs-- + purged++ + } // Update fss fs.removePerSubject(sm.subj) - smb.removeSeqPerSubject(sm.subj, mseq) + smb.removeSeqPerSubject(sm.subj, mseq, &smv) } } // Check if empty after processing, could happen if tail of messages are all deleted. - isEmpty := smb.msgs == 0 + isEmpty = smb.msgs == 0 if isEmpty { smb.dirtyCloseWithRemove(true) // Update fs first here as well. 
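readIndexInfo above now rejects an index whose message count disagrees with the sequence span minus the deleted entries, forcing a rebuild instead of trusting stale accounting. The invariant as a small helper with illustrative names.

package sketch

// indexConsistent reports whether a block's recorded message count matches
// what its first/last sequences and delete-map size imply:
//
//     msgs == (last - first + 1) - deleted
//
// A mismatch means the index file is stale or corrupt and should be rebuilt.
func indexConsistent(msgs, first, last uint64, deleted int) bool {
    if last < first {
        return msgs == 0 && deleted == 0
    }
    return msgs == (last-first+1)-uint64(deleted)
}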
@@ -4174,7 +4457,41 @@ func (fs *fileStore) Compact(seq uint64) (uint64, error) { smb.selectNextFirst() fs.state.FirstSeq = smb.first.seq fs.state.FirstTime = time.Unix(0, smb.first.ts).UTC() + + // Check if we should reclaim the head space from this block. + // This will be optimistic only, so don't continue if we encounter any errors here. + if smb.bytes*2 < smb.rbytes { + var moff uint32 + moff, _, _, err = smb.slotInfo(int(smb.first.seq - smb.cache.fseq)) + if err != nil || moff >= uint32(len(smb.cache.buf)) { + goto SKIP + } + buf := smb.cache.buf[moff:] + // Don't reuse, copy to new recycled buf. + nbuf := getMsgBlockBuf(len(buf)) + nbuf = append(nbuf, buf...) + smb.closeFDsLockedNoCheck() + // Check for encryption. + if smb.bek != nil && len(nbuf) > 0 { + // Recreate to reset counter. + rbek, err := chacha20.NewUnauthenticatedCipher(smb.seed, smb.nonce) + if err != nil { + goto SKIP + } + cbuf := make([]byte, len(nbuf)) + rbek.XORKeyStream(cbuf, nbuf) + if err = ioutil.WriteFile(smb.mfn, cbuf, defaultFilePerms); err != nil { + goto SKIP + } + } else if err = ioutil.WriteFile(smb.mfn, nbuf, defaultFilePerms); err != nil { + goto SKIP + } + smb.clearCacheAndOffset() + smb.rbytes = uint64(len(nbuf)) + } } + +SKIP: smb.mu.Unlock() if !isEmpty { @@ -4198,7 +4515,7 @@ func (fs *fileStore) Compact(seq uint64) (uint64, error) { cb(-int64(purged), -int64(bytes), 0, _EMPTY_) } - return purged, nil + return purged, err } // Truncate will truncate a stream store up to and including seq. Sequence needs to be valid. @@ -4219,7 +4536,7 @@ func (fs *fileStore) Truncate(seq uint64) error { fs.mu.Unlock() return ErrInvalidSequence } - lsm, _, _ := nlmb.fetchMsg(seq) + lsm, _, _ := nlmb.fetchMsg(seq, nil) if lsm == nil { fs.mu.Unlock() return ErrInvalidSequence @@ -4323,6 +4640,27 @@ func (fs *fileStore) removeMsgBlock(mb *msgBlock) { } } +// When we have an empty block but want to keep the index for timestamp info etc. +// Lock should be held. +func (mb *msgBlock) closeAndKeepIndex() { + // We will leave a 0 length blk marker. + if mb.mfd != nil { + mb.mfd.Truncate(0) + } else { + // We were closed, so just write out an empty file. + ioutil.WriteFile(mb.mfn, nil, defaultFilePerms) + } + // Close + mb.dirtyCloseWithRemove(false) + // Make sure to write the index file so we can remember last seq and ts. + mb.writeIndexInfoLocked() + + // Clear any fss. + if mb.sfn != _EMPTY_ { + os.Remove(mb.sfn) + } +} + // Called by purge to simply get rid of the cache and close our fds. // Lock should not be held. func (mb *msgBlock) dirtyClose() { @@ -4372,7 +4710,7 @@ func (mb *msgBlock) dirtyCloseWithRemove(remove bool) { // Remove a seq from the fss and select new first. // Lock should be held. -func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) { +func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64, smp *StoreMsg) { ss := mb.fss[subj] if ss == nil { return @@ -4395,8 +4733,12 @@ func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) { } // TODO(dlc) - Might want to optimize this. + var smv StoreMsg + if smp == nil { + smp = &smv + } for tseq := seq + 1; tseq <= ss.Last; tseq++ { - if sm, _ := mb.cacheLookup(tseq); sm != nil { + if sm, _ := mb.cacheLookup(tseq, smp); sm != nil { if sm.subj == subj { ss.First = tseq return @@ -4410,6 +4752,15 @@ func (mb *msgBlock) generatePerSubjectInfo() error { mb.mu.Lock() defer mb.mu.Unlock() + if mb.fss == nil { + mb.fss = make(map[string]*SimpleState) + } + + // Check if this mb is empty. 
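The head-space reclaim added to Compact() recreates the ChaCha20 cipher before rewriting the block so the keystream counter starts from zero for the new file contents. A hedged sketch of that re-encrypt-and-write step using golang.org/x/crypto/chacha20, the same package the hunk relies on; path and permissions are illustrative, and the key/nonce must satisfy that package (32-byte key, 12- or 24-byte nonce).

package sketch

import (
    "os"

    "golang.org/x/crypto/chacha20"
)

// reencryptAndWrite builds a fresh unauthenticated ChaCha20 cipher (resetting
// its counter), XORs the rewritten block, and persists the ciphertext.
func reencryptAndWrite(path string, plain, key, nonce []byte) error {
    c, err := chacha20.NewUnauthenticatedCipher(key, nonce)
    if err != nil {
        return err
    }
    ct := make([]byte, len(plain))
    c.XORKeyStream(ct, plain)
    return os.WriteFile(path, ct, 0o640)
}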
This can happen when its the last one and we are holding onto it for seq and timestamp info. + if mb.msgs == 0 { + return nil + } + var shouldExpire bool if mb.cacheNotLoaded() { if err := mb.loadMsgsWithLock(); err != nil { @@ -4417,13 +4768,11 @@ func (mb *msgBlock) generatePerSubjectInfo() error { } shouldExpire = true } - if mb.fss == nil { - mb.fss = make(map[string]*SimpleState) - } + var smv StoreMsg fseq, lseq := mb.first.seq, mb.last.seq for seq := fseq; seq <= lseq; seq++ { - sm, err := mb.cacheLookup(seq) + sm, err := mb.cacheLookup(seq, &smv) if err != nil { // Since we are walking by sequence we can ignore some errors that are benign to rebuilding our state. if err == ErrStoreMsgNotFound || err == errDeletedMsg { @@ -4479,8 +4828,6 @@ func (mb *msgBlock) readPerSubjectInfo() error { return mb.generatePerSubjectInfo() } - fss := make(map[string]*SimpleState) - bi := hdrLen readU64 := func() uint64 { if bi < 0 { @@ -4495,14 +4842,18 @@ func (mb *msgBlock) readPerSubjectInfo() error { return num } - for i, numEntries := uint64(0), readU64(); i < numEntries; i++ { + numEntries := readU64() + fss := make(map[string]*SimpleState, numEntries) + + mb.mu.Lock() + for i := uint64(0); i < numEntries; i++ { lsubj := readU64() - subj := buf[bi : bi+int(lsubj)] + // Make a copy or use a configured subject (to avoid mem allocation) + subj := mb.subjString(buf[bi : bi+int(lsubj)]) bi += int(lsubj) msgs, first, last := readU64(), readU64(), readU64() - fss[string(subj)] = &SimpleState{Msgs: msgs, First: first, Last: last} + fss[subj] = &SimpleState{Msgs: msgs, First: first, Last: last} } - mb.mu.Lock() mb.fss = fss mb.mu.Unlock() return nil @@ -4588,6 +4939,8 @@ func (fs *fileStore) closeAllMsgBlocks(sync bool) { func (fs *fileStore) Delete() error { if fs.isClosed() { + // Always attempt to remove since we could have been closed beforehand. + os.RemoveAll(fs.fcfg.StoreDir) return ErrStoreClosed } fs.Purge() @@ -4640,7 +4993,7 @@ func (fs *fileStore) Stop() error { fs.cancelSyncTimer() fs.cancelAgeChk() - var _cfs [256]*consumerFileStore + var _cfs [256]ConsumerStore cfs := append(_cfs[:0], fs.cfs...) fs.cfs = nil fs.mu.Unlock() @@ -4794,7 +5147,11 @@ func (fs *fileStore) streamSnapshot(w io.WriteCloser, state *StreamState, includ cfs := fs.cfs fs.mu.Unlock() - for _, o := range cfs { + for _, cs := range cfs { + o, ok := cs.(*consumerFileStore) + if !ok { + continue + } o.mu.Lock() // Grab our general meta data. // We do this now instead of pulling from files since they could be encrypted. @@ -4908,11 +5265,20 @@ func (fs *fileStore) ConsumerStore(name string, cfg *ConsumerConfig) (ConsumerSt if cfg == nil || name == _EMPTY_ { return nil, fmt.Errorf("bad consumer config") } + + // We now allow overrides from a stream being a filestore type and forcing a consumer to be memory store. + if cfg.MemoryStorage { + // Create directly here. 
+ o := &consumerMemStore{ms: fs, cfg: *cfg} + fs.AddConsumer(o) + return o, nil + } + odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, name) if err := os.MkdirAll(odir, defaultDirPerms); err != nil { return nil, fmt.Errorf("could not create consumer directory - %v", err) } - csi := &FileConsumerInfo{ConsumerConfig: *cfg} + csi := &FileConsumerInfo{Name: name, Created: time.Now().UTC(), ConsumerConfig: *cfg} o := &consumerFileStore{ fs: fs, cfg: csi, @@ -4984,9 +5350,7 @@ func (fs *fileStore) ConsumerStore(name string, cfg *ConsumerConfig) (ConsumerSt o.qch = make(chan struct{}) go o.flushLoop() - fs.mu.Lock() - fs.cfs = append(fs.cfs, o) - fs.mu.Unlock() + fs.AddConsumer(o) return o, nil } @@ -5082,6 +5446,26 @@ func (o *consumerFileStore) flushLoop() { } } +// SetStarting sets our starting stream sequence. +func (o *consumerFileStore) SetStarting(sseq uint64) error { + o.mu.Lock() + o.state.Delivered.Stream = sseq + buf, err := o.encodeState() + o.mu.Unlock() + if err != nil { + return err + } + return o.writeState(buf) +} + +// HasState returns if this store has a recorded state. +func (o *consumerFileStore) HasState() bool { + o.mu.Lock() + _, err := os.Stat(o.ifn) + o.mu.Unlock() + return err == nil +} + // UpdateDelivered is called whenever a new message has been delivered. func (o *consumerFileStore) UpdateDelivered(dseq, sseq, dc uint64, ts int64) error { o.mu.Lock() @@ -5212,73 +5596,22 @@ const seqsHdrSize = 6*binary.MaxVarintLen64 + hdrLen // Encode our consumer state, version 2. // Lock should be held. -func (o *consumerFileStore) encodeState() ([]byte, error) { + +func (o *consumerFileStore) EncodedState() ([]byte, error) { + o.mu.Lock() + defer o.mu.Unlock() + if o.closed { return nil, ErrStoreClosed } return encodeConsumerState(&o.state), nil } -func encodeConsumerState(state *ConsumerState) []byte { - var hdr [seqsHdrSize]byte - var buf []byte - - maxSize := seqsHdrSize - if lp := len(state.Pending); lp > 0 { - maxSize += lp*(3*binary.MaxVarintLen64) + binary.MaxVarintLen64 - } - if lr := len(state.Redelivered); lr > 0 { - maxSize += lr*(2*binary.MaxVarintLen64) + binary.MaxVarintLen64 - } - if maxSize == seqsHdrSize { - buf = hdr[:seqsHdrSize] - } else { - buf = make([]byte, maxSize) - } - - // Write header - buf[0] = magic - buf[1] = 2 - - n := hdrLen - n += binary.PutUvarint(buf[n:], state.AckFloor.Consumer) - n += binary.PutUvarint(buf[n:], state.AckFloor.Stream) - n += binary.PutUvarint(buf[n:], state.Delivered.Consumer) - n += binary.PutUvarint(buf[n:], state.Delivered.Stream) - n += binary.PutUvarint(buf[n:], uint64(len(state.Pending))) - - asflr := state.AckFloor.Stream - adflr := state.AckFloor.Consumer - - // These are optional, but always write len. This is to avoid a truncate inline. - if len(state.Pending) > 0 { - // To save space we will use now rounded to seconds to be base timestamp. - mints := time.Now().Round(time.Second).Unix() - // Write minimum timestamp we found from above. - n += binary.PutVarint(buf[n:], mints) - - for k, v := range state.Pending { - n += binary.PutUvarint(buf[n:], k-asflr) - n += binary.PutUvarint(buf[n:], v.Sequence-adflr) - // Downsample to seconds to save on space. - // Subsecond resolution not needed for recovery etc. - ts := v.Timestamp / 1_000_000_000 - n += binary.PutVarint(buf[n:], mints-ts) - } - } - - // We always write the redelivered len. - n += binary.PutUvarint(buf[n:], uint64(len(state.Redelivered))) - - // We expect these to be small. 
- if len(state.Redelivered) > 0 { - for k, v := range state.Redelivered { - n += binary.PutUvarint(buf[n:], k-asflr) - n += binary.PutUvarint(buf[n:], v) - } +func (o *consumerFileStore) encodeState() ([]byte, error) { + if o.closed { + return nil, ErrStoreClosed } - - return buf[:n] + return encodeConsumerState(&o.state), nil } func (o *consumerFileStore) UpdateConfig(cfg *ConsumerConfig) error { @@ -5508,10 +5841,14 @@ func (o *consumerFileStore) State() (*ConsumerState, error) { o.mu.Lock() defer o.mu.Unlock() + if o.closed { + return nil, ErrStoreClosed + } + state := &ConsumerState{} // See if we have a running state or if we need to read in from disk. - if o.state.Delivered.Consumer != 0 { + if o.state.Delivered.Consumer != 0 || o.state.Delivered.Stream != 0 { state.Delivered = o.state.Delivered state.AckFloor = o.state.AckFloor if len(o.state.Pending) > 0 { @@ -5699,7 +6036,7 @@ func (o *consumerFileStore) Stop() error { ifn, fs := o.ifn, o.fs o.mu.Unlock() - fs.removeConsumer(o) + fs.RemoveConsumer(o) if len(buf) > 0 { o.waitOnFlusher() @@ -5759,21 +6096,29 @@ func (o *consumerFileStore) delete(streamDeleted bool) error { } if !streamDeleted { - fs.removeConsumer(o) + fs.RemoveConsumer(o) } return err } -func (fs *fileStore) removeConsumer(cfs *consumerFileStore) { +func (fs *fileStore) AddConsumer(o ConsumerStore) error { fs.mu.Lock() defer fs.mu.Unlock() - for i, o := range fs.cfs { + fs.cfs = append(fs.cfs, o) + return nil +} + +func (fs *fileStore) RemoveConsumer(o ConsumerStore) error { + fs.mu.Lock() + defer fs.mu.Unlock() + for i, cfs := range fs.cfs { if o == cfs { fs.cfs = append(fs.cfs[:i], fs.cfs[i+1:]...) break } } + return nil } //////////////////////////////////////////////////////////////////////////////// diff --git a/vendor/github.com/nats-io/nats-server/v2/server/gateway.go b/vendor/github.com/nats-io/nats-server/v2/server/gateway.go index ce5b7e6d..8e4939d8 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/gateway.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/gateway.go @@ -1013,6 +1013,13 @@ func (c *client) processGatewayInfo(info *Info) { return } + // Check for duplicate server name with servers in our cluster + if s.isDuplicateServerName(info.Name) { + c.Errorf("Remote server has a duplicate name: %q", info.Name) + c.closeConnection(DuplicateServerName) + return + } + // Possibly add URLs that we get from the INFO protocol. if len(info.GatewayURLs) > 0 { cfg.updateURLs(info.GatewayURLs) @@ -1084,6 +1091,13 @@ func (c *client) processGatewayInfo(info *Info) { } else if isFirstINFO { // This is the first INFO of an inbound connection... + // Check for duplicate server name with servers in our cluster + if s.isDuplicateServerName(info.Name) { + c.Errorf("Remote server has a duplicate name: %q", info.Name) + c.closeConnection(DuplicateServerName) + return + } + s.registerInboundGatewayConnection(cid, c) c.Noticef("Inbound gateway connection from %q (%s) registered", info.Gateway, info.ID) @@ -2810,7 +2824,7 @@ func (c *client) handleGatewayReply(msg []byte) (processed bool) { // If route is nil, we will process the incoming message locally. 
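The consumer-state encoder removed above (it moves elsewhere in this patch) stores pending sequences as varint deltas from the ack floors and downsamples timestamps to whole seconds against a base captured at encode time. A compact, assumption-labeled sketch of that delta-plus-varint idea for a single pending entry.

package sketch

import (
    "encoding/binary"
    "time"
)

// encodePending appends one pending entry as varints: stream and consumer
// sequences relative to their ack floors, and the timestamp in whole seconds
// relative to the base. Deltas stay small, so the varints stay short.
func encodePending(buf []byte, sseq, dseq, sfloor, dfloor uint64, ts int64, base time.Time) []byte {
    var tmp [3 * binary.MaxVarintLen64]byte
    n := binary.PutUvarint(tmp[0:], sseq-sfloor)
    n += binary.PutUvarint(tmp[n:], dseq-dfloor)
    n += binary.PutVarint(tmp[n:], base.Unix()-ts/1_000_000_000) // seconds vs. base, as above
    return append(buf, tmp[:n]...)
}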
if route == nil { // Check if this is a service reply subject (_R_) - isServiceReply := len(acc.imports.services) > 0 && isServiceReply(c.pa.subject) + isServiceReply := isServiceReply(c.pa.subject) var queues [][]byte if len(r.psubs)+len(r.qsubs) > 0 { diff --git a/vendor/github.com/nats-io/nats-server/v2/server/ipqueue.go b/vendor/github.com/nats-io/nats-server/v2/server/ipqueue.go index 2e514513..4f288e82 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/ipqueue.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/ipqueue.go @@ -15,35 +15,26 @@ package server import ( "sync" + "sync/atomic" ) const ipQueueDefaultMaxRecycleSize = 4 * 1024 -const ipQueueDefaultWarnThreshold = 32 * 1024 - -type ipQueueLogger interface { - // The ipQueue will invoke this function with the queue's name and the number - // of pending elements. This call CANNOT block. It is ok to drop the logging - // if desired, but not block. - log(name string, pending int) -} // This is a generic intra-process queue. type ipQueue struct { + inprogress int64 sync.RWMutex - ch chan struct{} - elts []interface{} - pos int - pool *sync.Pool - mrs int - name string - logger ipQueueLogger - lt int + ch chan struct{} + elts []interface{} + pos int + pool *sync.Pool + mrs int + name string + m *sync.Map } type ipQueueOpts struct { maxRecycleSize int - name string - logger ipQueueLogger } type ipQueueOpt func(*ipQueueOpts) @@ -56,27 +47,19 @@ func ipQueue_MaxRecycleSize(max int) ipQueueOpt { } } -// This option provides the logger to be used by this queue to log -// when the number of pending elements reaches a certain threshold. -func ipQueue_Logger(name string, l ipQueueLogger) ipQueueOpt { - return func(o *ipQueueOpts) { - o.name, o.logger = name, l - } -} - -func newIPQueue(opts ...ipQueueOpt) *ipQueue { +func (s *Server) newIPQueue(name string, opts ...ipQueueOpt) *ipQueue { qo := ipQueueOpts{maxRecycleSize: ipQueueDefaultMaxRecycleSize} for _, o := range opts { o(&qo) } q := &ipQueue{ - ch: make(chan struct{}, 1), - mrs: qo.maxRecycleSize, - pool: &sync.Pool{}, - name: qo.name, - logger: qo.logger, - lt: ipQueueDefaultWarnThreshold, + ch: make(chan struct{}, 1), + mrs: qo.maxRecycleSize, + pool: &sync.Pool{}, + name: name, + m: &s.ipQueues, } + s.ipQueues.Store(name, q) return q } @@ -101,9 +84,6 @@ func (q *ipQueue) push(e interface{}) int { } q.elts = append(q.elts, e) l++ - if l >= q.lt && q.logger != nil && (l <= q.lt+10 || q.lt%10000 == 0) { - q.logger.log(q.name, l) - } q.Unlock() if signal { select { @@ -132,6 +112,7 @@ func (q *ipQueue) pop() []interface{} { elts = q.elts[q.pos:] } q.elts, q.pos = nil, 0 + atomic.AddInt64(&q.inprogress, int64(len(elts))) q.Unlock() return elts } @@ -174,13 +155,24 @@ func (q *ipQueue) popOne() interface{} { // After a pop(), the slice can be recycled for the next push() when // a first element is added to the queue. +// This will also decrement the "in progress" count with the length +// of the slice. // Reason we use pointer to slice instead of slice is explained // here: https://staticcheck.io/docs/checks#SA6002 func (q *ipQueue) recycle(elts *[]interface{}) { - // If invoked with an nil list, don't recyle. + // If invoked with a nil list, nothing to do. + if elts == nil || *elts == nil { + return + } + // Update the in progress count. + if len(*elts) > 0 { + if atomic.AddInt64(&q.inprogress, int64(-(len(*elts)))) < 0 { + atomic.StoreInt64(&q.inprogress, 0) + } + } // We also don't want to recycle huge slices, so check against the max. 
// q.mrs is normally immutable but can be changed, in a safe way, in some tests. - if elts == nil || *elts == nil || cap(*elts) > q.mrs { + if cap(*elts) > q.mrs { return } q.resetAndReturnToPool(elts) @@ -199,6 +191,9 @@ func (q *ipQueue) len() int { // notified that there is something in the queue (reading from queue's `ch`) // may then get nothing if `drain()` is invoked before the `pop()` or `popOne()`. func (q *ipQueue) drain() { + if q == nil { + return + } q.Lock() if q.elts != nil { q.resetAndReturnToPool(&q.elts) @@ -212,3 +207,21 @@ func (q *ipQueue) drain() { } q.Unlock() } + +// Since the length of the queue goes to 0 after a pop(), it is good to +// have an insight on how many elements are yet to be processed after a pop(). +// For that reason, the queue maintains a count of elements returned through +// the pop() API. When the caller will call q.recycle(), this count will +// be reduced by the size of the slice returned by pop(). +func (q *ipQueue) inProgress() int64 { + return atomic.LoadInt64(&q.inprogress) +} + +// Remove this queue from the server's map of ipQueues. +// All ipQueue operations (such as push/pop/etc..) are still possible. +func (q *ipQueue) unregister() { + if q == nil { + return + } + q.m.Delete(q.name) +} diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go index 6b13ac7a..e9f10ce2 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go @@ -59,24 +59,32 @@ type JetStreamStats struct { } type JetStreamAccountLimits struct { - MaxMemory int64 `json:"max_memory"` - MaxStore int64 `json:"max_storage"` - MaxStreams int `json:"max_streams"` - MaxConsumers int `json:"max_consumers"` - MaxBytesRequired bool `json:"max_bytes_required"` + MaxMemory int64 `json:"max_memory"` + MaxStore int64 `json:"max_storage"` + MaxStreams int `json:"max_streams"` + MaxConsumers int `json:"max_consumers"` + MaxAckPending int `json:"max_ack_pending"` + MemoryMaxStreamBytes int64 `json:"memory_max_stream_bytes"` + StoreMaxStreamBytes int64 `json:"storage_max_stream_bytes"` + MaxBytesRequired bool `json:"max_bytes_required"` } -// JetStreamAccountStats returns current statistics about the account's JetStream usage. -type JetStreamAccountStats struct { +type JetStreamTier struct { Memory uint64 `json:"memory"` Store uint64 `json:"storage"` Streams int `json:"streams"` Consumers int `json:"consumers"` - Domain string `json:"domain,omitempty"` - API JetStreamAPIStats `json:"api"` Limits JetStreamAccountLimits `json:"limits"` } +// JetStreamAccountStats returns current statistics about the account's JetStream usage. +type JetStreamAccountStats struct { + JetStreamTier // in case tiers are used, reflects totals with limits not set + Domain string `json:"domain,omitempty"` + API JetStreamAPIStats `json:"api"` + Tiers map[string]JetStreamTier `json:"tiers,omitempty"` // indexed by tier name +} + type JetStreamAPIStats struct { Total uint64 `json:"total"` Errors uint64 `json:"errors"` @@ -93,6 +101,7 @@ type jetStream struct { storeReserved int64 memUsed int64 storeUsed int64 + clustered int32 mu sync.RWMutex srv *Server config JetStreamConfig @@ -104,44 +113,53 @@ type jetStream struct { oos bool } +type remoteUsage struct { + tiers map[string]*jsaUsage // indexed by tier name + api uint64 + err uint64 +} + +type jsaStorage struct { + total jsaUsage + local jsaUsage +} + // This represents a jetstream enabled account. 
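The ipQueue changes above drop the warning logger in favor of an atomic "in progress" counter: pop() adds the batch size and recycle() subtracts it, so callers can observe work that was handed out but not yet finished. A minimal, simplified sketch of that counter around a channel-signaled queue (no pooling or pos optimization).

package sketch

import (
    "sync"
    "sync/atomic"
)

// queue is a stripped-down intra-process queue: push appends and signals ch,
// pop hands the whole batch to one consumer and counts it as in progress
// until the consumer recycles the slice.
type queue struct {
    mu         sync.Mutex
    elts       []interface{}
    ch         chan struct{}
    inprogress int64
}

func newQueue() *queue { return &queue{ch: make(chan struct{}, 1)} }

func (q *queue) push(e interface{}) {
    q.mu.Lock()
    signal := len(q.elts) == 0
    q.elts = append(q.elts, e)
    q.mu.Unlock()
    if signal {
        select {
        case q.ch <- struct{}{}:
        default:
        }
    }
}

func (q *queue) pop() []interface{} {
    q.mu.Lock()
    elts := q.elts
    q.elts = nil
    q.mu.Unlock()
    atomic.AddInt64(&q.inprogress, int64(len(elts)))
    return elts
}

// recycle marks a popped batch as processed, clamping at zero.
func (q *queue) recycle(elts []interface{}) {
    if n := int64(len(elts)); n > 0 && atomic.AddInt64(&q.inprogress, -n) < 0 {
        atomic.StoreInt64(&q.inprogress, 0)
    }
}

func (q *queue) inProgress() int64 { return atomic.LoadInt64(&q.inprogress) }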
// Worth noting that we include the jetstream pointer, this is because // in general we want to be very efficient when receiving messages on // an internal sub for a stream, so we will direct link to the stream // and walk backwards as needed vs multiple hash lookups and locks, etc. type jsAccount struct { - mu sync.RWMutex - js *jetStream - account *Account - limits JetStreamAccountLimits - memReserved int64 - storeReserved int64 - memTotal int64 - storeTotal int64 - apiTotal uint64 - apiErrors uint64 - usage jsaUsage - rusage map[string]*jsaUsage - storeDir string - streams map[string]*stream - templates map[string]*streamTemplate - store TemplateStore - - // Cluster support + mu sync.RWMutex + js *jetStream + account *Account + storeDir string + streams map[string]*stream + templates map[string]*streamTemplate + store TemplateStore + + // From server + sendq *ipQueue // of *pubMsg + + // Usage/limits related fields that will be protected by usageMu + usageMu sync.RWMutex + limits map[string]JetStreamAccountLimits // indexed by tierName + usage map[string]*jsaStorage // indexed by tierName + rusage map[string]*remoteUsage // indexed by node id + apiTotal uint64 + apiErrors uint64 + usageApi uint64 + usageErr uint64 updatesPub string updatesSub *subscription - // From server - sendq *ipQueue // of *pubMsg - lupdate time.Time - utimer *time.Timer + lupdate time.Time + utimer *time.Timer } // Track general usage for this account. type jsaUsage struct { mem int64 store int64 - api uint64 - err uint64 } // EnableJetStream will enable JetStream support on this server with the given configuration. @@ -553,8 +571,8 @@ func (s *Server) enableJetStreamAccounts() error { if s.globalAccountOnly() { gacc := s.GlobalAccount() gacc.mu.Lock() - if gacc.jsLimits == nil { - gacc.jsLimits = dynamicJSAccountLimits + if len(gacc.jsLimits) == 0 { + gacc.jsLimits = defaultJSAccountTiers } gacc.mu.Unlock() if err := s.configJetStream(gacc); err != nil { @@ -617,17 +635,20 @@ func (s *Server) configJetStream(acc *Account) error { if acc == nil { return nil } - if acc.jsLimits != nil { + acc.mu.RLock() + jsLimits := acc.jsLimits + acc.mu.RUnlock() + if jsLimits != nil { // Check if already enabled. This can be during a reload. if acc.JetStreamEnabled() { if err := acc.enableAllJetStreamServiceImportsAndMappings(); err != nil { return err } - if err := acc.UpdateJetStreamLimits(acc.jsLimits); err != nil { + if err := acc.UpdateJetStreamLimits(jsLimits); err != nil { return err } } else { - if err := acc.EnableJetStream(acc.jsLimits); err != nil { + if err := acc.EnableJetStream(jsLimits); err != nil { return err } if s.gateway.enabled { @@ -759,6 +780,10 @@ func (s *Server) migrateEphemerals() { var consumers []*consumerAssignment js.mu.Lock() + if cc.meta == nil { + js.mu.Unlock() + return + } ourID := cc.meta.ID() for _, asa := range cc.streams { for _, sa := range asa { @@ -797,7 +822,7 @@ func (s *Server) migrateEphemerals() { } o.mu.Unlock() } - state := o.readStoreState() + state, _ := o.store.State() o.deleteWithoutAdvisory() js.mu.Lock() // Delete old one. @@ -832,6 +857,13 @@ func (s *Server) shutdownJetStream() { s.Noticef("Initiating JetStream Shutdown...") defer s.Noticef("JetStream Shutdown") + // If we have folks blocked on sync requests, unblock. + // Send 1 is enough, but use select in case they were all present. 
+ select { + case s.syncOutSem <- struct{}{}: + default: + } + var _a [512]*Account accounts := _a[:0] @@ -917,7 +949,7 @@ func (s *Server) getJetStream() *jetStream { return js } -func (a *Account) assignJetStreamLimits(limits *JetStreamAccountLimits) { +func (a *Account) assignJetStreamLimits(limits map[string]JetStreamAccountLimits) { a.mu.Lock() a.jsLimits = limits a.mu.Unlock() @@ -925,7 +957,7 @@ func (a *Account) assignJetStreamLimits(limits *JetStreamAccountLimits) { // EnableJetStream will enable JetStream on this account with the defined limits. // This is a helper for JetStreamEnableAccount. -func (a *Account) EnableJetStream(limits *JetStreamAccountLimits) error { +func (a *Account) EnableJetStream(limits map[string]JetStreamAccountLimits) error { a.mu.RLock() s := a.srv a.mu.RUnlock() @@ -944,8 +976,8 @@ func (a *Account) EnableJetStream(limits *JetStreamAccountLimits) error { // No limits means we dynamically set up limits. // We also place limits here so we know that the account is configured for JetStream. - if limits == nil { - limits = dynamicJSAccountLimits + if len(limits) == 0 { + limits = defaultJSAccountTiers } a.assignJetStreamLimits(limits) @@ -967,20 +999,20 @@ func (a *Account) EnableJetStream(limits *JetStreamAccountLimits) error { return err } - jsa := &jsAccount{js: js, account: a, limits: *limits, streams: make(map[string]*stream), sendq: sendq} - jsa.utimer = time.AfterFunc(usageTick, jsa.sendClusterUsageUpdateTimer) - jsa.storeDir = filepath.Join(js.config.StoreDir, a.Name) - - js.accounts[a.Name] = jsa - js.mu.Unlock() - sysNode := s.Node() + jsa := &jsAccount{js: js, account: a, limits: limits, streams: make(map[string]*stream), sendq: sendq, usage: make(map[string]*jsaStorage)} + jsa.storeDir = filepath.Join(js.config.StoreDir, a.Name) + + jsa.usageMu.Lock() + jsa.utimer = time.AfterFunc(usageTick, jsa.sendClusterUsageUpdateTimer) // Cluster mode updates to resource usage, but we always will turn on. System internal prevents echos. - jsa.mu.Lock() jsa.updatesPub = fmt.Sprintf(jsaUpdatesPubT, a.Name, sysNode) jsa.updatesSub, _ = s.sysSubscribe(fmt.Sprintf(jsaUpdatesSubT, a.Name), jsa.remoteUpdateUsage) - jsa.mu.Unlock() + jsa.usageMu.Unlock() + + js.accounts[a.Name] = jsa + js.mu.Unlock() // Stamp inside account as well. a.mu.Lock() @@ -993,8 +1025,16 @@ func (a *Account) EnableJetStream(limits *JetStreamAccountLimits) error { } s.Debugf("Enabled JetStream for account %q", a.Name) - s.Debugf(" Max Memory: %s", friendlyBytes(limits.MaxMemory)) - s.Debugf(" Max Storage: %s", friendlyBytes(limits.MaxStore)) + if l, ok := limits[_EMPTY_]; ok { + s.Debugf(" Max Memory: %s", friendlyBytes(l.MaxMemory)) + s.Debugf(" Max Storage: %s", friendlyBytes(l.MaxStore)) + } else { + for t, l := range limits { + s.Debugf(" Tier: %s", t) + s.Debugf(" Max Memory: %s", friendlyBytes(l.MaxMemory)) + s.Debugf(" Max Storage: %s", friendlyBytes(l.MaxStore)) + } + } // Clean up any old snapshots that were orphaned while staging. 
os.RemoveAll(filepath.Join(js.config.StoreDir, snapStagingDir)) @@ -1084,11 +1124,11 @@ func (a *Account) EnableJetStream(limits *JetStreamAccountLimits) error { } buf, err := ioutil.ReadFile(metafile) if err != nil { - s.Warnf(" Error reading metafile %q: %v", metasum, err) + s.Warnf(" Error reading metafile %q: %v", metafile, err) continue } if _, err := os.Stat(metasum); os.IsNotExist(err) { - s.Warnf(" Missing stream checksum for %q", metasum) + s.Warnf(" Missing stream checksum file %q", metasum) continue } sum, err := ioutil.ReadFile(metasum) @@ -1099,7 +1139,7 @@ func (a *Account) EnableJetStream(limits *JetStreamAccountLimits) error { hh.Write(buf) checksum := hex.EncodeToString(hh.Sum(nil)) if checksum != string(sum) { - s.Warnf(" Stream metafile checksums do not match %q vs %q", sum, checksum) + s.Warnf(" Stream metafile %q: checksums do not match %q vs %q", metafile, sum, checksum) continue } @@ -1119,7 +1159,7 @@ func (a *Account) EnableJetStream(limits *JetStreamAccountLimits) error { var cfg FileStreamInfo if err := json.Unmarshal(buf, &cfg); err != nil { - s.Warnf(" Error unmarshalling stream metafile: %v", err) + s.Warnf(" Error unmarshalling stream metafile %q: %v", metafile, err) continue } @@ -1129,6 +1169,9 @@ func (a *Account) EnableJetStream(limits *JetStreamAccountLimits) error { } } + // We had a bug that set a default de dupe window on mirror, despite that being not a valid config + fixCfgMirrorWithDedupWindow(&cfg.StreamConfig) + // We had a bug that could allow subjects in that had prefix or suffix spaces. We check for that here // and will patch them on the fly for now. We will warn about them. var hadSubjErr bool @@ -1187,7 +1230,7 @@ func (a *Account) EnableJetStream(limits *JetStreamAccountLimits) error { } buf, err := ioutil.ReadFile(metafile) if err != nil { - s.Warnf(" Error reading consumer metafile %q: %v", metasum, err) + s.Warnf(" Error reading consumer metafile %q: %v", metafile, err) continue } if _, err := os.Stat(metasum); os.IsNotExist(err) { @@ -1207,7 +1250,7 @@ func (a *Account) EnableJetStream(limits *JetStreamAccountLimits) error { var cfg FileConsumerInfo if err := json.Unmarshal(buf, &cfg); err != nil { - s.Warnf(" Error unmarshalling consumer metafile: %v", err) + s.Warnf(" Error unmarshalling consumer metafile %q: %v", metafile, err) continue } isEphemeral := !isDurableConsumer(&cfg.ConsumerConfig) @@ -1216,9 +1259,9 @@ func (a *Account) EnableJetStream(limits *JetStreamAccountLimits) error { // the consumer can reconnect. We will create it as a durable and switch it. cfg.ConsumerConfig.Durable = ofi.Name() } - obs, err := e.mset.addConsumer(&cfg.ConsumerConfig) + obs, err := e.mset.addConsumerWithAssignment(&cfg.ConsumerConfig, _EMPTY_, nil, true) if err != nil { - s.Warnf(" Error adding consumer: %v", err) + s.Warnf(" Error adding consumer %q: %v", cfg.Name, err) continue } if isEphemeral { @@ -1227,11 +1270,12 @@ func (a *Account) EnableJetStream(limits *JetStreamAccountLimits) error { if !cfg.Created.IsZero() { obs.setCreatedTime(cfg.Created) } + lseq := e.mset.lastSeq() obs.mu.Lock() - err = obs.readStoredState() + err = obs.readStoredState(lseq) obs.mu.Unlock() if err != nil { - s.Warnf(" Error restoring consumer state: %v", err) + s.Warnf(" Error restoring consumer %q state: %v", cfg.Name, err) } } } @@ -1244,16 +1288,28 @@ func (a *Account) EnableJetStream(limits *JetStreamAccountLimits) error { return nil } -// Return whether or not we require MaxBytes to be set. 
-func (a *Account) maxBytesRequired() bool { +// Return whether we require MaxBytes to be set and if > 0 an upper limit for stream size exists +// Both limits are independent of each other. +func (a *Account) maxBytesLimits(cfg *StreamConfig) (bool, int64) { a.mu.RLock() - defer a.mu.RUnlock() - jsa := a.js + a.mu.RUnlock() if jsa == nil { - return false + return false, 0 + } + jsa.usageMu.RLock() + selectedLimits, _, ok := jsa.selectLimits(cfg) + jsa.usageMu.RUnlock() + if !ok { + return false, 0 } - return jsa.limits.MaxBytesRequired + maxStreamBytes := int64(0) + if cfg.Storage == MemoryStorage { + maxStreamBytes = selectedLimits.MemoryMaxStreamBytes + } else { + maxStreamBytes = selectedLimits.StoreMaxStreamBytes + } + return selectedLimits.MaxBytesRequired, maxStreamBytes } // NumStreams will return how many streams we have. @@ -1324,7 +1380,7 @@ func (a *Account) lookupStream(name string) (*stream, error) { } // UpdateJetStreamLimits will update the account limits for a JetStream enabled account. -func (a *Account) UpdateJetStreamLimits(limits *JetStreamAccountLimits) error { +func (a *Account) UpdateJetStreamLimits(limits map[string]JetStreamAccountLimits) error { a.mu.RLock() s, jsa := a.srv, a.js a.mu.RUnlock() @@ -1340,42 +1396,58 @@ func (a *Account) UpdateJetStreamLimits(limits *JetStreamAccountLimits) error { return NewJSNotEnabledForAccountError() } - if limits == nil { - limits = dynamicJSAccountLimits + if len(limits) == 0 { + limits = defaultJSAccountTiers } // Calculate the delta between what we have and what we want. - jsa.mu.Lock() - dl := diffCheckedLimits(&jsa.limits, limits) - jsa.mu.Unlock() + jsa.usageMu.RLock() + dl := diffCheckedLimits(jsa.limits, limits) + jsa.usageMu.RUnlock() js.mu.Lock() // Check the limits against existing reservations. - if err := js.sufficientResources(&dl); err != nil { + if err := js.sufficientResources(dl); err != nil { js.mu.Unlock() return err } js.mu.Unlock() // Update - jsa.mu.Lock() - jsa.limits = *limits - jsa.mu.Unlock() + jsa.usageMu.Lock() + jsa.limits = limits + jsa.usageMu.Unlock() return nil } -func diffCheckedLimits(a, b *JetStreamAccountLimits) JetStreamAccountLimits { - return JetStreamAccountLimits{ - MaxMemory: b.MaxMemory - a.MaxMemory, - MaxStore: b.MaxStore - a.MaxStore, +func diffCheckedLimits(a, b map[string]JetStreamAccountLimits) map[string]JetStreamAccountLimits { + diff := map[string]JetStreamAccountLimits{} + for t, la := range a { + // in a, not in b will return 0 + lb := b[t] + diff[t] = JetStreamAccountLimits{ + MaxMemory: lb.MaxMemory - la.MaxMemory, + MaxStore: lb.MaxStore - la.MaxStore, + } + } + for t, lb := range b { + if la, ok := a[t]; !ok { + // only in b not in a. (in a and b already covered) + diff[t] = JetStreamAccountLimits{ + MaxMemory: lb.MaxMemory - la.MaxMemory, + MaxStore: lb.MaxStore - la.MaxStore, + } + } } + return diff } // JetStreamUsage reports on JetStream usage and limits for an account. 
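diffCheckedLimits above produces a per-tier delta so the resource check only sees the change being applied; a tier present on only one side still yields an entry, with the missing side treated as zero. A small self-contained example of that behaviour; the R1/R3 keys and sizes are sample values.

package sketch

import "fmt"

type tierLimits struct{ MaxMemory, MaxStore int64 }

// diffLimits mirrors the per-tier subtraction: for every tier in either map,
// delta = next - prev, with an absent tier counted as zero.
func diffLimits(prev, next map[string]tierLimits) map[string]tierLimits {
    d := map[string]tierLimits{}
    for t, p := range prev {
        n := next[t]
        d[t] = tierLimits{n.MaxMemory - p.MaxMemory, n.MaxStore - p.MaxStore}
    }
    for t, n := range next {
        if _, ok := prev[t]; !ok {
            d[t] = tierLimits{n.MaxMemory, n.MaxStore}
        }
    }
    return d
}

func exampleDiff() {
    prev := map[string]tierLimits{"R1": {MaxMemory: 1 << 30, MaxStore: 10 << 30}}
    next := map[string]tierLimits{
        "R1": {MaxMemory: 2 << 30, MaxStore: 10 << 30},
        "R3": {MaxStore: 5 << 30},
    }
    // R1 gains 1 GiB of memory, R3 is entirely new reservation.
    fmt.Printf("%+v\n", diffLimits(prev, next))
}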
func (a *Account) JetStreamUsage() JetStreamAccountStats { a.mu.RLock() jsa, aname := a.js, a.Name + accJsLimits := a.jsLimits a.mu.RUnlock() var stats JetStreamAccountStats @@ -1383,26 +1455,80 @@ func (a *Account) JetStreamUsage() JetStreamAccountStats { js := jsa.js js.mu.RLock() jsa.mu.RLock() - stats.Memory = uint64(jsa.memTotal) - stats.Store = uint64(jsa.storeTotal) + jsa.usageMu.RLock() + stats.Memory, stats.Store = jsa.storageTotals() stats.Domain = js.config.Domain stats.API = JetStreamAPIStats{ Total: jsa.apiTotal, Errors: jsa.apiErrors, } + l, defaultTier := jsa.limits[_EMPTY_] + if defaultTier { + stats.Limits = l + } else { + skipped := 0 + stats.Tiers = make(map[string]JetStreamTier) + for t, total := range jsa.usage { + if _, ok := jsa.limits[t]; !ok && (*total) == (jsaStorage{}) { + // skip tiers not present that don't contain a count + // In case this shows an empty stream, that tier will be added when iterating over streams + skipped++ + } else { + stats.Tiers[t] = JetStreamTier{ + Memory: uint64(total.total.mem), + Store: uint64(total.total.store), + Limits: jsa.limits[t], + } + } + } + if len(accJsLimits) != len(jsa.usage)-skipped { + // insert unused limits + for t, lim := range accJsLimits { + if _, ok := stats.Tiers[t]; !ok { + stats.Tiers[t] = JetStreamTier{Limits: lim} + } + } + } + } + jsa.usageMu.RUnlock() if cc := jsa.js.cluster; cc != nil { sas := cc.streams[aname] - stats.Streams = len(sas) + if defaultTier { + stats.Streams = len(sas) + } for _, sa := range sas { stats.Consumers += len(sa.consumers) + if !defaultTier { + tier := tierName(sa.Config) + u, ok := stats.Tiers[tier] + if !ok { + u = JetStreamTier{} + } + u.Streams++ + stats.Streams++ + u.Consumers += len(sa.consumers) + stats.Tiers[tier] = u + } } } else { - stats.Streams = len(jsa.streams) + if defaultTier { + stats.Streams = len(jsa.streams) + } for _, mset := range jsa.streams { - stats.Consumers += mset.numConsumers() + consCount := mset.numConsumers() + stats.Consumers += consCount + if !defaultTier { + u, ok := stats.Tiers[mset.tier] + if !ok { + u = JetStreamTier{} + } + u.Streams++ + stats.Streams++ + u.Consumers += consCount + stats.Tiers[mset.tier] = u + } } } - stats.Limits = jsa.limits jsa.mu.RUnlock() js.mu.RUnlock() } @@ -1456,7 +1582,7 @@ func (a *Account) jetStreamConfigured() bool { } a.mu.RLock() defer a.mu.RUnlock() - return a.jsLimits != nil + return len(a.jsLimits) > 0 } // JetStreamEnabled is a helper to determine if jetstream is enabled for an account. @@ -1473,10 +1599,12 @@ func (a *Account) JetStreamEnabled() bool { func (jsa *jsAccount) remoteUpdateUsage(sub *subscription, c *client, _ *Account, subject, _ string, msg []byte) { const usageSize = 32 - jsa.mu.Lock() + // jsa.js.srv is immutable and guaranteed to no be nil, so no lock needed. 
s := jsa.js.srv + + jsa.usageMu.Lock() if len(msg) < usageSize { - jsa.mu.Unlock() + jsa.usageMu.Unlock() s.Warnf("Ignoring remote usage update with size too short") return } @@ -1485,56 +1613,92 @@ func (jsa *jsAccount) remoteUpdateUsage(sub *subscription, c *client, _ *Account rnode = subject[li+1:] } if rnode == _EMPTY_ { - jsa.mu.Unlock() + jsa.usageMu.Unlock() s.Warnf("Received remote usage update with no remote node") return } + rUsage, ok := jsa.rusage[rnode] + if !ok { + if jsa.rusage == nil { + jsa.rusage = make(map[string]*remoteUsage) + } + rUsage = &remoteUsage{tiers: make(map[string]*jsaUsage)} + jsa.rusage[rnode] = rUsage + } + updateTotal := func(tierName string, memUsed, storeUsed int64) { + total, ok := jsa.usage[tierName] + if !ok { + total = &jsaStorage{} + jsa.usage[tierName] = total + } + // Update the usage for this remote. + if usage := rUsage.tiers[tierName]; usage != nil { + // Decrement our old values. + total.total.mem -= usage.mem + total.total.store -= usage.store + usage.mem, usage.store = memUsed, storeUsed + } else { + rUsage.tiers[tierName] = &jsaUsage{memUsed, storeUsed} + } + total.total.mem += memUsed + total.total.store += storeUsed + } + var le = binary.LittleEndian - memUsed, storeUsed := int64(le.Uint64(msg[0:])), int64(le.Uint64(msg[8:])) apiTotal, apiErrors := le.Uint64(msg[16:]), le.Uint64(msg[24:]) + memUsed, storeUsed := int64(le.Uint64(msg[0:])), int64(le.Uint64(msg[8:])) - if jsa.rusage == nil { - jsa.rusage = make(map[string]*jsaUsage) - } - // Update the usage for this remote. - if usage := jsa.rusage[rnode]; usage != nil { - // Decrement our old values. - jsa.memTotal -= usage.mem - jsa.storeTotal -= usage.store - jsa.apiTotal -= usage.api - jsa.apiErrors -= usage.err - usage.mem, usage.store = memUsed, storeUsed - usage.api, usage.err = apiTotal, apiErrors - } else { - jsa.rusage[rnode] = &jsaUsage{memUsed, storeUsed, apiTotal, apiErrors} - } - jsa.memTotal += memUsed - jsa.storeTotal += storeUsed + // we later extended the data structure to support multiple tiers + excessRecordCnt := uint32(0) + tierName := _EMPTY_ + if len(msg) >= 44 { + excessRecordCnt = le.Uint32(msg[32:]) + length := le.Uint64(msg[36:]) + tierName = string(msg[44 : 44+length]) + msg = msg[44+length:] + } + updateTotal(tierName, memUsed, storeUsed) + for ; excessRecordCnt > 0 && len(msg) >= 24; excessRecordCnt-- { + memUsed, storeUsed := int64(le.Uint64(msg[0:])), int64(le.Uint64(msg[8:])) + length := le.Uint64(msg[16:]) + tierName = string(msg[24 : 24+length]) + msg = msg[24+length:] + updateTotal(tierName, memUsed, storeUsed) + } + jsa.apiTotal -= rUsage.api + jsa.apiErrors -= rUsage.err + rUsage.api = apiTotal + rUsage.err = apiErrors jsa.apiTotal += apiTotal jsa.apiErrors += apiErrors - jsa.mu.Unlock() + jsa.usageMu.Unlock() } // Updates accounting on in use memory and storage. This is called from locally // by the lower storage layers. -func (jsa *jsAccount) updateUsage(storeType StorageType, delta int64) { - var isClustered bool - // Ok to check jsa.js here w/o lock. +func (jsa *jsAccount) updateUsage(tierName string, storeType StorageType, delta int64) { + // jsa.js is immutable and cannot be nil, so ok w/o lock. js := jsa.js - if js != nil { - isClustered = js.isClustered() - } + // updateUsage() may be invoked under the mset's lock, so we can't get + // the js' lock to check if clustered. So use this function that make + // use of an atomic to do the check without having data race reports. 
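remoteUpdateUsage above parses a fixed 32-byte base record and an optional multi-tier extension (extra-record count, tier-name length, tier name, then one 24-byte-plus-name record per extra tier). A decoder sketch that mirrors that layout with explicit bounds checks; the struct and function names are illustrative.

package sketch

import "encoding/binary"

type tierUsage struct {
    tier       string
    mem, store int64
}

// decodeUsage parses: mem(8) store(8) api(8) err(8), then optionally
// extraCount(4) tierLen(8) tierName, then extraCount records of
// mem(8) store(8) nameLen(8) name. Returns nil on a short buffer.
func decodeUsage(msg []byte) (tiers []tierUsage, api, errs uint64) {
    le := binary.LittleEndian
    if len(msg) < 32 {
        return nil, 0, 0
    }
    api, errs = le.Uint64(msg[16:]), le.Uint64(msg[24:])
    mem, store := int64(le.Uint64(msg[0:])), int64(le.Uint64(msg[8:]))
    extra, tier := uint32(0), ""
    if len(msg) >= 44 {
        extra = le.Uint32(msg[32:])
        l := le.Uint64(msg[36:])
        if uint64(len(msg)) < 44+l {
            return nil, 0, 0
        }
        tier = string(msg[44 : 44+l])
        msg = msg[44+l:]
    }
    tiers = append(tiers, tierUsage{tier, mem, store})
    for ; extra > 0 && len(msg) >= 24; extra-- {
        mem, store = int64(le.Uint64(msg[0:])), int64(le.Uint64(msg[8:]))
        l := le.Uint64(msg[16:])
        if uint64(len(msg)) < 24+l {
            break
        }
        tiers = append(tiers, tierUsage{string(msg[24 : 24+l]), mem, store})
        msg = msg[24+l:]
    }
    return tiers, api, errs
}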
+ isClustered := js.isClusteredNoLock() - jsa.mu.Lock() - defer jsa.mu.Unlock() + jsa.usageMu.Lock() + defer jsa.usageMu.Unlock() + s, ok := jsa.usage[tierName] + if !ok { + s = &jsaStorage{} + jsa.usage[tierName] = s + } if storeType == MemoryStorage { - jsa.usage.mem += delta - jsa.memTotal += delta + s.local.mem += delta + s.total.mem += delta atomic.AddInt64(&js.memUsed, delta) } else { - jsa.usage.store += delta - jsa.storeTotal += delta + s.local.store += delta + s.total.store += delta atomic.AddInt64(&js.storeUsed, delta) } // Publish our local updates if in clustered mode. @@ -1546,8 +1710,8 @@ func (jsa *jsAccount) updateUsage(storeType StorageType, delta int64) { const usageTick = 1500 * time.Millisecond func (jsa *jsAccount) sendClusterUsageUpdateTimer() { - jsa.mu.Lock() - defer jsa.mu.Unlock() + jsa.usageMu.Lock() + defer jsa.usageMu.Unlock() jsa.sendClusterUsageUpdate() if jsa.utimer != nil { jsa.utimer.Reset(usageTick) @@ -1555,11 +1719,8 @@ func (jsa *jsAccount) sendClusterUsageUpdateTimer() { } // Send updates to our account usage for this server. -// Lock should be held. +// jsa.usageMu lock should be held. func (jsa *jsAccount) sendClusterUsageUpdate() { - if jsa.js == nil || jsa.js.srv == nil || jsa.sendq == nil { - return - } // These values are absolute so we can limit send rates. now := time.Now() if now.Sub(jsa.lupdate) < 250*time.Millisecond { @@ -1567,13 +1728,39 @@ func (jsa *jsAccount) sendClusterUsageUpdate() { } jsa.lupdate = now - b := make([]byte, 32) + lenUsage := len(jsa.usage) + if lenUsage == 0 { + return + } + // every base record contains mem/store/len(tier) as well as the tier name + l := 24 * lenUsage + for tier := range jsa.usage { + l += len(tier) + } + if lenUsage > 0 { + // first record contains api/usage errors as well as count for extra base records + l += 20 + } var le = binary.LittleEndian - le.PutUint64(b[0:], uint64(jsa.usage.mem)) - le.PutUint64(b[8:], uint64(jsa.usage.store)) - le.PutUint64(b[16:], uint64(jsa.usage.api)) - le.PutUint64(b[24:], uint64(jsa.usage.err)) - + b := make([]byte, l) + i := 0 + + for tier, usage := range jsa.usage { + le.PutUint64(b[i+0:], uint64(usage.local.mem)) + le.PutUint64(b[i+8:], uint64(usage.local.store)) + if i == 0 { + le.PutUint64(b[i+16:], jsa.usageApi) + le.PutUint64(b[i+24:], jsa.usageErr) + le.PutUint32(b[i+32:], uint32(len(jsa.usage)-1)) + le.PutUint64(b[i+36:], uint64(len(tier))) + copy(b[i+44:], tier) + i += 44 + len(tier) + } else { + le.PutUint64(b[i+16:], uint64(len(tier))) + copy(b[i+24:], tier) + i += 24 + len(tier) + } + } jsa.sendq.push(newPubMsg(nil, jsa.updatesPub, _EMPTY_, nil, nil, b, noCompression, false, false)) } @@ -1594,65 +1781,135 @@ func (js *jetStream) limitsExceeded(storeType StorageType) bool { return js.wouldExceedLimits(storeType, 0) } -func (jsa *jsAccount) limitsExceeded(storeType StorageType) bool { +func tierName(cfg *StreamConfig) string { + // TODO (mh) this is where we could select based off a placement tag as well "qos:tier" + return fmt.Sprintf("R%d", cfg.Replicas) +} + +func isSameTier(cfgA, cfgB *StreamConfig) bool { + // TODO (mh) this is where we could select based off a placement tag as well "qos:tier" + return cfgA.Replicas == cfgB.Replicas +} + +func (jsa *jsAccount) jetStreamAndClustered() (*jetStream, bool) { jsa.mu.RLock() - defer jsa.mu.RUnlock() + js := jsa.js + jsa.mu.RUnlock() + return js, js.isClustered() +} + +// jsa.usageMu read lock should be held. 
+func (jsa *jsAccount) selectLimits(cfg *StreamConfig) (JetStreamAccountLimits, string, bool) { + if selectedLimits, ok := jsa.limits[_EMPTY_]; ok { + return selectedLimits, _EMPTY_, true + } + tier := tierName(cfg) + if selectedLimits, ok := jsa.limits[tier]; ok { + return selectedLimits, tier, true + } + return JetStreamAccountLimits{}, _EMPTY_, false +} +// Lock should be held. +func (jsa *jsAccount) countStreams(tier string, cfg *StreamConfig) int { + streams := len(jsa.streams) + if tier != _EMPTY_ { + streams = 0 + for _, sa := range jsa.streams { + if isSameTier(&sa.cfg, cfg) { + streams++ + } + } + } + return streams +} + +// jsa.usageMu read lock (at least) should be held. +func (jsa *jsAccount) storageTotals() (uint64, uint64) { + mem := uint64(0) + store := uint64(0) + for _, sa := range jsa.usage { + mem += uint64(sa.total.mem) + store += uint64(sa.total.store) + } + return mem, store +} + +func (jsa *jsAccount) limitsExceeded(storeType StorageType, tierName string) (bool, *ApiError) { + jsa.usageMu.RLock() + defer jsa.usageMu.RUnlock() + + selectedLimits, ok := jsa.limits[tierName] + if !ok { + return true, NewJSNoLimitsError() + } + inUse := jsa.usage[tierName] + if inUse == nil { + // Imply totals of 0 + return false, nil + } if storeType == MemoryStorage { - if jsa.limits.MaxMemory >= 0 && jsa.memTotal > jsa.limits.MaxMemory { - return true + totalMem := inUse.total.mem + if selectedLimits.MemoryMaxStreamBytes > 0 && totalMem > selectedLimits.MemoryMaxStreamBytes { + return true, nil + } + if selectedLimits.MaxMemory >= 0 && totalMem > selectedLimits.MaxMemory { + return true, nil } } else { - if jsa.limits.MaxStore >= 0 && jsa.storeTotal > jsa.limits.MaxStore { - return true + totalStore := inUse.total.store + if selectedLimits.StoreMaxStreamBytes > 0 && totalStore > selectedLimits.StoreMaxStreamBytes { + return true, nil + } + if selectedLimits.MaxStore >= 0 && totalStore > selectedLimits.MaxStore { + return true, nil } } - return false + return false, nil } // Check account limits. -func (jsa *jsAccount) checkAccountLimits(config *StreamConfig) error { - return jsa.checkLimits(config, false) +// Read Lock should be held +func (js *jetStream) checkAccountLimits(selected *JetStreamAccountLimits, config *StreamConfig, currentRes int64) error { + return js.checkLimits(selected, config, false, currentRes, 0) } // Check account and server limits. -func (jsa *jsAccount) checkAllLimits(config *StreamConfig) error { - return jsa.checkLimits(config, true) +// Read Lock should be held +func (js *jetStream) checkAllLimits(selected *JetStreamAccountLimits, config *StreamConfig, currentRes, maxBytesOffset int64) error { + return js.checkLimits(selected, config, true, currentRes, maxBytesOffset) } // Check if a new proposed msg set while exceed our account limits. // Lock should be held. -func (jsa *jsAccount) checkLimits(config *StreamConfig, checkServer bool) error { - if jsa.limits.MaxStreams > 0 && len(jsa.streams) >= jsa.limits.MaxStreams { - return NewJSMaximumStreamsLimitError() - } +func (js *jetStream) checkLimits(selected *JetStreamAccountLimits, config *StreamConfig, checkServer bool, currentRes, maxBytesOffset int64) error { // Check MaxConsumers - if config.MaxConsumers > 0 && jsa.limits.MaxConsumers > 0 && config.MaxConsumers > jsa.limits.MaxConsumers { + if config.MaxConsumers > 0 && selected.MaxConsumers > 0 && config.MaxConsumers > selected.MaxConsumers { return NewJSMaximumConsumersLimitError() } - + // stream limit is checked separately on stream create only! 
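// A small sketch of how tier selection above resolves limits: an entry stored under the
// empty key means account-wide (untiered) limits and wins outright; otherwise the
// stream's replica count maps to an "R<replicas>" tier, mirroring tierName(). The
// trimmed accountLimits type is an assumption standing in for JetStreamAccountLimits.
package main

import "fmt"

type accountLimits struct {
	MaxMemory, MaxStore int64
	MaxStreams          int
}

func selectLimits(limits map[string]accountLimits, replicas int) (accountLimits, string, bool) {
	if l, ok := limits[""]; ok {
		return l, "", true // account-wide limits take precedence
	}
	tier := fmt.Sprintf("R%d", replicas)
	if l, ok := limits[tier]; ok {
		return l, tier, true
	}
	return accountLimits{}, "", false // no limits defined for this tier
}

func main() {
	tiered := map[string]accountLimits{
		"R1": {MaxMemory: 1 << 30, MaxStore: 10 << 30, MaxStreams: 10},
		"R3": {MaxMemory: 1 << 30, MaxStore: 50 << 30, MaxStreams: 5},
	}
	if l, tier, ok := selectLimits(tiered, 3); ok {
		fmt.Printf("tier %q: max store %d bytes, max streams %d\n", tier, l.MaxStore, l.MaxStreams)
	}
}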
// Check storage, memory or disk. - return jsa.checkBytesLimits(config.MaxBytes, config.Storage, config.Replicas, checkServer) + return js.checkBytesLimits(selected, config.MaxBytes, config.Storage, config.Replicas, checkServer, currentRes, maxBytesOffset) } // Check if additional bytes will exceed our account limits and optionally the server itself. // This should account for replicas. -// Lock should be held. -func (jsa *jsAccount) checkBytesLimits(addBytes int64, storage StorageType, replicas int, checkServer bool) error { +// Read Lock should be held. +func (js *jetStream) checkBytesLimits(selectedLimits *JetStreamAccountLimits, addBytes int64, storage StorageType, replicas int, checkServer bool, currentRes, maxBytesOffset int64) error { if replicas < 1 { replicas = 1 } if addBytes < 0 { addBytes = 1 } - js, totalBytes := jsa.js, addBytes*int64(replicas) + totalBytes := (addBytes * int64(replicas)) + maxBytesOffset switch storage { case MemoryStorage: // Account limits defined. - if jsa.limits.MaxMemory >= 0 { - if jsa.memReserved+totalBytes > jsa.limits.MaxMemory { + if selectedLimits.MaxMemory >= 0 { + if currentRes+totalBytes > selectedLimits.MaxMemory { return NewJSMemoryResourcesExceededError() } } @@ -1662,8 +1919,8 @@ func (jsa *jsAccount) checkBytesLimits(addBytes int64, storage StorageType, repl } case FileStorage: // Account limits defined. - if jsa.limits.MaxStore >= 0 { - if jsa.storeReserved+totalBytes > jsa.limits.MaxStore { + if selectedLimits.MaxStore >= 0 { + if currentRes+totalBytes > selectedLimits.MaxStore { return NewJSStorageResourcesExceededError() } } @@ -1686,16 +1943,18 @@ func (jsa *jsAccount) delete() { var ts []string jsa.mu.Lock() + // The update timer and subs need to be protected by usageMu lock + jsa.usageMu.Lock() if jsa.utimer != nil { jsa.utimer.Stop() jsa.utimer = nil } - if jsa.updatesSub != nil && jsa.js.srv != nil { s := jsa.js.srv s.sysUnsubscribe(jsa.updatesSub) jsa.updatesSub = nil } + jsa.usageMu.Unlock() for _, ms := range jsa.streams { streams = append(streams, ms) @@ -1747,7 +2006,7 @@ func (js *jetStream) usageStats() *JetStreamStats { // Check to see if we have enough system resources for this account. // Lock should be held. -func (js *jetStream) sufficientResources(limits *JetStreamAccountLimits) error { +func (js *jetStream) sufficientResources(limits map[string]JetStreamAccountLimits) error { // If we are clustered we do not really know how many resources will be ultimately available. // This needs to be handled out of band. // If we are a single server, we can make decisions here. @@ -1755,31 +2014,44 @@ func (js *jetStream) sufficientResources(limits *JetStreamAccountLimits) error { return nil } + totalMaxBytes := func(limits map[string]JetStreamAccountLimits) (int64, int64) { + totalMaxMemory := int64(0) + totalMaxStore := int64(0) + for _, l := range limits { + if l.MaxMemory > 0 { + totalMaxMemory += l.MaxMemory + } + if l.MaxStore > 0 { + totalMaxStore += l.MaxStore + } + } + return totalMaxMemory, totalMaxStore + } + + totalMaxMemory, totalMaxStore := totalMaxBytes(limits) + // Reserved is now specific to the MaxBytes for streams. 
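// A simplified sketch of the reservation check performed by checkBytesLimits above: a
// stream's MaxBytes is reserved on every replica, added to what the account already has
// reserved in that tier, and compared against the tier's MaxStore (or MaxMemory for
// memory streams). Parameter names are illustrative, not the server's signature.
package main

import (
	"errors"
	"fmt"
)

var errStorageExceeded = errors.New("insufficient storage resources available")

func checkBytesLimit(maxBytes int64, replicas int, currentReserved, maxBytesOffset, maxStore int64) error {
	if replicas < 1 {
		replicas = 1
	}
	if maxBytes < 0 {
		maxBytes = 1 // streams without a MaxBytes still count a token byte
	}
	total := maxBytes*int64(replicas) + maxBytesOffset
	if maxStore >= 0 && currentReserved+total > maxStore {
		return errStorageExceeded
	}
	return nil
}

func main() {
	// 2 GiB requested on 3 replicas against a 10 GiB tier with 5 GiB already reserved.
	fmt.Println(checkBytesLimit(2<<30, 3, 5<<30, 0, 10<<30)) // error: 5 GiB + 6 GiB > 10 GiB
}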
- if js.memReserved+limits.MaxMemory > js.config.MaxMemory { + if js.memReserved+totalMaxMemory > js.config.MaxMemory { return NewJSMemoryResourcesExceededError() } - if js.storeReserved+limits.MaxStore > js.config.MaxStore { + if js.storeReserved+totalMaxStore > js.config.MaxStore { return NewJSStorageResourcesExceededError() } // Since we know if we are here we are single server mode, check the account reservations. var storeReserved, memReserved int64 for _, jsa := range js.accounts { - jsa.mu.RLock() - if jsa.limits.MaxMemory > 0 { - memReserved += jsa.limits.MaxMemory - } - if jsa.limits.MaxStore > 0 { - storeReserved += jsa.limits.MaxStore - } - jsa.mu.RUnlock() + jsa.usageMu.RLock() + maxMemory, maxStore := totalMaxBytes(jsa.limits) + jsa.usageMu.RUnlock() + memReserved += maxMemory + storeReserved += maxStore } - if memReserved+limits.MaxMemory > js.config.MaxMemory { + if memReserved+totalMaxMemory > js.config.MaxMemory { return NewJSMemoryResourcesExceededError() } - if storeReserved+limits.MaxStore > js.config.MaxStore { + if storeReserved+totalMaxStore > js.config.MaxStore { return NewJSStorageResourcesExceededError() } @@ -1931,9 +2203,9 @@ func (a *Account) addStreamTemplate(tc *StreamTemplateConfig) (*streamTemplate, // FIXME(dlc) - Hacky tcopy := tc.deepCopy() tcopy.Config.Name = "_" - cfg, err := checkStreamCfg(tcopy.Config) - if err != nil { - return nil, err + cfg, apiErr := s.checkStreamCfg(tcopy.Config, a) + if apiErr != nil { + return nil, apiErr } tcopy.Config = &cfg t := &streamTemplate{ @@ -2022,7 +2294,7 @@ func (t *streamTemplate) processInboundTemplateMsg(_ *subscription, pc *client, t.mu.Unlock() if atLimit { - c.Warnf("JetStream could not create stream for account %q on subject %q, at limit", acc.Name, subject) + c.RateLimitWarnf("JetStream could not create stream for account %q on subject %q, at limit", acc.Name, subject) return } @@ -2033,7 +2305,7 @@ func (t *streamTemplate) processInboundTemplateMsg(_ *subscription, pc *client, mset, err := acc.addStream(&cfg) if err != nil { acc.validateStreams(t) - c.Warnf("JetStream could not create stream for account %q on subject %q", acc.Name, subject) + c.RateLimitWarnf("JetStream could not create stream for account %q on subject %q: %v", acc.Name, subject, err) return } @@ -2253,7 +2525,7 @@ func validateJetStreamOptions(o *Options) error { } else { for _, acc := range o.Accounts { if a == acc.GetName() { - if acc.jsLimits != nil && domain != _EMPTY_ { + if len(acc.jsLimits) > 0 && domain != _EMPTY_ { return fmt.Errorf("default_js_domain contains account name %q with enabled JetStream", a) } found = true @@ -2317,3 +2589,13 @@ func validateJetStreamOptions(o *Options) error { } return nil } + +// We had a bug that set a default de dupe window on mirror, despite that being not a valid config +func fixCfgMirrorWithDedupWindow(cfg *StreamConfig) { + if cfg == nil || cfg.Mirror == nil { + return + } + if cfg.Duplicates != 0 { + cfg.Duplicates = 0 + } +} diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go index 31c0de05..5a4da584 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go @@ -183,8 +183,9 @@ const ( // jsAckT is the template for the ack message stream coming back from a consumer // when they ACK/NAK, etc a message. - jsAckT = "$JS.ACK.%s.%s" - jsAckPre = "$JS.ACK." + jsAckT = "$JS.ACK.%s.%s" + jsAckPre = "$JS.ACK." 
+ jsAckPreLen = len(jsAckPre) // jsFlowControl is for flow control subjects. jsFlowControlPre = "$JS.FC." @@ -203,6 +204,9 @@ const ( // JSAdvisoryConsumerMaxDeliveryExceedPre is a notification published when a message exceeds its delivery threshold. JSAdvisoryConsumerMaxDeliveryExceedPre = "$JS.EVENT.ADVISORY.CONSUMER.MAX_DELIVERIES" + // JSAdvisoryConsumerMsgNakPre is a notification published when a message has been naked + JSAdvisoryConsumerMsgNakPre = "$JS.EVENT.ADVISORY.CONSUMER.MSG_NAKED" + // JSAdvisoryConsumerMsgTerminatedPre is a notification published when a message has been terminated. JSAdvisoryConsumerMsgTerminatedPre = "$JS.EVENT.ADVISORY.CONSUMER.MSG_TERMINATED" @@ -289,7 +293,8 @@ func generateJSMappingTable(domain string) map[string]string { const JSMaxDescriptionLen = 4 * 1024 // JSMaxNameLen is the maximum name lengths for streams, consumers and templates. -const JSMaxNameLen = 256 +// Picked 255 as it seems to be a widely used file name limit +const JSMaxNameLen = 255 // Responses for API calls. @@ -603,6 +608,7 @@ const JSApiConsumerListResponseType = "io.nats.jetstream.api.v1.consumer_list_re type JSApiConsumerGetNextRequest struct { Expires time.Duration `json:"expires,omitempty"` Batch int `json:"batch,omitempty"` + MaxBytes int `json:"max_bytes,omitempty"` NoWait bool `json:"no_wait,omitempty"` Heartbeat time.Duration `json:"idle_heartbeat,omitempty"` } @@ -643,16 +649,22 @@ type JSApiStreamTemplateNamesResponse struct { const JSApiStreamTemplateNamesResponseType = "io.nats.jetstream.api.v1.stream_template_names_response" -// Default max API calls outstanding. -const defaultMaxJSApiOut = int64(4096) - -// Max API calls outstanding. -var maxJSApiOut = defaultMaxJSApiOut +// Structure that holds state for a JetStream API request that is processed +// in a separate long-lived go routine. This is to avoid possibly blocking +// ROUTE and GATEWAY connections. +type jsAPIRoutedReq struct { + jsub *subscription + sub *subscription + acc *Account + subject string + reply string + msg []byte + pa pubArg +} func (js *jetStream) apiDispatch(sub *subscription, c *client, acc *Account, subject, reply string, rmsg []byte) { - js.mu.RLock() + // No lock needed, those are immutable. s, rr := js.srv, js.apiSubs.Match(subject) - js.mu.RUnlock() hdr, _ := c.msgParts(rmsg) if len(getHeader(ClientInfoHdr, hdr)) == 0 { @@ -682,36 +694,36 @@ func (js *jetStream) apiDispatch(sub *subscription, c *client, acc *Account, sub } // If we are here we have received this request over a non client connection. - // We need to make sure not to block. We will spin a Go routine per but also make - // sure we do not have too many outstanding. - if apiOut := atomic.AddInt64(&js.apiInflight, 1); apiOut > maxJSApiOut { - atomic.AddInt64(&js.apiInflight, -1) - ci, acc, _, msg, err := s.getRequestInfo(c, rmsg) - if err == nil { - resp := &ApiResponse{Type: JSApiOverloadedType, Error: NewJSInsufficientResourcesError()} - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - } else { - s.Warnf(badAPIRequestT, rmsg) - } - s.Warnf("JetStream API limit exceeded: %d calls outstanding", apiOut) - return - } + // We need to make sure not to block. We will send the request to a long-lived + // go routine. - // If we are here we can properly dispatch this API call. - // Copy the message and the client. Client for the pubArgs - // but note the JSAPI only uses the hdr index to piece apart - // the header from the msg body. No other references are needed. 
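// The new max_bytes field on the pull request (JSApiConsumerGetNextRequest above) lets
// clients cap a batch by payload size as well as by message count. A minimal sketch of
// the wire form, using a local struct that mirrors the patch's JSON tags:
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

type pullRequest struct {
	Expires   time.Duration `json:"expires,omitempty"`
	Batch     int           `json:"batch,omitempty"`
	MaxBytes  int           `json:"max_bytes,omitempty"`
	NoWait    bool          `json:"no_wait,omitempty"`
	Heartbeat time.Duration `json:"idle_heartbeat,omitempty"`
}

func main() {
	req := pullRequest{Expires: 30 * time.Second, Batch: 100, MaxBytes: 1 << 20}
	b, _ := json.Marshal(req)
	// Sent as the request payload on $JS.API.CONSUMER.MSG.NEXT.<stream>.<consumer>.
	fmt.Println(string(b)) // {"expires":30000000000,"batch":100,"max_bytes":1048576}
}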
- // FIXME(dlc) - Should cleanup eventually and make sending - // and receiving internal messages more formal. - rmsg = copyBytes(rmsg) + // Copy the state. Note the JSAPI only uses the hdr index to piece apart the + // header from the msg body. No other references are needed. + s.jsAPIRoutedReqs.push(&jsAPIRoutedReq{jsub, sub, acc, subject, reply, copyBytes(rmsg), c.pa}) +} + +func (s *Server) processJSAPIRoutedRequests() { + defer s.grWG.Done() + + s.mu.Lock() + queue := s.jsAPIRoutedReqs client := &client{srv: s, kind: JETSTREAM} - client.pa = c.pa + s.mu.Unlock() - // Dispatch the API call to its own Go routine. - go func() { - jsub.icb(sub, client, acc, subject, reply, rmsg) - atomic.AddInt64(&js.apiInflight, -1) - }() + for { + select { + case <-queue.ch: + reqs := queue.pop() + for _, req := range reqs { + r := req.(*jsAPIRoutedReq) + client.pa = r.pa + r.jsub.icb(r.sub, client, r.acc, r.subject, r.reply, r.msg) + } + queue.recycle(&reqs) + case <-s.quitCh: + return + } + } } func (s *Server) setJetStreamExportSubs() error { @@ -720,6 +732,11 @@ func (s *Server) setJetStreamExportSubs() error { return NewJSNotEnabledError() } + // Start the go routine that will process API requests received by the + // subscription below when they are coming from routes, etc.. + s.jsAPIRoutedReqs = s.newIPQueue("Routed JS API Requests") + s.startGoRoutine(s.processJSAPIRoutedRequests) + // This is the catch all now for all JetStream API calls. if _, err := s.sysSubscribe(jsAllAPI, js.apiDispatch); err != nil { return err @@ -845,12 +862,12 @@ func (a *Account) trackAPI() { jsa := a.js a.mu.RUnlock() if jsa != nil { - jsa.mu.Lock() - jsa.usage.api++ + jsa.usageMu.Lock() + jsa.usageApi++ jsa.apiTotal++ jsa.sendClusterUsageUpdate() atomic.AddInt64(&jsa.js.apiTotal, 1) - jsa.mu.Unlock() + jsa.usageMu.Unlock() } } @@ -859,15 +876,15 @@ func (a *Account) trackAPIErr() { jsa := a.js a.mu.RUnlock() if jsa != nil { - jsa.mu.Lock() - jsa.usage.api++ + jsa.usageMu.Lock() + jsa.usageApi++ jsa.apiTotal++ - jsa.usage.err++ + jsa.usageErr++ jsa.apiErrors++ jsa.sendClusterUsageUpdate() atomic.AddInt64(&jsa.js.apiTotal, 1) atomic.AddInt64(&jsa.js.apiErrors, 1) - jsa.mu.Unlock() + jsa.usageMu.Unlock() } } @@ -1145,6 +1162,31 @@ func (s *Server) jsonResponse(v interface{}) string { return string(b) } +// Read lock must be held +func (jsa *jsAccount) tieredReservation(tier string, cfg *StreamConfig) int64 { + reservation := int64(0) + if tier == _EMPTY_ { + for _, sa := range jsa.streams { + if sa.cfg.MaxBytes > 0 { + if sa.cfg.Storage == cfg.Storage && sa.cfg.Name != cfg.Name { + reservation += (int64(sa.cfg.Replicas) * sa.cfg.MaxBytes) + } + } + } + } else { + for _, sa := range jsa.streams { + if sa.cfg.Replicas == cfg.Replicas { + if sa.cfg.MaxBytes > 0 { + if isSameTier(&sa.cfg, cfg) && sa.cfg.Name != cfg.Name { + reservation += (int64(sa.cfg.Replicas) * sa.cfg.MaxBytes) + } + } + } + } + } + return reservation +} + // Request to create a stream. 
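// A minimal sketch of the routed-request handoff introduced above: instead of spawning
// one goroutine per ROUTE/GATEWAY-originated API call bounded by an in-flight counter,
// requests are pushed onto a queue and drained by a single long-lived worker. The
// reqQueue type is a simplified stand-in for the server's ipQueue.
package main

import (
	"fmt"
	"sync"
)

type routedReq struct {
	subject string
	msg     []byte
}

type reqQueue struct {
	mu  sync.Mutex
	buf []*routedReq
	ch  chan struct{} // 1-deep, coalesced wake-up signal
}

func newReqQueue() *reqQueue { return &reqQueue{ch: make(chan struct{}, 1)} }

func (q *reqQueue) push(r *routedReq) {
	q.mu.Lock()
	q.buf = append(q.buf, r)
	q.mu.Unlock()
	select {
	case q.ch <- struct{}{}:
	default: // a wake-up is already pending
	}
}

func (q *reqQueue) pop() []*routedReq {
	q.mu.Lock()
	defer q.mu.Unlock()
	out := q.buf
	q.buf = nil
	return out
}

func main() {
	q := newReqQueue()
	done := make(chan struct{})
	go func() { // the single worker, analogous to processJSAPIRoutedRequests
		processed := 0
		for range q.ch {
			for _, r := range q.pop() {
				fmt.Printf("dispatching %s (%d bytes)\n", r.subject, len(r.msg))
				processed++
			}
			if processed == 2 {
				close(done)
				return
			}
		}
	}()
	q.push(&routedReq{subject: "$JS.API.STREAM.INFO.ORDERS", msg: []byte(`{}`)})
	q.push(&routedReq{subject: "$JS.API.STREAM.LIST", msg: []byte(`{}`)})
	<-done
}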
func (s *Server) jsStreamCreateRequest(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { if c == nil || !s.JetStreamEnabled() { @@ -1182,6 +1224,7 @@ func (s *Server) jsStreamCreateRequest(sub *subscription, c *client, _ *Account, } return } + var cfg StreamConfig if err := json.Unmarshal(msg, &cfg); err != nil { resp.Error = NewJSInvalidJSONError() @@ -1196,132 +1239,9 @@ func (s *Server) jsStreamCreateRequest(sub *subscription, c *client, _ *Account, return } - hasStream := func(streamName string) (bool, int32, []string) { - var exists bool - var maxMsgSize int32 - var subs []string - if s.JetStreamIsClustered() { - if js, _ := s.getJetStreamCluster(); js != nil { - js.mu.RLock() - if sa := js.streamAssignment(acc.Name, streamName); sa != nil { - maxMsgSize = sa.Config.MaxMsgSize - subs = sa.Config.Subjects - exists = true - } - js.mu.RUnlock() - } - } else if mset, err := acc.lookupStream(streamName); err == nil { - maxMsgSize = mset.cfg.MaxMsgSize - subs = mset.cfg.Subjects - exists = true - } - return exists, maxMsgSize, subs - } - - var streamSubs []string - var deliveryPrefixes []string - var apiPrefixes []string - - // Do some pre-checking for mirror config to avoid cycles in clustered mode. - if cfg.Mirror != nil { - if len(cfg.Subjects) > 0 { - resp.Error = NewJSMirrorWithSubjectsError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - if len(cfg.Sources) > 0 { - resp.Error = NewJSMirrorWithSourcesError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - if cfg.Mirror.FilterSubject != _EMPTY_ { - resp.Error = NewJSMirrorWithSubjectFiltersError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - if cfg.Mirror.OptStartSeq > 0 && cfg.Mirror.OptStartTime != nil { - resp.Error = NewJSMirrorWithStartSeqAndTimeError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - if cfg.Duplicates != time.Duration(0) { - resp.Error = &ApiError{Code: 400, Description: "stream mirrors do not make use of a de-duplication window"} - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - // We do not require other stream to exist anymore, but if we can see it check payloads. - exists, maxMsgSize, subs := hasStream(cfg.Mirror.Name) - if len(subs) > 0 { - streamSubs = append(streamSubs, subs...) - } - if exists && cfg.MaxMsgSize > 0 && maxMsgSize > 0 && cfg.MaxMsgSize < maxMsgSize { - resp.Error = NewJSMirrorMaxMessageSizeTooBigError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - if cfg.Mirror.External != nil { - if cfg.Mirror.External.DeliverPrefix != _EMPTY_ { - deliveryPrefixes = append(deliveryPrefixes, cfg.Mirror.External.DeliverPrefix) - } - if cfg.Mirror.External.ApiPrefix != _EMPTY_ { - apiPrefixes = append(apiPrefixes, cfg.Mirror.External.ApiPrefix) - } - } - } - if len(cfg.Sources) > 0 { - for _, src := range cfg.Sources { - if src.External == nil { - continue - } - exists, maxMsgSize, subs := hasStream(src.Name) - if len(subs) > 0 { - streamSubs = append(streamSubs, subs...) 
- } - if src.External.DeliverPrefix != _EMPTY_ { - deliveryPrefixes = append(deliveryPrefixes, src.External.DeliverPrefix) - } - if src.External.ApiPrefix != _EMPTY_ { - apiPrefixes = append(apiPrefixes, src.External.ApiPrefix) - } - if exists && cfg.MaxMsgSize > 0 && maxMsgSize > 0 && cfg.MaxMsgSize < maxMsgSize { - resp.Error = NewJSSourceMaxMessageSizeTooBigError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - } - } - // check prefix overlap with subjects - for _, pfx := range deliveryPrefixes { - if !IsValidPublishSubject(pfx) { - resp.Error = NewJSStreamInvalidExternalDeliverySubjError(pfx) - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - for _, sub := range streamSubs { - if SubjectsCollide(sub, fmt.Sprintf("%s.%s", pfx, sub)) { - resp.Error = NewJSStreamExternalDelPrefixOverlapsError(pfx, sub) - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - } - } - // check if api prefixes overlap - for _, apiPfx := range apiPrefixes { - if !IsValidPublishSubject(apiPfx) { - resp.Error = &ApiError{Code: 400, Description: fmt.Sprintf("stream external api prefix %q must be a valid subject without wildcards", apiPfx)} - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - if SubjectsCollide(apiPfx, JSApiPrefix) { - resp.Error = NewJSStreamExternalApiOverlapError(apiPfx, JSApiPrefix) - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - } - - // Check for MaxBytes required. - if acc.maxBytesRequired() && cfg.MaxBytes <= 0 { - resp.Error = NewJSStreamMaxBytesRequiredError() + // Can't create a stream with a sealed state. + if cfg.Sealed { + resp.Error = NewJSStreamInvalidConfigError(fmt.Errorf("stream configuration for create can not be sealed")) s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) return } @@ -1332,13 +1252,23 @@ func (s *Server) jsStreamCreateRequest(sub *subscription, c *client, _ *Account, return } + if err := acc.jsNonClusteredStreamLimitsCheck(&cfg); err != nil { + resp.Error = err + s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) + return + } + mset, err := acc.addStream(&cfg) if err != nil { resp.Error = NewJSStreamCreateError(err, Unless(err)) s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) return } - resp.StreamInfo = &StreamInfo{Created: mset.createdTime(), State: mset.state(), Config: mset.config()} + resp.StreamInfo = &StreamInfo{ + Created: mset.createdTime(), + State: mset.state(), + Config: mset.config(), + } resp.DidCreate = true s.sendAPIResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(resp)) } @@ -1388,9 +1318,9 @@ func (s *Server) jsStreamUpdateRequest(sub *subscription, c *client, _ *Account, return } - cfg, err := checkStreamCfg(&ncfg) - if err != nil { - resp.Error = NewJSStreamInvalidConfigError(err) + cfg, apiErr := s.checkStreamCfg(&ncfg, acc) + if apiErr != nil { + resp.Error = apiErr s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) return } @@ -1402,8 +1332,15 @@ func (s *Server) jsStreamUpdateRequest(sub *subscription, c *client, _ *Account, return } + // Handle clustered version here. 
if s.JetStreamIsClustered() { - s.jsClusteredStreamUpdateRequest(ci, acc, subject, reply, rmsg, &cfg) + // If we are inline with client, we still may need to do a callout for stream info + // during this call, so place in Go routine to not block client. + if c.kind != ROUTER && c.kind != GATEWAY { + go s.jsClusteredStreamUpdateRequest(ci, acc, subject, reply, rmsg, &cfg) + } else { + s.jsClusteredStreamUpdateRequest(ci, acc, subject, reply, rmsg, &cfg) + } return } @@ -1420,9 +1357,14 @@ func (s *Server) jsStreamUpdateRequest(sub *subscription, c *client, _ *Account, return } - js, _ := s.getJetStreamCluster() - - resp.StreamInfo = &StreamInfo{Created: mset.createdTime(), State: mset.state(), Config: mset.config(), Cluster: js.clusterInfo(mset.raftGroup())} + resp.StreamInfo = &StreamInfo{ + Created: mset.createdTime(), + State: mset.state(), + Config: mset.config(), + Domain: s.getOpts().JetStreamDomain, + Mirror: mset.mirrorInfo(), + Sources: mset.sourcesInfo(), + } s.sendAPIResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(resp)) } @@ -1609,7 +1551,7 @@ func (s *Server) jsStreamListRequest(sub *subscription, c *client, _ *Account, s // Clustered mode will invoke a scatter and gather. if s.JetStreamIsClustered() { - // Need to copy these off before sending.. + // Need to copy these off before sending.. don't move this inside startGoRoutine!!! msg = copyBytes(msg) s.startGoRoutine(func() { s.jsClusteredStreamListRequest(acc, ci, filter, offset, subject, reply, msg) }) return @@ -1634,13 +1576,15 @@ func (s *Server) jsStreamListRequest(sub *subscription, c *client, _ *Account, s } for _, mset := range msets[offset:] { + config := mset.config() resp.Streams = append(resp.Streams, &StreamInfo{ Created: mset.createdTime(), State: mset.state(), - Config: mset.config(), + Config: config, + Domain: s.getOpts().JetStreamDomain, Mirror: mset.mirrorInfo(), - Sources: mset.sourcesInfo()}, - ) + Sources: mset.sourcesInfo(), + }) if len(resp.Streams) >= JSApiListLimit { break } @@ -1684,8 +1628,10 @@ func (s *Server) jsStreamInfoRequest(sub *subscription, c *client, a *Account, s js.mu.RLock() isLeader, sa := cc.isLeader(), js.streamAssignment(acc.Name, streamName) + var offline bool if sa != nil { clusterWideConsCount = len(sa.consumers) + offline = s.allPeersOffline(sa.Group) } js.mu.RUnlock() @@ -1709,6 +1655,10 @@ func (s *Server) jsStreamInfoRequest(sub *subscription, c *client, a *Account, s s.sendDelayedAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp), nil) } return + } else if isLeader && offline { + resp.Error = NewJSStreamOfflineError() + s.sendDelayedAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp), nil) + return } // Check to see if we are a member of the group and if the group has no leader. @@ -1720,8 +1670,27 @@ func (s *Server) jsStreamInfoRequest(sub *subscription, c *client, a *Account, s resp.Error = NewJSClusterNotAvailError() // Delaying an error response gives the leader a chance to respond before us s.sendDelayedAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp), sa.Group) + return + } + + // We may be in process of electing a leader, but if this is a scale up from 1 we will still be the state leader + // while the new members work through the election and catchup process. + // Double check for that instead of exiting here and being silent. e.g. 
nats stream update test --replicas=3 + js.mu.RLock() + rg := sa.Group + var ourID string + if cc.meta != nil { + ourID = cc.meta.ID() + } + bail := !rg.isMember(ourID) + if !bail { + // We know we are a member here, if this group is new and we are preferred allow us to answer. + bail = rg.Preferred != ourID || time.Since(rg.node.Created()) > lostQuorumInterval + } + js.mu.RUnlock() + if bail { + return } - return } } @@ -1756,11 +1725,14 @@ func (s *Server) jsStreamInfoRequest(sub *subscription, c *client, a *Account, s js, _ := s.getJetStreamCluster() resp.StreamInfo = &StreamInfo{ - Created: mset.createdTime(), - State: mset.stateWithDetail(details), - Config: config, - Domain: s.getOpts().JetStreamDomain, - Cluster: js.clusterInfo(mset.raftGroup()), + Created: mset.createdTime(), + State: mset.stateWithDetail(details), + Config: config, + Domain: s.getOpts().JetStreamDomain, + Cluster: js.clusterInfo(mset.raftGroup()), + Mirror: mset.mirrorInfo(), + Sources: mset.sourcesInfo(), + Alternates: js.streamAlternates(ci, config.Name), } if clusterWideConsCount > 0 { resp.StreamInfo.State.Consumers = clusterWideConsCount @@ -1790,7 +1762,7 @@ func (s *Server) jsStreamInfoRequest(sub *subscription, c *client, a *Account, s } // Check for out of band catchups. if mset.hasCatchupPeers() { - mset.checkClusterInfo(resp.StreamInfo) + mset.checkClusterInfo(resp.StreamInfo.Cluster) } s.sendAPIResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(resp)) @@ -1877,6 +1849,9 @@ func (s *Server) jsStreamLeaderStepDownRequest(sub *subscription, c *client, _ * // Call actual stepdown. if mset != nil { if node := mset.raftNode(); node != nil { + mset.setLeader(false) + // TODO (mh) eventually make sure all go routines exited and all channels are cleared + time.Sleep(250 * time.Millisecond) node.StepDown() } } @@ -1978,7 +1953,12 @@ func (s *Server) jsConsumerLeaderStepDownRequest(sub *subscription, c *client, _ } // Call actual stepdown. 
- o.raftNode().StepDown() + if n := o.raftNode(); n != nil { + o.setLeader(false) + // TODO (mh) eventually make sure all go routines exited and all channels are cleared + time.Sleep(250 * time.Millisecond) + n.StepDown() + } resp.Success = true s.sendAPIResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(resp)) @@ -2530,16 +2510,13 @@ func (s *Server) jsMsgGetRequest(sub *subscription, c *client, _ *Account, subje return } - var subj string - var hdr []byte - var data []byte - var ts int64 - seq := req.Seq + var svp StoreMsg + var sm *StoreMsg if req.Seq > 0 { - subj, hdr, data, ts, err = mset.store.LoadMsg(req.Seq) + sm, err = mset.store.LoadMsg(req.Seq, &svp) } else { - subj, seq, hdr, data, ts, err = mset.store.LoadLastMsg(req.LastFor) + sm, err = mset.store.LoadLastMsg(req.LastFor, &svp) } if err != nil { resp.Error = NewJSNoMessageFoundError() @@ -2547,11 +2524,11 @@ func (s *Server) jsMsgGetRequest(sub *subscription, c *client, _ *Account, subje return } resp.Message = &StoredMsg{ - Subject: subj, - Sequence: seq, - Header: hdr, - Data: data, - Time: time.Unix(0, ts).UTC(), + Subject: sm.subj, + Sequence: sm.seq, + Header: sm.hdr, + Data: sm.msg, + Time: time.Unix(0, sm.ts).UTC(), } s.sendAPIResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(resp)) } @@ -2677,6 +2654,23 @@ func (s *Server) jsStreamPurgeRequest(sub *subscription, c *client, _ *Account, s.sendAPIResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(resp)) } +func (acc *Account) jsNonClusteredStreamLimitsCheck(cfg *StreamConfig) *ApiError { + selectedLimits, tier, jsa, apiErr := acc.selectLimits(cfg) + if apiErr != nil { + return apiErr + } + jsa.mu.RLock() + defer jsa.mu.RUnlock() + if selectedLimits.MaxStreams > 0 && jsa.countStreams(tier, cfg) >= selectedLimits.MaxStreams { + return NewJSMaximumStreamsLimitError() + } + reserved := jsa.tieredReservation(tier, cfg) + if err := jsa.js.checkAllLimits(selectedLimits, cfg, reserved, 0); err != nil { + return NewJSStreamLimitsError(err, Unless(err)) + } + return nil +} + // Request to restore a stream. func (s *Server) jsStreamRestoreRequest(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { if c == nil || !s.JetStreamIsLeader() { @@ -2713,11 +2707,25 @@ func (s *Server) jsStreamRestoreRequest(sub *subscription, c *client, _ *Account req.Config.Name = stream } + // check stream config at the start of the restore process, not at the end + cfg, apiErr := s.checkStreamCfg(&req.Config, acc) + if apiErr != nil { + resp.Error = apiErr + s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) + return + } + if s.JetStreamIsClustered() { s.jsClusteredStreamRestoreRequest(ci, acc, &req, stream, subject, reply, rmsg) return } + if err := acc.jsNonClusteredStreamLimitsCheck(&cfg); err != nil { + resp.Error = err + s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) + return + } + if _, err := acc.lookupStream(stream); err == nil { resp.Error = NewJSStreamNameExistError() s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) @@ -2782,7 +2790,7 @@ func (s *Server) processStreamRestore(ci *ClientInfo, acc *Account, cfg *StreamC // For signaling to upper layers. 
resultCh := make(chan result, 1) - activeQ := newIPQueue() // of int + activeQ := s.newIPQueue(fmt.Sprintf("[ACC:%s] stream '%s' restore", acc.Name, streamName)) // of int var total int @@ -2861,6 +2869,7 @@ func (s *Server) processStreamRestore(ci *ClientInfo, acc *Account, cfg *StreamC tfile.Close() os.Remove(tfile.Name()) sub.client.processUnsub(sub.sid) + activeQ.unregister() }() const activityInterval = 5 * time.Second @@ -3057,8 +3066,8 @@ func (s *Server) jsStreamSnapshotRequest(sub *subscription, c *client, _ *Accoun } // Default chunk size for now. -const defaultSnapshotChunkSize = 256 * 1024 -const defaultSnapshotWindowSize = 32 * 1024 * 1024 // 32MB +const defaultSnapshotChunkSize = 128 * 1024 +const defaultSnapshotWindowSize = 8 * 1024 * 1024 // 8MB // streamSnapshot will stream out our snapshot to the reply subject. func (s *Server) streamSnapshot(ci *ClientInfo, acc *Account, mset *stream, sr *SnapshotResult, req *JSApiStreamSnapshotRequest) { @@ -3121,7 +3130,7 @@ func (s *Server) streamSnapshot(ci *ClientInfo, acc *Account, mset *stream, sr * } // Wait on acks for flow control if past our window size. - // Wait up to 1ms for now if no acks received. + // Wait up to 10ms for now if no acks received. if atomic.LoadInt32(&out) > defaultSnapshotWindowSize { select { case <-acks: @@ -3177,32 +3186,19 @@ func (s *Server) jsConsumerCreate(sub *subscription, c *client, a *Account, subj return } - // We reject if flow control is set without heartbeats. - if req.Config.FlowControl && req.Config.Heartbeat == 0 { - resp.Error = NewJSConsumerWithFlowControlNeedsHeartbeatsError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - - // Make sure we have sane defaults. - setConsumerConfigDefaults(&req.Config) - - // Check if we have a BackOff defined that MaxDeliver is within range etc. - if lbo := len(req.Config.BackOff); lbo > 0 && req.Config.MaxDeliver <= lbo { - resp.Error = NewJSConsumerMaxDeliverBackoffError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } + var js *jetStream + isClustered := s.JetStreamIsClustered() // Determine if we should proceed here when we are in clustered mode. - if s.JetStreamIsClustered() { + if isClustered { if req.Config.Direct { // Check to see if we have this stream and are the stream leader. if !acc.JetStreamIsStreamLeader(streamName) { return } } else { - js, cc := s.getJetStreamCluster() + var cc *jetStreamCluster + js, cc = s.getJetStreamCluster() if js == nil || cc == nil { return } @@ -3262,11 +3258,18 @@ func (s *Server) jsConsumerCreate(sub *subscription, c *client, a *Account, subj } } - if s.JetStreamIsClustered() && !req.Config.Direct { + if isClustered && !req.Config.Direct { s.jsClusteredConsumerRequest(ci, acc, subject, reply, rmsg, req.Stream, &req.Config) return } + // If we are here we are single server mode. + if req.Config.Replicas > 1 { + resp.Error = NewJSStreamReplicasNotSupportedError() + s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) + return + } + stream, err := acc.lookupStream(req.Stream) if err != nil { resp.Error = NewJSStreamNotFoundError(Unless(err)) @@ -3461,6 +3464,7 @@ func (s *Server) jsConsumerListRequest(sub *subscription, c *client, _ *Account, // Clustered mode will invoke a scatter and gather. if s.JetStreamIsClustered() { + // Need to copy these off before sending.. don't move this inside startGoRoutine!!! 
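// The snapshot sender above now uses 128KB chunks and an 8MB in-flight window. A rough
// sketch of that flow-control loop with simplified types: stop writing once unacked
// bytes exceed the window, and wait briefly for acks before probing again.
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

const (
	chunkSize  = 128 * 1024
	windowSize = 8 * 1024 * 1024
)

func sendChunks(chunks [][]byte, send func([]byte), acks <-chan int32) {
	var out int32 // unacknowledged bytes in flight
	for _, c := range chunks {
		for atomic.LoadInt32(&out) > windowSize {
			select {
			case n := <-acks:
				atomic.AddInt32(&out, -n)
			case <-time.After(10 * time.Millisecond):
				// No ack yet; re-check the window.
			}
		}
		atomic.AddInt32(&out, int32(len(c)))
		send(c)
	}
}

func main() {
	acks := make(chan int32, 1024)
	send := func(c []byte) { acks <- int32(len(c)) } // the receiver acks instantly here
	chunks := make([][]byte, 100)
	for i := range chunks {
		chunks[i] = make([]byte, chunkSize)
	}
	sendChunks(chunks, send, acks)
	fmt.Println("sent", len(chunks), "chunks")
}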
msg = copyBytes(msg) s.startGoRoutine(func() { s.jsClusteredConsumerListRequest(acc, ci, offset, streamName, subject, reply, msg) @@ -3530,6 +3534,10 @@ func (s *Server) jsConsumerInfoRequest(sub *subscription, c *client, _ *Account, js.mu.RLock() isLeader, sa, ca := cc.isLeader(), js.streamAssignment(acc.Name, streamName), js.consumerAssignment(acc.Name, streamName, consumerName) ourID := cc.meta.ID() + var offline bool + if ca != nil { + offline = s.allPeersOffline(ca.Group) + } js.mu.RUnlock() if isLeader && ca == nil { @@ -3557,6 +3565,10 @@ func (s *Server) jsConsumerInfoRequest(sub *subscription, c *client, _ *Account, s.sendDelayedAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp), nil) } return + } else if isLeader && offline { + resp.Error = NewJSConsumerOfflineError() + s.sendDelayedAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp), nil) + return } // Check to see if we are a member of the group and if the group has no leader. @@ -3574,14 +3586,15 @@ func (s *Server) jsConsumerInfoRequest(sub *subscription, c *client, _ *Account, s.sendDelayedAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp), ca.Group) return } - if ca == nil { - return - } // We have a consumer assignment. js.mu.RLock() var node RaftNode + var leaderNotPartOfGroup bool if rg := ca.Group; rg != nil && rg.node != nil && rg.isMember(ourID) { node = rg.node + if gl := node.GroupLeader(); gl != _EMPTY_ && !rg.isMember(gl) { + leaderNotPartOfGroup = true + } } js.mu.RUnlock() // Check if we should ignore all together. @@ -3599,7 +3612,12 @@ func (s *Server) jsConsumerInfoRequest(sub *subscription, c *client, _ *Account, } return } + // If we are a member and we have a group leader or we had a previous leader consider bailing out. if node != nil && (node.GroupLeader() != _EMPTY_ || node.HadPreviousLeader()) { + if leaderNotPartOfGroup { + resp.Error = NewJSConsumerOfflineError() + s.sendDelayedAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp), nil) + } return } // If we are here we are a member and this is just a new consumer that does not have a leader yet. diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go index 584c55d3..7fc1350e 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go @@ -350,7 +350,8 @@ func (cc *jetStreamCluster) isCurrent() bool { return cc.meta.Current() } -// isStreamCurrent will determine if this node is a participant for the stream and if its up to date. +// isStreamCurrent will determine if the stream is up to date. +// For R1 it will make sure the stream is present on this server. // Read lock should be held. func (cc *jetStreamCluster) isStreamCurrent(account, stream string) bool { if cc == nil { @@ -366,12 +367,11 @@ func (cc *jetStreamCluster) isStreamCurrent(account, stream string) bool { return false } rg := sa.Group - if rg == nil || rg.node == nil { + if rg == nil { return false } - isCurrent := rg.node.Current() - if isCurrent { + if rg.node == nil || rg.node.Current() { // Check if we are processing a snapshot and are catching up. acc, err := cc.s.LookupAccount(account) if err != nil { @@ -384,9 +384,37 @@ func (cc *jetStreamCluster) isStreamCurrent(account, stream string) bool { if mset.isCatchingUp() { return false } + // Success. 
+ return true } - return isCurrent + return false +} + +// isConsumerCurrent will determine if the consumer is up to date. +// For R1 it will make sure the consunmer is present on this server. +// Read lock should be held. +func (cc *jetStreamCluster) isConsumerCurrent(account, stream, consumer string) bool { + if cc == nil { + // Non-clustered mode + return true + } + acc, err := cc.s.LookupAccount(account) + if err != nil { + return false + } + mset, err := acc.lookupStream(stream) + if err != nil { + return false + } + o := mset.lookupConsumer(consumer) + if o == nil { + return false + } + if n := o.raftNode(); n != nil && !n.Current() { + return false + } + return true } func (a *Account) getJetStreamFromAccount() (*Server, *jetStream, *jsAccount) { @@ -495,6 +523,14 @@ func (js *jetStream) isClustered() bool { return isClustered } +// isClusteredNoLock returns if we are clustered, but unlike isClustered() does +// not use the jetstream's lock, instead, uses an atomic operation. +// There are situations where some code wants to know if we are clustered but +// can't use js.isClustered() without causing a lock inversion. +func (js *jetStream) isClusteredNoLock() bool { + return atomic.LoadInt32(&js.clustered) == 1 +} + func (js *jetStream) setupMetaGroup() error { s := js.srv s.Noticef("Creating JetStream metadata controller") @@ -557,7 +593,7 @@ func (js *jetStream) setupMetaGroup() error { } // Start up our meta node. - n, err := s.startRaftNode(cfg) + n, err := s.startRaftNode(sysAcc.GetName(), cfg) if err != nil { s.Warnf("Could not start metadata controller: %v", err) return err @@ -579,6 +615,7 @@ func (js *jetStream) setupMetaGroup() error { s: s, c: c, } + atomic.StoreInt32(&js.clustered, 1) c.registerWithAccount(sacc) js.srv.startGoRoutine(js.monitorCluster) @@ -630,6 +667,9 @@ func (js *jetStream) isGroupLeaderless(rg *raftGroup) bool { cc := js.cluster // If we are not a member we can not say.. + if cc.meta == nil { + return false + } if !rg.isMember(cc.meta.ID()) { return false } @@ -773,6 +813,12 @@ func (cc *jetStreamCluster) isConsumerLeader(account, stream, consumer string) b return false } +// During recovery track any stream and consumer delete operations. +type recoveryRemovals struct { + streams map[string]*streamAssignment + consumers map[string]*consumerAssignment +} + func (js *jetStream) monitorCluster() { s, n := js.server(), js.getMetaGroup() qch, lch, aq := n.QuitC(), n.LeadChangeC(), n.ApplyQ() @@ -816,6 +862,11 @@ func (js *jetStream) monitorCluster() { } } + rm := &recoveryRemovals{ + streams: make(map[string]*streamAssignment), + consumers: make(map[string]*consumerAssignment), + } + for { select { case <-s.quitCh: @@ -828,12 +879,21 @@ func (js *jetStream) monitorCluster() { if cei == nil { // Signals we have replayed all of our metadata. isRecovering = false + // Process any removes that are still valid after recovery. + for _, sa := range rm.streams { + js.processStreamRemoval(sa) + } + for _, ca := range rm.consumers { + js.processConsumerRemoval(ca) + } + // Clear. + rm = nil s.Debugf("Recovered JetStream cluster metadata") continue } ce := cei.(*CommittedEntry) // FIXME(dlc) - Deal with errors. 
- if didSnap, didRemoval, err := js.applyMetaEntries(ce.Entries, isRecovering); err == nil { + if didSnap, didRemoval, err := js.applyMetaEntries(ce.Entries, isRecovering, rm); err == nil { _, nb := n.Applied(ce.Index) if js.hasPeerEntries(ce.Entries) || didSnap || (didRemoval && time.Since(lastSnapTime) > 2*time.Second) { // Since we received one make sure we have our own since we do not store @@ -941,6 +1001,15 @@ type writeableStreamAssignment struct { Consumers []*consumerAssignment } +func (js *jetStream) clusterStreamConfig(accName, streamName string) (StreamConfig, bool) { + js.mu.RLock() + defer js.mu.RUnlock() + if sa, ok := js.cluster.streams[accName][streamName]; ok { + return *sa.Config, true + } + return StreamConfig{}, false +} + func (js *jetStream) metaSnapshot() []byte { var streams []writeableStreamAssignment @@ -989,6 +1058,7 @@ func (js *jetStream) applyMetaSnapshot(buf []byte, isRecovering bool) error { // Build our new version here outside of js. streams := make(map[string]map[string]*streamAssignment) for _, wsa := range wsas { + fixCfgMirrorWithDedupWindow(wsa.Config) as := streams[wsa.Client.serviceAccount()] if as == nil { as = make(map[string]*streamAssignment) @@ -1219,6 +1289,56 @@ func (js *jetStream) processRemovePeer(peer string) { } } +// Remove old peers after the new peers are caught up. +// We are the old stream leader here. +func (js *jetStream) removeOldPeers(mset *stream, newPreferred string) { + // Make sure still valid. + mset.mu.Lock() + isValid := mset.qch != nil + mset.mu.Unlock() + + if !isValid { + return + } + + sa := mset.streamAssignment() + ci := js.clusterInfo(mset.raftGroup()) + + js.mu.Lock() + defer js.mu.Unlock() + + // Make sure still valid. + if js.srv == nil || !js.srv.isRunning() { + return + } + + cc, csa := js.cluster, sa.copyGroup() + numExpandedPeers := len(csa.Group.Peers) + csa.Group.Peers = csa.Group.Peers[:0] + + for _, r := range ci.Replicas { + if r.cluster != ci.Name { + csa.Group.Peers = append(csa.Group.Peers, r.peer) + } + } + + // Now do consumers actually first here, followed by the owning stream. + for _, ca := range csa.consumers { + cca := ca.copyGroup() + numPeers := len(cca.Group.Peers) + if numPeers == numExpandedPeers { + cca.Group.Peers = csa.Group.Peers + cca.Group.Preferred = _EMPTY_ + } else { + cca.Group.Peers = cca.Group.Peers[len(cca.Group.Peers)-1:] + } + cc.meta.ForwardProposal(encodeAddConsumerAssignment(cca)) + } + + csa.Group.Preferred = newPreferred + cc.meta.ForwardProposal(encodeUpdateStreamAssignment(csa)) +} + // Assumes all checks have already been done. 
func (js *jetStream) removePeerFromStream(sa *streamAssignment, peer string) bool { js.mu.Lock() @@ -1265,7 +1385,7 @@ func (js *jetStream) hasPeerEntries(entries []*Entry) bool { return false } -func (js *jetStream) applyMetaEntries(entries []*Entry, isRecovering bool) (bool, bool, error) { +func (js *jetStream) applyMetaEntries(entries []*Entry, isRecovering bool, rm *recoveryRemovals) (bool, bool, error) { var didSnap, didRemove bool for _, e := range entries { if e.Type == EntrySnapshot { @@ -1290,6 +1410,8 @@ func (js *jetStream) applyMetaEntries(entries []*Entry, isRecovering bool) (bool } if isRecovering { js.setStreamAssignmentRecovering(sa) + key := sa.Client.Account + ":" + sa.Config.Name + delete(rm.streams, key) } didRemove = js.processStreamAssignment(sa) case removeStreamOp: @@ -1300,9 +1422,12 @@ func (js *jetStream) applyMetaEntries(entries []*Entry, isRecovering bool) (bool } if isRecovering { js.setStreamAssignmentRecovering(sa) + key := sa.Client.Account + ":" + sa.Config.Name + rm.streams[key] = sa + } else { + js.processStreamRemoval(sa) + didRemove = true } - js.processStreamRemoval(sa) - didRemove = true case assignConsumerOp: ca, err := decodeConsumerAssignment(buf[1:]) if err != nil { @@ -1311,6 +1436,8 @@ func (js *jetStream) applyMetaEntries(entries []*Entry, isRecovering bool) (bool } if isRecovering { js.setConsumerAssignmentRecovering(ca) + key := ca.Client.Account + ":" + ca.Name + delete(rm.consumers, key) } js.processConsumerAssignment(ca) case assignCompressedConsumerOp: @@ -1321,6 +1448,8 @@ func (js *jetStream) applyMetaEntries(entries []*Entry, isRecovering bool) (bool } if isRecovering { js.setConsumerAssignmentRecovering(ca) + key := ca.Client.Account + ":" + ca.Name + delete(rm.consumers, key) } js.processConsumerAssignment(ca) case removeConsumerOp: @@ -1331,9 +1460,12 @@ func (js *jetStream) applyMetaEntries(entries []*Entry, isRecovering bool) (bool } if isRecovering { js.setConsumerAssignmentRecovering(ca) + key := ca.Client.Account + ":" + ca.Name + rm.consumers[key] = ca + } else { + js.processConsumerRemoval(ca) + didRemove = true } - js.processConsumerRemoval(ca) - didRemove = true case updateStreamOp: sa, err := decodeStreamAssignment(buf[1:]) if err != nil { @@ -1378,7 +1510,7 @@ func (rg *raftGroup) setPreferred() { } // createRaftGroup is called to spin up this raft group if needed. 
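// A compact sketch of the recovery bookkeeping added to monitorCluster/applyMetaEntries
// above: while replaying the metadata log, removals are only recorded; a later
// assignment for the same account:name cancels the pending removal, and whatever is
// still pending is applied once replay completes. The op type and callbacks are
// simplified stand-ins for the stream/consumer assignment structures.
package main

import "fmt"

type op struct {
	kind string // "assign" or "remove"
	key  string // account + ":" + name
}

func replayWithDeferredRemovals(log []op, apply, remove func(string)) {
	pending := make(map[string]bool)
	for _, e := range log {
		switch e.kind {
		case "assign":
			delete(pending, e.key) // recreated later in the log, keep it
			apply(e.key)
		case "remove":
			pending[e.key] = true // defer until replay is done
		}
	}
	for key := range pending {
		remove(key)
	}
}

func main() {
	log := []op{
		{"assign", "ACC:ORDERS"},
		{"remove", "ACC:ORDERS"},
		{"assign", "ACC:ORDERS"}, // came back later: the removal must not win
		{"assign", "ACC:EVENTS"},
		{"remove", "ACC:EVENTS"}, // still gone at the end of the log
	}
	replayWithDeferredRemovals(log,
		func(k string) { fmt.Println("assign", k) },
		func(k string) { fmt.Println("remove", k) },
	)
}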
-func (js *jetStream) createRaftGroup(rg *raftGroup, storage StorageType) error { +func (js *jetStream) createRaftGroup(accName string, rg *raftGroup, storage StorageType) error { js.mu.Lock() defer js.mu.Unlock() s, cc := js.srv, js.cluster @@ -1411,7 +1543,7 @@ func (js *jetStream) createRaftGroup(rg *raftGroup, storage StorageType) error { var store StreamStore if storage == FileStorage { fs, err := newFileStore( - FileStoreConfig{StoreDir: storeDir, BlockSize: 4_000_000, AsyncFlush: false, SyncInterval: 5 * time.Minute}, + FileStoreConfig{StoreDir: storeDir, BlockSize: defaultMediumBlockSize, AsyncFlush: false, SyncInterval: 5 * time.Minute}, StreamConfig{Name: rg.Name, Storage: FileStorage}, ) if err != nil { @@ -1434,7 +1566,7 @@ func (js *jetStream) createRaftGroup(rg *raftGroup, storage StorageType) error { s.bootstrapRaftNode(cfg, rg.Peers, true) } - n, err := s.startRaftNode(cfg) + n, err := s.startRaftNode(accName, cfg) if err != nil || n == nil { s.Debugf("Error creating raft group: %v", err) return err @@ -1442,10 +1574,9 @@ func (js *jetStream) createRaftGroup(rg *raftGroup, storage StorageType) error { rg.node = n // See if we are preferred and should start campaign immediately. - if n.ID() == rg.Preferred { + if n.ID() == rg.Preferred && n.Term() == 0 { n.Campaign() } - return nil } @@ -1480,7 +1611,7 @@ func (mset *stream) removeNode() { } // Monitor our stream node for this stream. -func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment) { +func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnapshot bool) { s, cc, n := js.server(), js.cluster, sa.Group.node defer s.grWG.Done() @@ -1489,7 +1620,7 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment) { return } - qch, lch, aq := n.QuitC(), n.LeadChangeC(), n.ApplyQ() + qch, lch, aq, uch := n.QuitC(), n.LeadChangeC(), n.ApplyQ(), mset.updateC() s.Debugf("Starting stream monitor for '%s > %s' [%s]", sa.Client.serviceAccount(), sa.Config.Name, n.Group()) defer s.Debugf("Exiting stream monitor for '%s > %s' [%s]", sa.Client.serviceAccount(), sa.Config.Name, n.Group()) @@ -1508,11 +1639,13 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment) { const ( compactInterval = 2 * time.Minute - compactSizeMin = 32 * 1024 * 1024 - compactNumMin = 8192 + compactSizeMin = 8 * 1024 * 1024 + compactNumMin = 65536 ) - t := time.NewTicker(compactInterval) + // Spread these out for large numbers on server restart. + rci := time.Duration(rand.Int63n(int64(time.Minute))) + t := time.NewTicker(compactInterval + rci) defer t.Stop() js.mu.RLock() @@ -1545,6 +1678,35 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment) { restoreDoneCh := make(<-chan error) isRecovering := true + // For migration tracking. + var migrating bool + var peerGroup peerMigrateType + var mmt *time.Ticker + var mmtc <-chan time.Time + + startMigrationMonitoring := func() { + if mmt == nil { + mmt = time.NewTicker(1 * time.Second) + mmtc = mmt.C + } + } + + stopMigrationMonitoring := func() { + if mmt != nil { + mmt.Stop() + mmtc = nil + } + } + defer stopMigrationMonitoring() + + // This is triggered during a scale up from 1 to clustered mode. We need the new followers to catchup, + // similar to how we trigger the catchup mechanism post a backup/restore. It's ok to do here and preferred + // over waiting to be elected, this just queues it up for the new members to see first and trigger the above + // RAFT layer catchup mechanism. 
+ if sendSnapshot && mset != nil && n != nil { + n.SendSnapshot(mset.stateSnapshot()) + } + for { select { case <-s.quitCh: @@ -1602,9 +1764,65 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment) { } else if n.GroupLeader() != noLeader { js.setStreamAssignmentRecovering(sa) } + + // Process our leader change. js.processStreamLeaderChange(mset, isLeader) + + // We may receive a leader change after the stream assignment which would cancel us + // monitoring for this closely. So re-assess our state here as well. + migrating, peerGroup = mset.isMigrating() + + // Check for migrations here. We set the state on the stream assignment update below. + if isLeader && migrating { + if peerGroup == oldPeerGroup { + startMigrationMonitoring() + } else { + stopMigrationMonitoring() + } + } case <-t.C: doSnapshot() + case <-uch: + // We get this when we have a new stream assignment caused by an update. We want + // to know if we are migrating. + migrating, peerGroup = mset.isMigrating() + // If we are migrating and in the old peer group and we are leader, monitor for the + // new peers to be caught up. We could not be leader yet, so we will do same check below + // on leadership change. + if isLeader && migrating && peerGroup == oldPeerGroup { + doSnapshot() + startMigrationMonitoring() + } + case <-mmtc: + if !isLeader { + // We are no longer leader, so not our job. + stopMigrationMonitoring() + continue + } + // Check to see that we have someone caught up. + // TODO(dlc) - For now start checking after a second in order to give proper time to kick in any catchup logic needed. + // What we really need to do longer term is know if we need catchup and make sure that process has kicked off and/or completed. + ci := js.clusterInfo(mset.raftGroup()) + // The polling interval of one second allows this to be kicked in if needed. + if mset.hasCatchupPeers() { + mset.checkClusterInfo(ci) + } + // Track the new peers and check the ones that are current. + var newPeers []*PeerInfo + quorum := mset.cfg.Replicas/2 + 1 + for _, r := range ci.Replicas { + if r.cluster != ci.Name { + if r.Current { + newPeers = append(newPeers, r) + } + } + } + // If all are current we are good, or if we have some offline and we have a quorum. + if lnp := len(newPeers); lnp >= quorum { + stopMigrationMonitoring() + // Remove the old peers and transfer leadership. + time.AfterFunc(2*time.Second, func() { js.removeOldPeers(mset, newPeers[0].peer) }) + } case err := <-restoreDoneCh: // We have completed a restore from snapshot on this server. The stream assignment has // already been assigned but the replicas will need to catch up out of band. Consumers @@ -1617,9 +1835,10 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment) { sa.Restore = nil // If we were successful lookup up our stream now. if err == nil { - mset, err = acc.lookupStream(sa.Config.Name) - if mset != nil { + if mset, err = acc.lookupStream(sa.Config.Name); mset != nil { mset.setStreamAssignment(sa) + // Make sure to update our updateC which would have been nil. + uch = mset.updateC() } } if err != nil { @@ -1647,7 +1866,7 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment) { panic("Finished restore but not leader") } // Trigger the stream followers to catchup. 
- if n := mset.raftNode(); n != nil { + if n = mset.raftNode(); n != nil { n.SendSnapshot(mset.stateSnapshot()) } js.processStreamLeaderChange(mset, isLeader) @@ -1656,11 +1875,13 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment) { // These are not currently assigned so we will need to do so here. if consumers := mset.getPublicConsumers(); len(consumers) > 0 { for _, o := range consumers { - rg := cc.createGroupForConsumer(sa) + name, cfg := o.String(), o.config() + rg := cc.createGroupForConsumer(&cfg, sa) // Pick a preferred leader. rg.setPreferred() - name, cfg := o.String(), o.config() + // Place our initial state here as well for assignment distribution. + state, _ := o.store.State() ca := &consumerAssignment{ Group: rg, Stream: sa.Config.Name, @@ -1668,7 +1889,7 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment) { Config: &cfg, Client: sa.Client, Created: o.createdTime(), - State: o.readStoreState(), + State: state, } // We make these compressed in case state is complex. @@ -1701,11 +1922,47 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment) { } } +// When we are migration denotes if we ourselves are part of the old peer set or the new one. +// Both types will be running at the same time as we scale up to extend into the new cluster. +// Once detected we will us our type to dictate our behavior. +type peerMigrateType int8 + +const ( + oldPeerGroup = peerMigrateType(iota) + newPeerGroup +) + +// Determine if we are migrating and if so if we are part of the old or new set. +func (mset *stream) isMigrating() (bool, peerMigrateType) { + mset.mu.RLock() + s, js, sa := mset.srv, mset.js, mset.sa + mset.mu.RUnlock() + + js.mu.RLock() + defer js.mu.RUnlock() + + // During migration we will always be R>1, even when we start R1. + // So if we do not have a group or node we no we are not migrating. + if sa == nil || sa.Group == nil || sa.Group.node == nil { + return false, oldPeerGroup + } + // The sign of migration is if our group peer count != configured replica count. + if sa.Config.Replicas == len(sa.Group.Peers) { + return false, oldPeerGroup + } + // So we believe we are migrating here, need to determine if we are the old set or new set. + // We can shor circuit this based on our group assigned cluster vs our own. + if sa.Group.Cluster == s.cachedClusterName() { + return true, newPeerGroup + } + return true, oldPeerGroup +} + // resetClusteredState is called when a clustered stream had a sequence mismatch and needs to be reset. func (mset *stream) resetClusteredState(err error) bool { mset.mu.RLock() s, js, jsa, sa, acc, node := mset.srv, mset.js, mset.jsa, mset.sa, mset.acc, mset.node - stype, isLeader := mset.cfg.Storage, mset.isLeader() + stype, isLeader, tierName := mset.cfg.Storage, mset.isLeader(), mset.tier mset.mu.RUnlock() // Stepdown regardless if we are the leader here. @@ -1720,7 +1977,7 @@ func (mset *stream) resetClusteredState(err error) bool { } // Account - if jsa.limitsExceeded(stype) { + if exceeded, _ := jsa.limitsExceeded(stype, tierName); exceeded { s.Warnf("stream '%s > %s' errored, account resources exceeded", acc, mset.name()) return false } @@ -1807,7 +2064,8 @@ func (js *jetStream) applyStreamEntries(mset *stream, ce *CommittedEntry, isReco // We can skip if we know this is less than what we already have. 
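// A small sketch of the migration readiness check in the stream monitor above: peers
// outside the stream's current cluster that report Current are counted, and once at
// least a quorum of the configured replica count is caught up, leadership can be handed
// to the new peer set and the old peers removed. Types are simplified stand-ins.
package main

import "fmt"

type replica struct {
	cluster string
	current bool
}

func migrationReady(replicas []replica, ourCluster string, configuredReplicas int) bool {
	quorum := configuredReplicas/2 + 1
	caughtUp := 0
	for _, r := range replicas {
		if r.cluster != ourCluster && r.current {
			caughtUp++
		}
	}
	return caughtUp >= quorum
}

func main() {
	peers := []replica{
		{"old-cluster", true},
		{"new-cluster", true},
		{"new-cluster", true},
		{"new-cluster", false}, // still catching up
	}
	fmt.Println(migrationReady(peers, "old-cluster", 3)) // true: 2 of 3 new peers are current
}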
if lseq < last { - s.Debugf("Apply stream entries skipping message with sequence %d with last of %d", lseq, last) + s.Debugf("Apply stream entries for '%s > %s' skipping message with sequence %d with last of %d", + mset.account(), mset.name(), lseq, last) continue } @@ -1831,7 +2089,8 @@ func (js *jetStream) applyStreamEntries(mset *stream, ce *CommittedEntry, isReco if isClusterResetErr(err) || isOutOfSpaceErr(err) { return err } - s.Debugf("Apply stream entries error processing message: %v", err) + s.Debugf("Apply stream entries for '%s > %s' got error processing message: %v", + mset.account(), mset.name(), err) } case deleteMsgOp: md, err := decodeMsgDelete(buf[1:]) @@ -1858,8 +2117,8 @@ func (js *jetStream) applyStreamEntries(mset *stream, ce *CommittedEntry, isReco } if err != nil && !isRecovering { - s.Debugf("JetStream cluster failed to delete msg %d from stream %q for account %q: %v", - md.Seq, md.Stream, md.Client.serviceAccount(), err) + s.Debugf("JetStream cluster failed to delete stream msg %d from '%s > %s': %v", + md.Seq, md.Client.serviceAccount(), md.Stream, err) } js.mu.RLock() @@ -1928,8 +2187,10 @@ func (js *jetStream) applyStreamEntries(mset *stream, ce *CommittedEntry, isReco if err := json.Unmarshal(e.Data, &snap); err != nil { return err } - if err := mset.processSnapshot(&snap); err != nil { - return err + if !mset.IsLeader() { + if err := mset.processSnapshot(&snap); err != nil { + return err + } } } } else if e.Type == EntryRemovePeer { @@ -2204,6 +2465,7 @@ func (js *jetStream) processStreamAssignment(sa *streamAssignment) bool { js.mu.Lock() if node := sa.Group.node; node != nil { if node.Leader() { + node.UpdateKnownPeers(sa.Group.Peers) node.StepDown() } node.ProposeRemovePeer(ourID) @@ -2270,15 +2532,22 @@ func (js *jetStream) processUpdateStreamAssignment(sa *streamAssignment) { sa.consumers = osa.consumers sa.err = osa.err + // If we detect we are scaling down to 1, non-clustered, and we had a previous node, clear it here. + if sa.Config.Replicas == 1 && sa.Group.node != nil { + sa.Group.node = nil + } + // Update our state. accStreams[stream] = sa cc.streams[acc.Name] = accStreams - // Make sure we respond. + // Make sure we respond if we are a member. if isMember { sa.responded = false + } else { + // Make sure to clean up any old node in case this stream moves back here. + sa.Group.node = nil } - js.mu.Unlock() // Check if this is for us.. @@ -2286,8 +2555,11 @@ func (js *jetStream) processUpdateStreamAssignment(sa *streamAssignment) { js.processClusterUpdateStream(acc, osa, sa) } else if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil { // We have one here even though we are not a member. This can happen on re-assignment. 
- s.Debugf("JetStream removing stream '%s > %s' from this server, re-assigned", sa.Client.serviceAccount(), sa.Config.Name) + s.Debugf("JetStream removing stream '%s > %s' from this server", sa.Client.serviceAccount(), sa.Config.Name) if node := mset.raftNode(); node != nil { + if node.Leader() { + node.StepDown(sa.Group.Preferred) + } node.ProposeRemovePeer(ourID) } mset.stop(true, false) @@ -2304,37 +2576,39 @@ func (js *jetStream) processClusterUpdateStream(acc *Account, osa, sa *streamAss js.mu.Lock() s, rg := js.srv, sa.Group client, subject, reply := sa.Client, sa.Subject, sa.Reply - alreadyRunning, numReplicas := osa.Group.node != nil, sa.Config.Replicas + alreadyRunning, numReplicas := osa.Group.node != nil, len(rg.Peers) needsNode := rg.node == nil - storage := sa.Config.Storage + storage, cfg := sa.Config.Storage, sa.Config hasResponded := sa.responded sa.responded = true js.mu.Unlock() - mset, err := acc.lookupStream(sa.Config.Name) + mset, err := acc.lookupStream(cfg.Name) if err == nil && mset != nil { var needsSetLeader bool if !alreadyRunning && numReplicas > 1 { if needsNode { - js.createRaftGroup(rg, storage) + js.createRaftGroup(acc.GetName(), rg, storage) } - s.startGoRoutine(func() { js.monitorStream(mset, sa) }) + s.startGoRoutine(func() { js.monitorStream(mset, sa, needsNode) }) } else if numReplicas == 1 && alreadyRunning { // We downgraded to R1. Make sure we cleanup the raft node and the stream monitor. mset.removeNode() // Make sure we are leader now that we are R1. needsSetLeader = true - // In case we nned to shutdown the cluster specific subs, etc. + // In case we need to shutdown the cluster specific subs, etc. mset.setLeader(false) js.mu.Lock() - sa.Group.node = nil + rg.node = nil js.mu.Unlock() } - mset.setStreamAssignment(sa) - if err = mset.update(sa.Config); err != nil { - s.Warnf("JetStream cluster error updating stream %q for account %q: %v", sa.Config.Name, acc.Name, err) - mset.setStreamAssignment(osa) + + if err = mset.update(cfg); err != nil { + s.Warnf("JetStream cluster error updating stream %q for account %q: %v", cfg.Name, acc.Name, err) } + // Set the new stream assignment. + mset.setStreamAssignment(sa) + // Make sure we are the leader now that we are R1. if needsSetLeader { mset.setLeader(true) @@ -2396,6 +2670,7 @@ func (js *jetStream) processClusterUpdateStream(acc *Account, osa, sa *streamAss Mirror: mset.mirrorInfo(), Sources: mset.sourcesInfo(), } + s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) } @@ -2413,7 +2688,7 @@ func (js *jetStream) processClusterCreateStream(acc *Account, sa *streamAssignme js.mu.RUnlock() // Process the raft group and make sure it's running if needed. - err := js.createRaftGroup(rg, storage) + err := js.createRaftGroup(acc.GetName(), rg, storage) // If we are restoring, create the stream if we are R>1 and not the preferred who handles the // receipt of the snapshot itself. @@ -2487,7 +2762,7 @@ func (js *jetStream) processClusterCreateStream(acc *Account, sa *streamAssignme // Start our monitoring routine. if rg.node != nil { if !alreadyRunning { - s.startGoRoutine(func() { js.monitorStream(mset, sa) }) + s.startGoRoutine(func() { js.monitorStream(mset, sa, false) }) } } else { // Single replica stream, process manually here. 
@@ -2534,8 +2809,9 @@ func (js *jetStream) processClusterCreateStream(acc *Account, sa *streamAssignme js.mu.RUnlock() for _, o := range consumers { - rg := cc.createGroupForConsumer(sa) name, cfg := o.String(), o.config() + rg := cc.createGroupForConsumer(&cfg, sa) + // Place our initial state here as well for assignment distribution. ca := &consumerAssignment{ Group: rg, @@ -2617,36 +2893,37 @@ func (js *jetStream) processClusterDeleteStream(sa *streamAssignment, isMember, js.mu.RLock() s := js.srv hadLeader := sa.Group.node == nil || sa.Group.node.GroupLeader() != noLeader - js.mu.RUnlock() - - acc, err := s.LookupAccount(sa.Client.serviceAccount()) - if err != nil { - s.Debugf("JetStream cluster failed to lookup account %q: %v", sa.Client.serviceAccount(), err) - return + offline := s.allPeersOffline(sa.Group) + var isMetaLeader bool + if cc := js.cluster; cc != nil { + isMetaLeader = cc.isLeader() } + js.mu.RUnlock() var resp = JSApiStreamDeleteResponse{ApiResponse: ApiResponse{Type: JSApiStreamDeleteResponseType}} + var err error + var acc *Account - // Go ahead and delete the stream. - mset, err := acc.lookupStream(sa.Config.Name) - if err != nil { - resp.Error = NewJSStreamNotFoundError(Unless(err)) - } else if mset != nil { - err = mset.stop(true, wasLeader) + // Go ahead and delete the stream if we have it and the account here. + if acc, _ = s.LookupAccount(sa.Client.serviceAccount()); acc != nil { + if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil { + err = mset.stop(true, wasLeader) + } } + // Always delete the node if present. if sa.Group.node != nil { sa.Group.node.Delete() } if !isMember || !wasLeader && hadLeader { - return + if !(offline && isMetaLeader) { + return + } } if err != nil { - if resp.Error == nil { - resp.Error = NewJSStreamGeneralError(err, Unless(err)) - } + resp.Error = NewJSStreamGeneralError(err, Unless(err)) s.sendAPIErrResponse(sa.Client, acc, sa.Subject, sa.Reply, _EMPTY_, s.jsonResponse(resp)) } else { resp.Success = true @@ -2658,7 +2935,7 @@ func (js *jetStream) processClusterDeleteStream(sa *streamAssignment, isMember, func (js *jetStream) processConsumerAssignment(ca *consumerAssignment) { js.mu.RLock() s, cc := js.srv, js.cluster - accName, stream, consumer := ca.Client.serviceAccount(), ca.Stream, ca.Name + accName, stream, consumerName := ca.Client.serviceAccount(), ca.Stream, ca.Name noMeta := cc == nil || cc.meta == nil var ourID string if !noMeta { @@ -2674,14 +2951,15 @@ func (js *jetStream) processConsumerAssignment(ca *consumerAssignment) { return } - if _, err := s.LookupAccount(accName); err != nil { + acc, err := s.LookupAccount(accName) + if err != nil { ll := fmt.Sprintf("Account [%s] lookup for consumer create failed: %v", accName, err) if isMember { // If we can not lookup the account and we are a member, send this result back to the metacontroller leader. result := &consumerAssignmentResult{ Account: accName, Stream: stream, - Consumer: consumer, + Consumer: consumerName, Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}, } result.Response.Error = NewJSNoAccountError() @@ -2699,6 +2977,9 @@ func (js *jetStream) processConsumerAssignment(ca *consumerAssignment) { return } + // Might need this below. + numReplicas := sa.Config.Replicas + // Track if this existed already. 
var wasExisting bool @@ -2728,17 +3009,62 @@ func (js *jetStream) processConsumerAssignment(ca *consumerAssignment) { if isMember { js.processClusterCreateConsumer(ca, state, wasExisting) } else { + // We need to be removed here, we are no longer assigned. + // Grab consumer if we have it. + var o *consumer + if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil { + o = mset.lookupConsumer(ca.Name) + } + // Check if we have a raft node running, meaning we are no longer part of the group but were. js.mu.Lock() if node := ca.Group.node; node != nil { + // We have one here even though we are not a member. This can happen on re-assignment. + s.Debugf("JetStream removing consumer '%s > %s > %s' from this server", sa.Client.serviceAccount(), sa.Config.Name, ca.Name) if node.Leader() { - node.StepDown() + s.Debugf("JetStream consumer '%s > %s > %s' is being removed and was the leader, will perform stepdown", + sa.Client.serviceAccount(), sa.Config.Name, ca.Name) + + peers, cn := node.Peers(), s.cachedClusterName() + migrating := numReplicas != len(peers) + + // Select a new peer to transfer to. If we are a migrating make sure its from the new cluster. + var npeer string + for _, r := range peers { + if !r.Current { + continue + } + if !migrating { + npeer = r.ID + break + } else if sir, ok := s.nodeToInfo.Load(r.ID); ok && sir != nil { + si := sir.(nodeInfo) + if si.cluster != cn { + npeer = r.ID + break + } + } + } + // Clear the raftnode from our consumer so that a subsequent o.delete will not also issue a stepdown. + if o != nil { + o.clearRaftNode() + } + // Manually handle the stepdown and deletion of the node. + node.UpdateKnownPeers(ca.Group.Peers) + node.StepDown(npeer) + node.Delete() + } else { + node.UpdateKnownPeers(ca.Group.Peers) } - node.ProposeRemovePeer(ourID) } + // Always clear the old node. ca.Group.node = nil ca.err = nil js.mu.Unlock() + + if o != nil { + o.deleteWithoutAdvisory() + } } } @@ -2810,17 +3136,30 @@ func (js *jetStream) processClusterCreateConsumer(ca *consumerAssignment, state return } + // Check if we already have this consumer running. + o := mset.lookupConsumer(ca.Name) + if !alreadyRunning { // Process the raft group and make sure its running if needed. - js.createRaftGroup(rg, mset.config().Storage) + storage := mset.config().Storage + if ca.Config.MemoryStorage { + storage = MemoryStorage + } + js.createRaftGroup(acc.GetName(), rg, storage) + } else { + // If we are clustered update the known peers. + js.mu.RLock() + if node := rg.node; node != nil { + node.UpdateKnownPeers(ca.Group.Peers) + } + js.mu.RUnlock() } // Check if we already have this consumer running. var didCreate bool - o := mset.lookupConsumer(ca.Name) if o == nil { // Add in the consumer if needed. - o, err = mset.addConsumerWithAssignment(ca.Config, ca.Name, ca) + o, err = mset.addConsumerWithAssignment(ca.Config, ca.Name, ca, false) didCreate = true } else { if err := o.updateConfig(ca.Config); err != nil { @@ -2839,15 +3178,32 @@ func (js *jetStream) processClusterCreateConsumer(ca *consumerAssignment, state } // Check if we already had a consumer assignment and its still pending. cca, oca := ca, o.consumerAssignment() + o.mu.Lock() + leader := o.isLeader() + o.mu.Unlock() + + var sendState bool js.mu.Lock() - if oca != nil && !oca.responded { - // We can't over ride info for replying here otherwise leader once elected can not respond. - // So just update Config, leave off client and reply to the originals. 
- cac := *oca - cac.Config = ca.Config - cca = &cac + if oca != nil { + if !oca.responded { + // We can't override info for replying here otherwise leader once elected can not respond. + // So just update Config, leave off client and reply to the originals. + cac := *oca + cac.Config = ca.Config + cca = &cac + } + // If we look like we are scaling up, let's send our current state to the group. + sendState = len(ca.Group.Peers) > len(oca.Group.Peers) && leader } + n := rg.node js.mu.Unlock() + + if sendState && n != nil { + if snap, err := o.store.EncodedState(); err == nil { + n.SendSnapshot(snap) + } + } + // Set CA for our consumer. o.setConsumerAssignment(cca) s.Debugf("JetStream cluster, consumer was already running") @@ -2857,7 +3213,6 @@ func (js *jetStream) processClusterCreateConsumer(ca *consumerAssignment, state if state != nil && o != nil { err = o.setStoreState(state) } - if err != nil { if IsNatsErr(err, JSConsumerStoreFailedErrF) { s.Warnf("Consumer create failed for '%s > %s > %s': %v", ca.Client.serviceAccount(), ca.Stream, ca.Name, err) @@ -2917,6 +3272,10 @@ func (js *jetStream) processClusterCreateConsumer(ca *consumerAssignment, state // Start our monitoring routine. if rg.node == nil { // Single replica consumer, process manually here. + js.mu.Lock() + // to force response in case we think we have responded before. + ca.responded = false + js.mu.Unlock() js.processConsumerLeaderChange(o, true) } else { if !alreadyRunning { @@ -2942,40 +3301,39 @@ func (js *jetStream) processClusterDeleteConsumer(ca *consumerAssignment, isMemb } js.mu.RLock() s := js.srv - js.mu.RUnlock() - - acc, err := s.LookupAccount(ca.Client.serviceAccount()) - if err != nil { - s.Warnf("JetStream cluster failed to lookup account %q: %v", ca.Client.serviceAccount(), err) - return + offline := s.allPeersOffline(ca.Group) + var isMetaLeader bool + if cc := js.cluster; cc != nil { + isMetaLeader = cc.isLeader() } + js.mu.RUnlock() var resp = JSApiConsumerDeleteResponse{ApiResponse: ApiResponse{Type: JSApiConsumerDeleteResponseType}} + var err error + var acc *Account - // Go ahead and delete the consumer. - mset, err := acc.lookupStream(ca.Stream) - if err != nil { - resp.Error = NewJSStreamNotFoundError(Unless(err)) - } else if mset != nil { - if o := mset.lookupConsumer(ca.Name); o != nil { - err = o.stopWithFlags(true, false, true, wasLeader) - } else { - resp.Error = NewJSConsumerNotFoundError() + // Go ahead and delete the consumer if we have it and the account. + if acc, _ = s.LookupAccount(ca.Client.serviceAccount()); acc != nil { + if mset, _ := acc.lookupStream(ca.Stream); mset != nil { + if o := mset.lookupConsumer(ca.Name); o != nil { + err = o.stopWithFlags(true, false, true, wasLeader) + } } } + // Always delete the node if present. 
if ca.Group.node != nil { ca.Group.node.Delete() } if !wasLeader || ca.Reply == _EMPTY_ { - return + if !(offline && isMetaLeader) { + return + } } if err != nil { - if resp.Error == nil { - resp.Error = NewJSStreamNotFoundError(Unless(err)) - } + resp.Error = NewJSStreamNotFoundError(Unless(err)) s.sendAPIErrResponse(ca.Client, acc, ca.Subject, ca.Reply, _EMPTY_, s.jsonResponse(resp)) } else { resp.Success = true @@ -3058,6 +3416,15 @@ func (o *consumer) raftGroup() *raftGroup { return o.ca.Group } +func (o *consumer) clearRaftNode() { + if o == nil { + return + } + o.mu.Lock() + defer o.mu.Unlock() + o.node = nil +} + func (o *consumer) raftNode() RaftNode { if o == nil { return nil @@ -3076,35 +3443,45 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) { return } - qch, lch, aq := n.QuitC(), n.LeadChangeC(), n.ApplyQ() + qch, lch, aq, uch := n.QuitC(), n.LeadChangeC(), n.ApplyQ(), o.updateC() s.Debugf("Starting consumer monitor for '%s > %s > %s' [%s]", o.acc.Name, ca.Stream, ca.Name, n.Group()) defer s.Debugf("Exiting consumer monitor for '%s > %s > %s' [%s]", o.acc.Name, ca.Stream, ca.Name, n.Group()) const ( compactInterval = 2 * time.Minute - compactSizeMin = 8 * 1024 * 1024 - compactNumMin = 8192 + compactSizeMin = 64 * 1024 // What is stored here is always small for consumers. + compactNumMin = 1024 + minSnapDelta = 2 * time.Second ) - t := time.NewTicker(compactInterval) + // Spread these out for large numbers on server restart. + rci := time.Duration(rand.Int63n(int64(time.Minute))) + t := time.NewTicker(compactInterval + rci) defer t.Stop() - st := o.store.Type() var lastSnap []byte + var lastSnapTime time.Time - doSnapshot := func() { - // Memory store consumers do not keep state in the store itself. - // Just compact to our applied index. - if st == MemoryStorage { - _, _, applied := n.Progress() - n.Compact(applied) - } else if state, err := o.store.State(); err == nil && state != nil { - // FileStore version. - if snap := encodeConsumerState(state); !bytes.Equal(lastSnap, snap) { - if err := n.InstallSnapshot(snap); err == nil { - lastSnap = snap - } + doSnapshot := func(force bool) { + // Bail if trying too fast and not in a forced situation. + if !force && time.Since(lastSnapTime) < minSnapDelta { + return + } + + // Check several things to see if we need a snapshot. + needSnap := force || n.NeedSnapshot() + if !needSnap { + // Check if we should compact etc. based on size of log. + ne, nb := n.Size() + needSnap = nb > 0 && ne >= compactNumMin || nb > compactSizeMin + } + + if snap, err := o.store.EncodedState(); err == nil && (!bytes.Equal(lastSnap, snap) || needSnap) { + if err := n.InstallSnapshot(snap); err == nil { + lastSnap, lastSnapTime = snap, time.Now() + } else { + s.Warnf("Failed to install snapshot for '%s > %s > %s' [%s]: %v", o.acc.Name, ca.Stream, ca.Name, n.Group(), err) } } } @@ -3126,7 +3503,7 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) { if cei == nil { recovering = false if n.NeedSnapshot() { - doSnapshot() + doSnapshot(true) } continue } @@ -3135,7 +3512,7 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) { ne, nb := n.Applied(ce.Index) // If we have at least min entries to compact, go ahead and snapshot/compact. 
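// Illustrative sketch of the throttled snapshot decision used by the consumer
// monitor above; parameter names are stand-ins, not the server's API. A
// non-forced snapshot is skipped when one was taken less than minSnapDelta ago,
// and is otherwise taken when the node reports it needs one or the raft log has
// grown past the entry/byte compaction thresholds.
func shouldSnapshot(force, nodeNeedsSnap bool, sinceLastNanos int64, logEntries, logBytes uint64) bool {
	const (
		compactSizeMin = 64 * 1024 // consumer state stays small, so compact aggressively
		compactNumMin  = 1024
		minSnapDelta   = int64(2e9) // 2s expressed in nanoseconds
	)
	if !force && sinceLastNanos < minSnapDelta {
		return false
	}
	if force || nodeNeedsSnap {
		return true
	}
	return logBytes > 0 && logEntries >= compactNumMin || logBytes > compactSizeMin
}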
if nb > 0 && ne >= compactNumMin || nb > compactSizeMin { - doSnapshot() + doSnapshot(false) } } else { s.Warnf("Error applying consumer entries to '%s > %s'", ca.Client.serviceAccount(), ca.Name) @@ -3146,9 +3523,12 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) { if recovering && !isLeader { js.setConsumerAssignmentRecovering(ca) } - js.processConsumerLeaderChange(o, isLeader) + if err := js.processConsumerLeaderChange(o, isLeader); err == nil && isLeader { + doSnapshot(true) + } + case <-uch: case <-t.C: - doSnapshot() + doSnapshot(false) } } } @@ -3218,7 +3598,9 @@ func (js *jetStream) applyConsumerEntries(o *consumer, ce *CommittedEntry, isLea o.mu.Lock() if !o.isLeader() { var le = binary.LittleEndian - o.sseq = le.Uint64(buf[1:]) + if sseq := le.Uint64(buf[1:]); sseq > o.sseq { + o.sseq = sseq + } } o.mu.Unlock() case addPendingRequest: @@ -3254,7 +3636,7 @@ func (o *consumer) processReplicatedAck(dseq, sseq uint64) { o.store.UpdateAcks(dseq, sseq) mset := o.mset - if mset == nil || mset.cfg.Retention == LimitsPolicy { + if mset == nil || o.retention == LimitsPolicy { o.mu.Unlock() return } @@ -3320,10 +3702,17 @@ func decodeDeliveredUpdate(buf []byte) (dseq, sseq, dc uint64, ts int64, err err return dseq, sseq, dc, ts, nil } -func (js *jetStream) processConsumerLeaderChange(o *consumer, isLeader bool) { +func (js *jetStream) processConsumerLeaderChange(o *consumer, isLeader bool) error { + stepDownIfLeader := func() error { + if node := o.raftNode(); node != nil && isLeader { + node.StepDown() + } + return errors.New("failed to update consumer leader status") + } + ca := o.consumerAssignment() if ca == nil { - return + return stepDownIfLeader() } js.mu.Lock() s, account, err := js.srv, ca.Client.serviceAccount(), ca.err @@ -3336,7 +3725,7 @@ func (js *jetStream) processConsumerLeaderChange(o *consumer, isLeader bool) { consumerName := o.String() acc, _ := s.LookupAccount(account) if acc == nil { - return + return stepDownIfLeader() } if isLeader { @@ -3355,22 +3744,11 @@ func (js *jetStream) processConsumerLeaderChange(o *consumer, isLeader bool) { // Tell consumer to switch leader status. o.setLeader(isLeader) - // Synchronize others to our version of state. - if isLeader { - if n := o.raftNode(); n != nil { - if state, err := o.store.State(); err == nil && state != nil { - if snap := encodeConsumerState(state); len(snap) > 0 { - n.SendSnapshot(snap) - } - } - } - } - if !isLeader || hasResponded { if isLeader { o.clearInitialInfo() } - return + return nil } var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}} @@ -3384,6 +3762,8 @@ func (js *jetStream) processConsumerLeaderChange(o *consumer, isLeader bool) { o.sendCreateAdvisory() } } + + return nil } // Determines if we should send lost quorum advisory. We throttle these after first one. 
@@ -3624,14 +4004,20 @@ func (js *jetStream) stopUpdatesSub() { func (js *jetStream) processLeaderChange(isLeader bool) { if isLeader { js.srv.Noticef("Self is new JetStream cluster metadata leader") - } else if node := js.getMetaGroup().GroupLeader(); node == _EMPTY_ { - js.srv.Noticef("JetStream cluster no metadata leader") - } else if srv := js.srv.serverNameForNode(node); srv == _EMPTY_ { - js.srv.Noticef("JetStream cluster new remote metadata leader") - } else if clst := js.srv.clusterNameForNode(node); clst == _EMPTY_ { - js.srv.Noticef("JetStream cluster new metadata leader: %s", srv) } else { - js.srv.Noticef("JetStream cluster new metadata leader: %s/%s", srv, clst) + var node string + if meta := js.getMetaGroup(); meta != nil { + node = meta.GroupLeader() + } + if node == _EMPTY_ { + js.srv.Noticef("JetStream cluster no metadata leader") + } else if srv := js.srv.serverNameForNode(node); srv == _EMPTY_ { + js.srv.Noticef("JetStream cluster new remote metadata leader") + } else if clst := js.srv.clusterNameForNode(node); clst == _EMPTY_ { + js.srv.Noticef("JetStream cluster new metadata leader: %s", srv) + } else { + js.srv.Noticef("JetStream cluster new metadata leader: %s/%s", srv, clst) + } } js.mu.Lock() @@ -3735,6 +4121,7 @@ func (cc *jetStreamCluster) selectPeerGroup(r int, cluster string, cfg *StreamCo } var nodes []wn + // peers is a randomized list s, peers := cc.s, cc.meta.Peers() // Map existing. @@ -3749,6 +4136,19 @@ func (cc *jetStreamCluster) selectPeerGroup(r int, cluster string, cfg *StreamCo } } + uniqueTagPrefix := s.getOpts().JetStreamUniqueTag + if uniqueTagPrefix != _EMPTY_ { + for _, tag := range tags { + if strings.HasPrefix(tag, uniqueTagPrefix) { + // disable uniqueness check of explicitly listed in tags + uniqueTagPrefix = _EMPTY_ + break + } + } + } + var uniqueTags = make(map[string]struct{}) + maxHaAssets := s.getOpts().JetStreamLimits.MaxHAAssets + for _, p := range peers { si, ok := s.nodeToInfo.Load(p.ID) if !ok || si == nil { @@ -3803,14 +4203,39 @@ func (cc *jetStreamCluster) selectPeerGroup(r int, cluster string, cfg *StreamCo // Otherwise check if we have enough room if maxBytes set. if maxBytes > 0 && maxBytes > available { + s.Warnf("%s@%s (Max Bytes: %d) exceeds available %s storage of %d bytes", + ni.name, ni.cluster, maxBytes, cfg.Storage.String(), available) + continue + } + // HAAssets contain _meta_ which we want to ignore + if maxHaAssets > 0 && ni.stats != nil && ni.stats.HAAssets > maxHaAssets { + s.Warnf("%s@%s (HA Asset Count: %d) exceeds max ha asset limit of %d for stream placement", + ni.name, ni.cluster, ni.stats.HAAssets, maxHaAssets) continue } - // Add to our list of potential nodes. - nodes = append(nodes, wn{p.ID, available}) - } - // If we could not select enough peers, fail. - if len(nodes) < (r - len(existing)) { + if uniqueTagPrefix != _EMPTY_ { + // default requires the unique prefix to be present + isUnique := false + for _, t := range ni.tags { + if strings.HasPrefix(t, uniqueTagPrefix) { + if _, ok := uniqueTags[t]; !ok { + uniqueTags[t] = struct{}{} + isUnique = true + } + break + } + } + if !isUnique { + continue + } + } + // Add to our list of potential nodes. + nodes = append(nodes, wn{p.ID, available}) + } + + // If we could not select enough peers, fail. + if len(nodes) < (r - len(existing)) { return nil } // Sort based on available from most to least. 
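// Illustrative sketch of the unique-tag placement filter in selectPeerGroup
// above; simplified, with peers represented only by their tag lists. When a
// JetStreamUniqueTag prefix is configured and the stream's own tags do not pin
// a value for it, at most one selected peer may carry any given value of the
// prefixed tag, and peers without such a tag are skipped entirely.
func filterByUniqueTag(prefix string, peerTags [][]string) []int {
	seen := make(map[string]struct{})
	var eligible []int
	for i, tags := range peerTags {
		unique := false
		for _, t := range tags {
			if len(t) >= len(prefix) && t[:len(prefix)] == prefix {
				if _, ok := seen[t]; !ok {
					seen[t] = struct{}{}
					unique = true
				}
				break
			}
		}
		if unique {
			eligible = append(eligible, i)
		}
	}
	return eligible
}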
@@ -3836,13 +4261,37 @@ func groupNameForConsumer(peers []string, storage StorageType) string { } func groupName(prefix string, peers []string, storage StorageType) string { - var gns string - if len(peers) == 1 { - gns = peers[0] + gns := string(getHash(nuid.Next())) + return fmt.Sprintf("%s-R%d%s-%s", prefix, len(peers), storage.String()[:1], gns) +} + +// returns stream count for this tier as well as applicable reservation size (not including reservations for cfg) +// jetStream read lock should be held +func tieredStreamAndReservationCount(asa map[string]*streamAssignment, tier string, cfg *StreamConfig) (int, int64) { + numStreams := len(asa) + reservation := int64(0) + if tier == _EMPTY_ { + for _, sa := range asa { + if sa.Config.MaxBytes > 0 && sa.Config.Name != cfg.Name { + if sa.Config.Storage == cfg.Storage { + reservation += (int64(sa.Config.Replicas) * sa.Config.MaxBytes) + } + } + } } else { - gns = string(getHash(nuid.Next())) + numStreams = 0 + for _, sa := range asa { + if isSameTier(sa.Config, cfg) { + numStreams++ + if sa.Config.MaxBytes > 0 { + if sa.Config.Storage == cfg.Storage && sa.Config.Name != cfg.Name { + reservation += (int64(sa.Config.Replicas) * sa.Config.MaxBytes) + } + } + } + } } - return fmt.Sprintf("%s-R%d%s-%s", prefix, len(peers), storage.String()[:1], gns) + return numStreams, reservation } // createGroupForStream will create a group for assignment for the stream. @@ -3876,52 +4325,69 @@ func (js *jetStream) createGroupForStream(ci *ClientInfo, cfg *StreamConfig) *ra return nil } -func (s *Server) jsClusteredStreamRequest(ci *ClientInfo, acc *Account, subject, reply string, rmsg []byte, config *StreamConfig) { - js, cc := s.getJetStreamCluster() - if js == nil || cc == nil { - return - } - - var resp = JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}} - +func (acc *Account) selectLimits(cfg *StreamConfig) (*JetStreamAccountLimits, string, *jsAccount, *ApiError) { // Grab our jetstream account info. acc.mu.RLock() jsa := acc.js acc.mu.RUnlock() if jsa == nil { - resp.Error = NewJSNotEnabledForAccountError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) + return nil, _EMPTY_, nil, NewJSNotEnabledForAccountError() + } + + jsa.usageMu.RLock() + selectedLimits, tierName, ok := jsa.selectLimits(cfg) + jsa.usageMu.RUnlock() + + if !ok { + return nil, _EMPTY_, nil, NewJSNoLimitsError() + } + return &selectedLimits, tierName, jsa, nil +} + +// Read lock needs to be held +func (js *jetStream) jsClusteredStreamLimitsCheck(acc *Account, cfg *StreamConfig) *ApiError { + selectedLimits, tier, _, apiErr := acc.selectLimits(cfg) + if apiErr != nil { + return apiErr + } + + asa := js.cluster.streams[acc.Name] + numStreams, reservations := tieredStreamAndReservationCount(asa, tier, cfg) + + if selectedLimits.MaxStreams > 0 && numStreams >= selectedLimits.MaxStreams { + return NewJSMaximumStreamsLimitError() + } + // Check for account limits here before proposing. 
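// Illustrative sketch of the tier accounting performed by
// tieredStreamAndReservationCount above; the config type is a stand-in and the
// same-tier test is simplified to a replica comparison. Streams in the tier are
// counted, and Replicas*MaxBytes is summed over the other streams that use the
// same storage backend, so a new stream is checked against what is already
// reserved rather than only what is currently stored.
type tierStreamCfg struct {
	Name     string
	Replicas int
	MaxBytes int64
	Storage  string
}

func tierUsage(existing []tierStreamCfg, next tierStreamCfg) (streams int, reserved int64) {
	for _, c := range existing {
		if c.Replicas != next.Replicas { // simplified stand-in for isSameTier
			continue
		}
		streams++
		if c.MaxBytes > 0 && c.Storage == next.Storage && c.Name != next.Name {
			reserved += int64(c.Replicas) * c.MaxBytes
		}
	}
	return streams, reserved
}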
+ if err := js.checkAccountLimits(selectedLimits, cfg, reservations); err != nil { + return NewJSStreamLimitsError(err, Unless(err)) + } + return nil +} + +func (s *Server) jsClusteredStreamRequest(ci *ClientInfo, acc *Account, subject, reply string, rmsg []byte, config *StreamConfig) { + js, cc := s.getJetStreamCluster() + if js == nil || cc == nil { return } - ccfg, err := checkStreamCfg(config) - if err != nil { - resp.Error = NewJSStreamInvalidConfigError(err, Unless(err)) + var resp = JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}} + + ccfg, apiErr := s.checkStreamCfg(config, acc) + if apiErr != nil { + resp.Error = apiErr s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) return } cfg := &ccfg - // Check for stream limits here before proposing. These need to be tracked from meta layer, not jsa. js.mu.RLock() + apiErr = js.jsClusteredStreamLimitsCheck(acc, cfg) asa := cc.streams[acc.Name] - numStreams := len(asa) js.mu.RUnlock() - - jsa.mu.RLock() - exceeded := jsa.limits.MaxStreams > 0 && numStreams >= jsa.limits.MaxStreams - jsa.mu.RUnlock() - - if exceeded { - resp.Error = NewJSMaximumStreamsLimitError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) - return - } - - // Check for account limits here before proposing. - if err := jsa.checkAccountLimits(cfg); err != nil { - resp.Error = NewJSStreamLimitsError(err, Unless(err)) + // Check for stream limits here before proposing. These need to be tracked from meta layer, not jsa. + if apiErr != nil { + resp.Error = apiErr s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) return } @@ -4006,7 +4472,10 @@ func (s *Server) jsClusteredStreamUpdateRequest(ci *ClientInfo, acc *Account, su } var newCfg *StreamConfig if jsa := js.accounts[acc.Name]; jsa != nil { - if ncfg, err := jsa.configUpdateCheck(osa.Config, cfg); err != nil { + js.mu.Unlock() + ncfg, err := jsa.configUpdateCheck(osa.Config, cfg, s) + js.mu.Lock() + if err != nil { resp.Error = NewJSStreamUpdateError(err, Unless(err)) s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) return @@ -4018,7 +4487,7 @@ func (s *Server) jsClusteredStreamUpdateRequest(ci *ClientInfo, acc *Account, su s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) return } - // Check for mirrot changes which are not allowed. + // Check for mirror changes which are not allowed. if !reflect.DeepEqual(newCfg.Mirror, osa.Config.Mirror) { resp.Error = NewJSStreamMirrorNotUpdatableError() s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) @@ -4041,11 +4510,34 @@ func (s *Server) jsClusteredStreamUpdateRequest(ci *ClientInfo, acc *Account, su } } + // Make copy so to not change original. + rg := osa.copyGroup().Group + + // Check for a move update. + // TODO(dlc) - Should add a resolve from Tags to cluster and check that vs reflect. + isMoveRequest := newCfg.Placement != nil && !reflect.DeepEqual(osa.Config.Placement, newCfg.Placement) + // Check for replica changes. - rg := osa.Group + isReplicaChange := newCfg.Replicas != osa.Config.Replicas + + // We stage consumer updates and do them after the stream update. var consumers []*consumerAssignment - if newCfg.Replicas != len(rg.Peers) { + // Check if this is a move request and we are already moving this stream. 
+ if isMoveRequest && osa.Config.Replicas != len(rg.Peers) { + resp.Error = NewJSStreamMoveInProgressError() + s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) + return + } + + // Can not move and scale at same time. + if isMoveRequest && isReplicaChange { + resp.Error = NewJSStreamMoveAndScaleError() + s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) + return + } + + if isReplicaChange { // We are adding new peers here. if newCfg.Replicas > len(rg.Peers) { peers := cc.selectPeerGroup(newCfg.Replicas, rg.Cluster, newCfg, rg.Peers) @@ -4063,8 +4555,79 @@ func (s *Server) jsClusteredStreamUpdateRequest(ci *ClientInfo, acc *Account, su } rg.Peers = peers } else { - // We are deleting nodes here. - rg.Peers = rg.Peers[:newCfg.Replicas] + // We are deleting nodes here. We want to do our best to preserve the current leader. + // We have support now from above that guarantees we are in our own Go routine, so can + // ask for stream info from the stream leader to make sure we keep the leader in the new list. + var curLeader string + if !s.allPeersOffline(rg) { + // Need to release js lock. + js.mu.Unlock() + s.mu.Lock() + inbox := s.newRespInbox() + results := make(chan *StreamInfo, 1) + // Store our handler. + s.sys.replies[inbox] = func(sub *subscription, _ *client, _ *Account, subject, _ string, msg []byte) { + var si StreamInfo + if err := json.Unmarshal(msg, &si); err != nil { + s.Warnf("Error unmarshaling clustered stream info response:%v", err) + return + } + select { + case results <- &si: + default: + s.Warnf("Failed placing remote stream info result on internal channel") + } + } + s.mu.Unlock() + + isubj := fmt.Sprintf(clusterStreamInfoT, ci.serviceAccount(), cfg.Name) + s.sendInternalMsgLocked(isubj, inbox, nil, nil) + + const timeout = 2 * time.Second + notActive := time.NewTimer(timeout) + defer notActive.Stop() + + select { + case <-s.quitCh: + break + case <-notActive.C: + s.Warnf("Did not receive stream info results for '%s > %s'", acc, cfg.Name) + case si := <-results: + if si.Cluster != nil { + // The leader here is the server name, but need to convert to internal name. + curLeader = string(getHash(si.Cluster.Leader)) + } + } + // Clean up here. + s.mu.Lock() + if s.sys != nil && s.sys.replies != nil { + delete(s.sys.replies, inbox) + } + s.mu.Unlock() + // Re-acquire here. + js.mu.Lock() + } + // If we identified a leader make sure its part of the new group. + selected := make([]string, 0, newCfg.Replicas) + + if curLeader != _EMPTY_ { + selected = append(selected, curLeader) + } + for _, peer := range rg.Peers { + if len(selected) == newCfg.Replicas { + break + } + if peer == curLeader { + continue + } + if si, ok := s.nodeToInfo.Load(peer); ok && si != nil { + if si.(nodeInfo).offline { + continue + } + selected = append(selected, peer) + } + } + rg.Peers = selected } // Need to remap any consumers. @@ -4085,15 +4648,56 @@ func (s *Server) jsClusteredStreamUpdateRequest(ci *ClientInfo, acc *Account, su consumers = append(consumers, cca) } } + + } else if isMoveRequest { + nrg := js.createGroupForStream(ci, newCfg) + if nrg == nil { + resp.Error = NewJSInsufficientResourcesError() + s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) + return + } + // Only change if resolved clusters are different. + if rg.Cluster != nrg.Cluster { + // If we are R1, make sure original is leader during scale up for move. 
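// Illustrative sketch of the scale-down peer selection above; hypothetical
// helper, not the patch's code. The current leader, when it could be resolved
// via the clustered stream info request, is kept in the reduced peer set, and
// the remaining slots are filled with existing peers that are not reported
// offline.
func selectScaledDownPeers(newReplicas int, curLeader string, peers []string, offline map[string]bool) []string {
	selected := make([]string, 0, newReplicas)
	if curLeader != "" {
		selected = append(selected, curLeader)
	}
	for _, p := range peers {
		if len(selected) == newReplicas {
			break
		}
		if p == curLeader || offline[p] {
			continue
		}
		selected = append(selected, p)
	}
	return selected
}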
+ if len(rg.Peers) == 1 { + rg.Preferred = rg.Peers[0] + } + // Add in new peers since we will extend the peer group to straddle both clusters. + rg.Peers = append(rg.Peers, nrg.Peers...) + rg.Cluster = nrg.Cluster + + for _, ca := range osa.consumers { + cca := ca.copyGroup() + // Ephemerals are R=1, so only auto-remap if consumer peer count == nrg peer count. + numPeers := len(ca.Group.Peers) + if numPeers == len(nrg.Peers) { + cca.Group.Peers = append(cca.Group.Peers, nrg.Peers...) + } else { + // This is an ephemeral, so R1. Just randomly pick a single peer from the new set. + pi := rand.Int31n(int32(len(nrg.Peers))) + cca.Group.Peers = append(cca.Group.Peers, nrg.Peers[pi]) + } + // Make sure to set if not already set. + if cca.Group.Preferred == _EMPTY_ { + cca.Group.Preferred = cca.Group.Peers[0] + } + // We can not propose here before the stream itself so we collect them. + consumers = append(consumers, cca) + } + } + } else { + // All other updates make sure no preferred is set. + rg.Preferred = _EMPTY_ } - sa := &streamAssignment{Group: rg, Sync: osa.Sync, Config: newCfg, Subject: subject, Reply: reply, Client: ci} + sa := &streamAssignment{Group: rg, Sync: osa.Sync, Created: osa.Created, Config: newCfg, Subject: subject, Reply: reply, Client: ci} cc.meta.Propose(encodeUpdateStreamAssignment(sa)) // Process any staged consumers. for _, ca := range consumers { cc.meta.Propose(encodeAddConsumerAssignment(ca)) } + } func (s *Server) jsClusteredStreamDeleteRequest(ci *ClientInfo, acc *Account, stream, subject, reply string, rmsg []byte) { @@ -4187,6 +4791,12 @@ func (s *Server) jsClusteredStreamRestoreRequest( cfg := &req.Config resp := JSApiStreamRestoreResponse{ApiResponse: ApiResponse{Type: JSApiStreamRestoreResponseType}} + if err := js.jsClusteredStreamLimitsCheck(acc, cfg); err != nil { + resp.Error = err + s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) + return + } + if sa := js.streamAssignment(ci.serviceAccount(), cfg.Name); sa != nil { resp.Error = NewJSStreamNameExistError() s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) @@ -4208,6 +4818,7 @@ func (s *Server) jsClusteredStreamRestoreRequest( cc.meta.Propose(encodeAddStreamAssignment(sa)) } +// Determine if all peers for this group are offline. func (s *Server) allPeersOffline(rg *raftGroup) bool { if rg == nil { return false @@ -4233,7 +4844,7 @@ func (s *Server) jsClusteredStreamListRequest(acc *Account, ci *ClientInfo, filt return } - js.mu.Lock() + js.mu.RLock() var streams []*streamAssignment for _, sa := range cc.streams[acc.Name] { @@ -4283,8 +4894,9 @@ func (s *Server) jsClusteredStreamListRequest(acc *Account, ci *ClientInfo, filt Streams: make([]*StreamInfo, 0, len(streams)), } + js.mu.RUnlock() + if len(streams) == 0 { - js.mu.Unlock() resp.Limit = JSApiListLimit resp.Offset = offset s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) @@ -4322,7 +4934,9 @@ func (s *Server) jsClusteredStreamListRequest(acc *Account, ci *ClientInfo, filt var missingNames []string sent := map[string]int{} + // Send out our requests here. + js.mu.RLock() for _, sa := range streams { if s.allPeersOffline(sa.Group) { // Place offline onto our results by hand here. @@ -4336,14 +4950,14 @@ func (s *Server) jsClusteredStreamListRequest(acc *Account, ci *ClientInfo, filt } } // Don't hold lock. 
- js.mu.Unlock() + js.mu.RUnlock() const timeout = 4 * time.Second notActive := time.NewTimer(timeout) defer notActive.Stop() LOOP: - for { + for len(sent) > 0 { select { case <-s.quitCh: return @@ -4352,7 +4966,6 @@ LOOP: for sName := range sent { missingNames = append(missingNames, sName) } - resp.Missing = missingNames break LOOP case si := <-rc: consCount := sent[si.Config.Name] @@ -4378,6 +4991,7 @@ LOOP: resp.Total = len(resp.Streams) resp.Limit = JSApiListLimit resp.Offset = offset + resp.Missing = missingNames s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) } @@ -4391,7 +5005,7 @@ func (s *Server) jsClusteredConsumerListRequest(acc *Account, ci *ClientInfo, of return } - js.mu.Lock() + js.mu.RLock() var consumers []*consumerAssignment if sas := cc.streams[acc.Name]; sas != nil { @@ -4426,8 +5040,9 @@ func (s *Server) jsClusteredConsumerListRequest(acc *Account, ci *ClientInfo, of Consumers: []*ConsumerInfo{}, } + js.mu.RUnlock() + if len(consumers) == 0 { - js.mu.Unlock() resp.Limit = JSApiListLimit resp.Offset = offset s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) @@ -4465,35 +5080,38 @@ func (s *Server) jsClusteredConsumerListRequest(acc *Account, ci *ClientInfo, of var missingNames []string sent := map[string]struct{}{} + + // Send out our requests here. + js.mu.RLock() for _, ca := range consumers { if s.allPeersOffline(ca.Group) { // Place offline onto our results by hand here. ci := &ConsumerInfo{Config: ca.Config, Created: ca.Created, Cluster: js.offlineClusterInfo(ca.Group)} resp.Consumers = append(resp.Consumers, ci) - missingNames = append(missingNames, ci.Name) + missingNames = append(missingNames, ca.Name) } else { isubj := fmt.Sprintf(clusterConsumerInfoT, ca.Client.serviceAccount(), stream, ca.Name) s.sendInternalMsgLocked(isubj, inbox, nil, nil) sent[ca.Name] = struct{}{} } } - js.mu.Unlock() + // Don't hold lock. + js.mu.RUnlock() const timeout = 4 * time.Second notActive := time.NewTimer(timeout) defer notActive.Stop() LOOP: - for { + for len(sent) > 0 { select { case <-s.quitCh: return case <-notActive.C: - s.Warnf("Did not receive all consumer info results for %q", acc) + s.Warnf("Did not receive all consumer info results for '%s > %s'", acc, stream) for cName := range sent { missingNames = append(missingNames, cName) } - resp.Missing = missingNames break LOOP case ci := <-rc: delete(sent, ci.Name) @@ -4515,6 +5133,7 @@ LOOP: resp.Total = len(resp.Consumers) resp.Limit = JSApiListLimit resp.Offset = offset + resp.Missing = missingNames s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) } @@ -4647,16 +5266,28 @@ func encodeDeleteStreamAssignment(sa *streamAssignment) []byte { func decodeStreamAssignment(buf []byte) (*streamAssignment, error) { var sa streamAssignment err := json.Unmarshal(buf, &sa) + if err != nil { + return nil, err + } + fixCfgMirrorWithDedupWindow(sa.Config) return &sa, err } -// createGroupForConsumer will create a new group with same peer set as the stream. -func (cc *jetStreamCluster) createGroupForConsumer(sa *streamAssignment) *raftGroup { - peers := sa.Group.Peers +// createGroupForConsumer will create a new group from same peer set as the stream. 
+func (cc *jetStreamCluster) createGroupForConsumer(cfg *ConsumerConfig, sa *streamAssignment) *raftGroup { + peers := copyStrings(sa.Group.Peers) if len(peers) == 0 { return nil } - return &raftGroup{Name: groupNameForConsumer(peers, sa.Config.Storage), Storage: sa.Config.Storage, Peers: peers} + if cfg.Replicas > 0 && cfg.Replicas != len(peers) { + rand.Shuffle(len(peers), func(i, j int) { peers[i], peers[j] = peers[j], peers[i] }) + peers = peers[:cfg.Replicas] + } + storage := sa.Config.Storage + if cfg.MemoryStorage { + storage = MemoryStorage + } + return &raftGroup{Name: groupNameForConsumer(peers, storage), Storage: storage, Peers: peers} } // jsClusteredConsumerRequest is first point of entry to create a consumer with R > 1. @@ -4666,11 +5297,33 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec return } + var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}} + + streamCfg, ok := js.clusterStreamConfig(acc.Name, stream) + if !ok { + resp.Error = NewJSStreamNotFoundError() + s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) + return + } + selectedLimits, _, _, apiErr := acc.selectLimits(&streamCfg) + if apiErr != nil { + resp.Error = apiErr + s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) + return + } + srvLim := &s.getOpts().JetStreamLimits + // Make sure we have sane defaults + setConsumerConfigDefaults(cfg, srvLim, selectedLimits) + + if err := checkConsumerCfg(cfg, srvLim, &streamCfg, acc, selectedLimits, false); err != nil { + resp.Error = err + s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) + return + } + js.mu.Lock() defer js.mu.Unlock() - var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}} - // Lookup the stream assignment. sa := js.streamAssignment(acc.Name, stream) if sa == nil { @@ -4735,7 +5388,7 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec // If this is new consumer. if ca == nil { - rg := cc.createGroupForConsumer(sa) + rg := cc.createGroupForConsumer(cfg, sa) if rg == nil { resp.Error = NewJSInsufficientResourcesError() s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) @@ -4744,6 +5397,9 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec // Pick a preferred leader. rg.setPreferred() + // Inherit cluster from stream. + rg.Cluster = sa.Group.Cluster + // We need to set the ephemeral here before replicating. if !isDurableConsumer(cfg) { // We chose to have ephemerals be R=1 unless stream is interest or workqueue. 
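// Illustrative sketch of the consumer group sizing in createGroupForConsumer
// above; the shuffle is injected and the requested replica count is assumed to
// never exceed the stream's peer count. A consumer group starts from the
// stream's peers; when the consumer requests fewer replicas the copied peer
// list is shuffled and truncated, and MemoryStorage on the consumer overrides
// the stream's storage for the group.
func consumerGroupPeers(streamPeers []string, consumerReplicas int, shuffle func([]string)) []string {
	peers := append([]string(nil), streamPeers...) // copy so the stream assignment is not mutated
	if consumerReplicas > 0 && consumerReplicas < len(peers) {
		shuffle(peers)
		peers = peers[:consumerReplicas]
	}
	return peers
}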
@@ -4762,6 +5418,23 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec break } } + if len(rg.Peers) > 1 { + if maxHaAssets := s.getOpts().JetStreamLimits.MaxHAAssets; maxHaAssets != 0 { + for _, peer := range rg.Peers { + if ni, ok := s.nodeToInfo.Load(peer); ok { + ni := ni.(nodeInfo) + if stats := ni.stats; stats != nil && stats.HAAssets > maxHaAssets { + resp.Error = NewJSInsufficientResourcesError() + s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) + s.Warnf("%s@%s (HA Asset Count: %d) exceeds max ha asset limit of %d"+ + " for (durable) consumer %s placement on stream %s", + ni.name, ni.cluster, ni.stats.HAAssets, maxHaAssets, oname, stream) + return + } + } + } + } + } ca = &consumerAssignment{ Group: rg, Stream: stream, @@ -4962,7 +5635,7 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ mset.mu.RLock() canRespond := !mset.cfg.NoAck && len(reply) > 0 name, stype := mset.cfg.Name, mset.cfg.Storage - s, js, jsa, st, rf, outq, node := mset.srv, mset.js, mset.jsa, mset.cfg.Storage, mset.cfg.Replicas, mset.outq, mset.node + s, js, jsa, st, rf, tierName, outq, node := mset.srv, mset.js, mset.jsa, mset.cfg.Storage, mset.cfg.Replicas, mset.tier, mset.outq, mset.node maxMsgSize, lseq := int(mset.cfg.MaxMsgSize), mset.lseq mset.mu.RUnlock() @@ -4987,24 +5660,42 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ // Check here pre-emptively if we have exceeded our account limits. var exceeded bool - jsa.mu.RLock() + jsa.usageMu.Lock() + jsaLimits, ok := jsa.limits[tierName] + if !ok { + jsa.usageMu.Unlock() + err := fmt.Errorf("no JetStream resource limits found account: %q", jsa.acc().Name) + s.RateLimitWarnf(err.Error()) + if canRespond { + var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} + resp.Error = NewJSNoLimitsError() + response, _ = json.Marshal(resp) + outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0)) + } + return err + } + t, ok := jsa.usage[tierName] + if !ok { + t = &jsaStorage{} + jsa.usage[tierName] = t + } if st == MemoryStorage { - total := jsa.storeTotal + int64(memStoreMsgSize(subject, hdr, msg)*uint64(rf)) - if jsa.limits.MaxMemory > 0 && total > jsa.limits.MaxMemory { + total := t.total.store + int64(memStoreMsgSize(subject, hdr, msg)*uint64(rf)) + if jsaLimits.MaxMemory > 0 && total > jsaLimits.MaxMemory { exceeded = true } } else { - total := jsa.storeTotal + int64(fileStoreMsgSize(subject, hdr, msg)*uint64(rf)) - if jsa.limits.MaxStore > 0 && total > jsa.limits.MaxStore { + total := t.total.store + int64(fileStoreMsgSize(subject, hdr, msg)*uint64(rf)) + if jsaLimits.MaxStore > 0 && total > jsaLimits.MaxStore { exceeded = true } } - jsa.mu.RUnlock() + jsa.usageMu.Unlock() // If we have exceeded our account limits go ahead and return. if exceeded { err := fmt.Errorf("JetStream resource limits exceeded for account: %q", jsa.acc().Name) - s.Warnf(err.Error()) + s.RateLimitWarnf(err.Error()) if canRespond { var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} resp.Error = NewJSAccountResourcesExceededError() @@ -5017,7 +5708,7 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ // Check msgSize if we have a limit set there. Again this works if it goes through but better to be pre-emptive. 
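// Illustrative sketch of the pre-emptive tier limit check applied to clustered
// publishes above; hypothetical helper with simplified arguments. The projected
// usage is the tier's current stored total plus the message size multiplied by
// the replication factor, compared against MaxMemory or MaxStore depending on
// the stream's storage type; a non-positive limit means unlimited here.
func wouldExceedTierLimit(currentTotal, msgSize int64, replicas int, tierLimit int64) bool {
	if tierLimit <= 0 {
		return false
	}
	return currentTotal+msgSize*int64(replicas) > tierLimit
}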
if maxMsgSize >= 0 && (len(hdr)+len(msg)) > maxMsgSize { err := fmt.Errorf("JetStream message size exceeds limits for '%s > %s'", jsa.acc().Name, mset.cfg.Name) - s.Warnf(err.Error()) + s.RateLimitWarnf(err.Error()) if canRespond { var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} resp.Error = NewJSStreamMessageExceedsMaximumError() @@ -5031,7 +5722,7 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ // Again this works if it goes through but better to be pre-emptive. if len(hdr) > math.MaxUint16 { err := fmt.Errorf("JetStream header size exceeds limits for '%s > %s'", jsa.acc().Name, mset.cfg.Name) - s.Warnf(err.Error()) + s.RateLimitWarnf(err.Error()) if canRespond { var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} resp.Error = NewJSStreamHeaderExceedsMaximumError() @@ -5185,8 +5876,11 @@ func (mset *stream) isCatchingUp() bool { return mset.catchup } +// Maximum requests for the whole server that can be in flight. +const maxConcurrentSyncRequests = 8 + // Process a stream snapshot. -func (mset *stream) processSnapshot(snap *streamSnapshot) error { +func (mset *stream) processSnapshot(snap *streamSnapshot) (e error) { // Update any deletes, etc. mset.processSnapshotDeletes(snap) @@ -5195,8 +5889,9 @@ func (mset *stream) processSnapshot(snap *streamSnapshot) error { mset.clfs = snap.Failed mset.store.FastState(&state) sreq := mset.calculateSyncRequest(&state, snap) + s, js, subject, n := mset.srv, mset.js, mset.sa.Sync, mset.node - qname := fmt.Sprintf("Stream %q snapshot", mset.cfg.Name) + qname := fmt.Sprintf("[ACC:%s] stream '%s' snapshot", mset.acc.Name, mset.cfg.Name) mset.mu.Unlock() // Make sure our state's first sequence is <= the leader's snapshot. @@ -5215,8 +5910,20 @@ func (mset *stream) processSnapshot(snap *streamSnapshot) error { } // Pause the apply channel for our raft group while we catch up. - n.PauseApply() - defer n.ResumeApply() + if err := n.PauseApply(); err != nil { + return err + } + + // ErrStreamStopped is when a catchup is terminated due to the stream going away. + var ErrStreamStopped = errors.New("stream has been stopped") + + defer func() { + if e == ErrServerNotRunning || e == ErrStreamStopped { + // Wipe our raft state if exiting with these errors. + n.Wipe() + } + n.ResumeApply() + }() // Set our catchup state. mset.setCatchingUp() @@ -5225,7 +5932,7 @@ func (mset *stream) processSnapshot(snap *streamSnapshot) error { var sub *subscription var err error - const activityInterval = 5 * time.Second + const activityInterval = 10 * time.Second notActive := time.NewTimer(activityInterval) defer notActive.Stop() @@ -5238,21 +5945,61 @@ func (mset *stream) processSnapshot(snap *streamSnapshot) error { for _, o := range mset.consumers { o.mu.Lock() if o.isLeader() { - // This expects mset lock to be held. - o.setInitialPendingAndStart() + o.streamNumPending() } o.mu.Unlock() } mset.mu.Unlock() }() + var releaseSem bool + releaseSyncOutSem := func() { + if !releaseSem { + return + } + // Need to use select for the server shutdown case. + select { + case s.syncOutSem <- struct{}{}: + default: + } + releaseSem = false + } + // On exit, we will release our semaphore if we acquired it. + defer releaseSyncOutSem() + RETRY: + // On retry, we need to release the semaphore we got. Call will be no-op + // if releaseSem boolean has not been set to true on successfully getting + // the semaphore. 
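// Illustrative sketch of the catch-up semaphore used in the RETRY logic above;
// the server presumably pre-fills syncOutSem with tokens at startup, which is
// assumed here and not shown in this hunk. A buffered channel loaded with one
// token per allowed catch-up caps how many snapshot catch-ups a server runs at
// once: receiving takes a slot and blocks when none are free, sending gives it
// back.
const maxCatchupSlots = 8 // mirrors maxConcurrentSyncRequests in the patch

func newCatchupSem() chan struct{} {
	sem := make(chan struct{}, maxCatchupSlots)
	for i := 0; i < maxCatchupSlots; i++ {
		sem <- struct{}{}
	}
	return sem
}

func withCatchupSlot(sem chan struct{}, run func()) {
	<-sem // acquire a slot; blocks while all slots are in use
	defer func() { sem <- struct{}{} }() // release the slot on exit
	run()
}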
+ releaseSyncOutSem() + + if n.GroupLeader() == _EMPTY_ { + return fmt.Errorf("catchup for stream '%s > %s' aborted, no leader", mset.account(), mset.name()) + } + // If we have a sub clear that here. if sub != nil { s.sysUnsubscribe(sub) sub = nil } + // Block here if we have too many requests in flight. + <-s.syncOutSem + releaseSem = true + if !s.isRunning() { + return ErrServerNotRunning + } + + // We may have been blocked for a bit, so the reset need to ensure that we + // consume the already fired timer. + if !notActive.Stop() { + select { + case <-notActive.C: + default: + } + } + notActive.Reset(activityInterval) + // Grab sync request again on failures. if sreq == nil { mset.mu.Lock() @@ -5271,7 +6018,8 @@ RETRY: reply string } - msgsQ := newIPQueue(ipQueue_Logger(qname, s.ipqLog)) // of *im + msgsQ := s.newIPQueue(qname) // of *im + defer msgsQ.unregister() // Send our catchup request here. reply := syncReplySubject() @@ -5282,9 +6030,9 @@ RETRY: }) if err != nil { s.Errorf("Could not subscribe to stream catchup: %v", err) - return err + err = nil + goto RETRY } - b, _ := json.Marshal(sreq) s.sendInternalMsgLocked(subject, reply, nil, b) @@ -5305,10 +6053,12 @@ RETRY: // Check for eof signaling. if len(msg) == 0 { + msgsQ.recycle(&mrecs) return nil } if lseq, err := mset.processCatchupMsg(msg); err == nil { if lseq >= last { + msgsQ.recycle(&mrecs) return nil } } else if isOutOfSpaceErr(err) { @@ -5319,9 +6069,11 @@ RETRY: } else { s.Warnf("Catchup for stream '%s > %s' errored, account resources exceeded: %v", mset.account(), mset.name(), err) } + msgsQ.recycle(&mrecs) return err } else { s.Warnf("Catchup for stream '%s > %s' errored, will retry: %v", mset.account(), mset.name(), err) + msgsQ.recycle(&mrecs) goto RETRY } if mrec.reply != _EMPTY_ { @@ -5331,15 +6083,16 @@ RETRY: msgsQ.recycle(&mrecs) case <-notActive.C: s.Warnf("Catchup for stream '%s > %s' stalled", mset.account(), mset.name()) - notActive.Reset(activityInterval) goto RETRY case <-s.quitCh: - return nil + return ErrServerNotRunning case <-qch: - return nil + return ErrStreamStopped case isLeader := <-lch: - js.processStreamLeaderChange(mset, isLeader) - return nil + if isLeader { + n.StepDown() + goto RETRY + } } } } @@ -5357,9 +6110,15 @@ func (mset *stream) processCatchupMsg(msg []byte) (uint64, error) { mset.mu.RLock() st := mset.cfg.Storage + ddloaded := mset.ddloaded + tierName := mset.tier mset.mu.RUnlock() - if mset.js.limitsExceeded(st) || mset.jsa.limitsExceeded(st) { + if mset.js.limitsExceeded(st) { + return 0, NewJSInsufficientResourcesError() + } else if exceeded, apiErr := mset.jsa.limitsExceeded(st, tierName); apiErr != nil { + return 0, apiErr + } else if exceeded { return 0, NewJSInsufficientResourcesError() } @@ -5377,6 +6136,18 @@ func (mset *stream) processCatchupMsg(msg []byte) (uint64, error) { // Update our lseq. mset.setLastSeq(seq) + // Check for MsgId and if we have one here make sure to update our internal map. 
+ if len(hdr) > 0 { + if msgId := getMsgId(hdr); msgId != _EMPTY_ { + if !ddloaded { + mset.mu.Lock() + mset.rebuildDedupe() + mset.mu.Unlock() + } + mset.storeMsgId(&ddentry{msgId, seq, ts}) + } + } + return seq, nil } @@ -5447,7 +6218,15 @@ func (js *jetStream) clusterInfo(rg *raftGroup) *ClusterInfo { } if sir, ok := s.nodeToInfo.Load(rp.ID); ok && sir != nil { si := sir.(nodeInfo) - pi := &PeerInfo{Name: si.name, Current: current, Offline: si.offline, Active: lastSeen, Lag: rp.Lag} + pi := &PeerInfo{ + Name: si.name, + Current: current, + Offline: si.offline, + Active: lastSeen, + Lag: rp.Lag, + cluster: si.cluster, + peer: rp.ID, + } ci.Replicas = append(ci.Replicas, pi) } } @@ -5455,8 +6234,8 @@ func (js *jetStream) clusterInfo(rg *raftGroup) *ClusterInfo { return ci } -func (mset *stream) checkClusterInfo(si *StreamInfo) { - for _, r := range si.Cluster.Replicas { +func (mset *stream) checkClusterInfo(ci *ClusterInfo) { + for _, r := range ci.Replicas { peer := string(getHash(r.Name)) if lag := mset.lagForCatchupPeer(peer); lag > 0 { r.Current = false @@ -5465,6 +6244,59 @@ func (mset *stream) checkClusterInfo(si *StreamInfo) { } } +// Return a list of alternates, ranked by preference order to the request, of stream mirrors. +// This allows clients to select or get more information about read replicas that could be a +// better option to connect to versus the original source. +func (js *jetStream) streamAlternates(ci *ClientInfo, stream string) []StreamAlternate { + if js == nil { + return nil + } + + js.mu.RLock() + defer js.mu.RUnlock() + + s, cc := js.srv, js.cluster + // Track our domain. + domain := s.getOpts().JetStreamDomain + + // No clustering just return nil. + if cc == nil { + return nil + } + acc, _ := s.LookupAccount(ci.serviceAccount()) + if acc == nil { + return nil + } + + // Collect our ordering first for clusters. + weights := make(map[string]int) + all := []string{ci.Cluster} + all = append(all, ci.Alternates...) + + for i := 0; i < len(all); i++ { + weights[all[i]] = len(all) - i + } + + var alts []StreamAlternate + for _, sa := range cc.streams[acc.Name] { + // Add in ourselves and any mirrors. + if sa.Config.Name == stream || (sa.Config.Mirror != nil && sa.Config.Mirror.Name == stream) { + alts = append(alts, StreamAlternate{Name: sa.Config.Name, Domain: domain, Cluster: sa.Group.Cluster}) + } + } + // If just us don't fill in. + if len(alts) == 1 { + return nil + } + + // Sort based on our weights that originate from the request itself. + sort.Slice(alts, func(i, j int) bool { + return weights[alts[i].Cluster] > weights[alts[j].Cluster] + }) + + return alts +} + func (mset *stream) handleClusterStreamInfoRequest(sub *subscription, c *client, _ *Account, subject, reply string, _ []byte) { mset.mu.RLock() sysc, js, sa, config := mset.sysc, mset.srv.js, mset.sa, mset.cfg @@ -5494,21 +6326,87 @@ func (mset *stream) handleClusterStreamInfoRequest(sub *subscription, c *client, // Check for out of band catchups. if mset.hasCatchupPeers() { - mset.checkClusterInfo(si) + mset.checkClusterInfo(si.Cluster) } sysc.sendInternalMsg(reply, _EMPTY_, nil, si) } +const maxTotalCatchupOutBytes = int64(128 * 1024 * 1024) // 128MB for now, for the total server. + +// Current total outstanding catchup bytes. +func (s *Server) gcbTotal() int64 { + s.gcbMu.RLock() + defer s.gcbMu.RUnlock() + return s.gcbOut +} + +// Adds `sz` to the server's total outstanding catchup bytes and to `localsz` +// under the gcbMu lock. 
The `localsz` points to the local outstanding catchup +// bytes of the runCatchup go routine of a given stream. +func (s *Server) gcbAdd(localsz *int64, sz int64) { + s.gcbMu.Lock() + atomic.AddInt64(localsz, sz) + s.gcbOut += sz + if s.gcbOut >= maxTotalCatchupOutBytes && s.gcbKick == nil { + s.gcbKick = make(chan struct{}) + } + s.gcbMu.Unlock() +} + +// Removes `sz` from the server's total outstanding catchup bytes and from +// `localsz`, but only if `localsz` is non 0, which would signal that gcSubLast +// has already been invoked. See that function for details. +// Must be invoked under the gcbMu lock. +func (s *Server) gcbSubLocked(localsz *int64, sz int64) { + if atomic.LoadInt64(localsz) == 0 { + return + } + atomic.AddInt64(localsz, -sz) + s.gcbOut -= sz + if s.gcbKick != nil && s.gcbOut < maxTotalCatchupOutBytes { + close(s.gcbKick) + s.gcbKick = nil + } +} + +// Locked version of gcbSubLocked() +func (s *Server) gcbSub(localsz *int64, sz int64) { + s.gcbMu.Lock() + s.gcbSubLocked(localsz, sz) + s.gcbMu.Unlock() +} + +// Similar to gcbSub() but reset `localsz` to 0 at the end under the gcbMu lock. +// This will signal further calls to gcbSub() for this `localsz` pointer that +// nothing should be done because runCatchup() has exited and any remaining +// outstanding bytes value has already been decremented. +func (s *Server) gcbSubLast(localsz *int64) { + s.gcbMu.Lock() + s.gcbSubLocked(localsz, *localsz) + *localsz = 0 + s.gcbMu.Unlock() +} + +// Returns our kick chan, or nil if it does not exist. +func (s *Server) cbKickChan() <-chan struct{} { + s.gcbMu.RLock() + defer s.gcbMu.RUnlock() + return s.gcbKick +} + func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { s := mset.srv defer s.grWG.Done() - const maxOutBytes = int64(1 * 1024 * 1024) // 1MB for now. - const maxOutMsgs = int32(16384) + const maxOutBytes = int64(32 * 1024 * 1024) // 32MB for now, these are all internal, from server to server + const maxOutMsgs = int32(128 * 1024) outb := int64(0) outm := int32(0) + // On abnormal exit make sure to update global total. + defer s.gcbSubLast(&outb) + // Flow control processing. ackReplySize := func(subj string) int64 { if li := strings.LastIndexByte(subj, btsep); li > 0 && li < len(subj) { @@ -5524,9 +6422,10 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { ackReply := syncAckSubject() ackSub, _ := s.sysSubscribe(ackReply, func(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { sz := ackReplySize(subject) - atomic.AddInt64(&outb, -sz) + s.gcbSub(&outb, sz) atomic.AddInt32(&outm, -1) mset.updateCatchupPeer(sreq.Peer) + // Kick ourselves and anyone else who might have stalled on global state. 
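// How the pieces above are meant to fit together, as a minimal sketch (all identifiers
// appear in this hunk): gcbAdd() charges both the local counter and the server-wide
// total, gcbSub() credits them back on each ack, and gcbSubLast() settles the remainder
// when the catchup goroutine exits. Closing s.gcbKick (rather than sending on it) is
// what wakes every runCatchup goroutine blocked on cbKickChan() at once when the global
// total drops back under maxTotalCatchupOutBytes.
//
//	defer s.gcbSubLast(&outb)              // settle whatever is still outstanding on exit
//	s.gcbAdd(&outb, int64(len(em)))        // before sending a catchup message
//	// ... in the ack handler:
//	s.gcbSub(&outb, ackReplySize(subject)) // credit local and global budgets back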
select { case nextBatchC <- struct{}{}: default: @@ -5535,9 +6434,6 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { defer s.sysUnsubscribe(ackSub) ackReplyT := strings.ReplaceAll(ackReply, ".*", ".%d") - // EOF - defer s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil) - const activityInterval = 5 * time.Second notActive := time.NewTimer(activityInterval) defer notActive.Stop() @@ -5547,23 +6443,40 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { mset.setCatchupPeer(sreq.Peer, last-seq) defer mset.clearCatchupPeer(sreq.Peer) - sendNextBatch := func() { - for ; seq <= last && atomic.LoadInt64(&outb) <= maxOutBytes && atomic.LoadInt32(&outm) <= maxOutMsgs; seq++ { - subj, hdr, msg, ts, err := mset.store.LoadMsg(seq) + sendNextBatchAndContinue := func() bool { + // Update our activity timer. + notActive.Reset(activityInterval) + + var smv StoreMsg + for ; seq <= last && atomic.LoadInt64(&outb) <= maxOutBytes && atomic.LoadInt32(&outm) <= maxOutMsgs && s.gcbTotal() <= maxTotalCatchupOutBytes; seq++ { + sm, err := mset.store.LoadMsg(seq, &smv) // if this is not a deleted msg, bail out. if err != nil && err != ErrStoreMsgNotFound && err != errDeletedMsg { - // break, something changed. - seq = last + 1 - return + s.Warnf("Error loading message for catchup '%s > %s': %v", mset.account(), mset.name(), err) + return false } // S2? - em := encodeStreamMsg(subj, _EMPTY_, hdr, msg, seq, ts) + var em []byte + if sm != nil { + em = encodeStreamMsg(sm.subj, _EMPTY_, sm.hdr, sm.msg, sm.seq, sm.ts) + } else { + // Skip record for deleted msg. + em = encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, seq, 0) + } // Place size in reply subject for flow control. - reply := fmt.Sprintf(ackReplyT, len(em)) - atomic.AddInt64(&outb, int64(len(em))) + l := int64(len(em)) + reply := fmt.Sprintf(ackReplyT, l) + s.gcbAdd(&outb, l) atomic.AddInt32(&outm, 1) s.sendInternalMsgLocked(sendSubject, reply, nil, em) + if seq == last { + s.Noticef("Catchup for stream '%s > %s' complete", mset.account(), mset.name()) + // EOF + s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil) + return false + } } + return true } // Grab stream quit channel. @@ -5577,6 +6490,9 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { // Run as long as we are still active and need catchup. // FIXME(dlc) - Purge event? Stream delete? for { + // Get this each time, will be non-nil if globally blocked and we will close to wake everyone up. + cbKick := s.cbKickChan() + select { case <-s.quitCh: return @@ -5586,12 +6502,11 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { s.Warnf("Catchup for stream '%s > %s' stalled", mset.account(), mset.name()) return case <-nextBatchC: - // Update our activity timer. - notActive.Reset(activityInterval) - sendNextBatch() - // Check if we are finished. 
- if seq > last { - s.Debugf("Done resync for stream '%s > %s'", mset.account(), mset.name()) + if !sendNextBatchAndContinue() { + return + } + case <-cbKick: + if !sendNextBatchAndContinue() { return } } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_errors_generated.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_errors_generated.go index 3ff7d762..be507d1a 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_errors_generated.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_errors_generated.go @@ -107,9 +107,15 @@ const ( // JSConsumerMaxDeliverBackoffErr max deliver is required to be > length of backoff values JSConsumerMaxDeliverBackoffErr ErrorIdentifier = 10116 + // JSConsumerMaxPendingAckExcessErrF consumer max ack pending exceeds system limit of {limit} + JSConsumerMaxPendingAckExcessErrF ErrorIdentifier = 10121 + // JSConsumerMaxPendingAckPolicyRequiredErr consumer requires ack policy for max ack pending JSConsumerMaxPendingAckPolicyRequiredErr ErrorIdentifier = 10082 + // JSConsumerMaxRequestBatchExceededF consumer max request batch exceeds server limit of {limit} + JSConsumerMaxRequestBatchExceededF ErrorIdentifier = 10125 + // JSConsumerMaxRequestBatchNegativeErr consumer max request batch needs to be > 0 JSConsumerMaxRequestBatchNegativeErr ErrorIdentifier = 10114 @@ -128,6 +134,9 @@ const ( // JSConsumerNotFoundErr consumer not found JSConsumerNotFoundErr ErrorIdentifier = 10014 + // JSConsumerOfflineErr consumer is offline + JSConsumerOfflineErr ErrorIdentifier = 10119 + // JSConsumerOnMappedErr consumer direct on a mapped consumer JSConsumerOnMappedErr ErrorIdentifier = 10092 @@ -146,6 +155,9 @@ const ( // JSConsumerReplacementWithDifferentNameErr consumer replacement durable config not the same JSConsumerReplacementWithDifferentNameErr ErrorIdentifier = 10106 + // JSConsumerReplicasExceedsStream consumer config replica count exceeds parent stream + JSConsumerReplicasExceedsStream ErrorIdentifier = 10126 + // JSConsumerSmallHeartbeatErr consumer idle heartbeat needs to be >= 100ms JSConsumerSmallHeartbeatErr ErrorIdentifier = 10083 @@ -182,7 +194,7 @@ const ( // JSMemoryResourcesExceededErr insufficient memory resources available JSMemoryResourcesExceededErr ErrorIdentifier = 10028 - // JSMirrorConsumerSetupFailedErrF Generic mirror consumer setup failure string ({err}) + // JSMirrorConsumerSetupFailedErrF generic mirror consumer setup failure string ({err}) JSMirrorConsumerSetupFailedErrF ErrorIdentifier = 10029 // JSMirrorMaxMessageSizeTooBigErr stream mirror must have max message size >= source @@ -197,12 +209,15 @@ const ( // JSMirrorWithSubjectFiltersErr stream mirrors can not contain filtered subjects JSMirrorWithSubjectFiltersErr ErrorIdentifier = 10033 - // JSMirrorWithSubjectsErr stream mirrors can not also contain subjects + // JSMirrorWithSubjectsErr stream mirrors can not contain subjects JSMirrorWithSubjectsErr ErrorIdentifier = 10034 // JSNoAccountErr account not found JSNoAccountErr ErrorIdentifier = 10035 + // JSNoLimitsErr no JetStream default or applicable tiered limit present + JSNoLimitsErr ErrorIdentifier = 10120 + // JSNoMessageFoundErr no message found JSNoMessageFoundErr ErrorIdentifier = 10037 @@ -278,15 +293,24 @@ const ( // JSStreamMaxBytesRequired account requires a stream config to have max bytes set JSStreamMaxBytesRequired ErrorIdentifier = 10113 + // JSStreamMaxStreamBytesExceeded stream max bytes exceeds account limit max stream bytes + JSStreamMaxStreamBytesExceeded 
ErrorIdentifier = 10122 + // JSStreamMessageExceedsMaximumErr message size exceeds maximum allowed JSStreamMessageExceedsMaximumErr ErrorIdentifier = 10054 - // JSStreamMirrorNotUpdatableErr Mirror configuration can not be updated + // JSStreamMirrorNotUpdatableErr stream mirror configuration can not be updated JSStreamMirrorNotUpdatableErr ErrorIdentifier = 10055 // JSStreamMismatchErr stream name in subject does not match request JSStreamMismatchErr ErrorIdentifier = 10056 + // JSStreamMoveAndScaleErr can not move and scale a stream in a single update + JSStreamMoveAndScaleErr ErrorIdentifier = 10123 + + // JSStreamMoveInProgress stream move already in progress + JSStreamMoveInProgress ErrorIdentifier = 10124 + // JSStreamMsgDeleteFailedF Generic message deletion failure error string ({err}) JSStreamMsgDeleteFailedF ErrorIdentifier = 10057 @@ -299,6 +323,9 @@ const ( // JSStreamNotMatchErr expected stream does not match JSStreamNotMatchErr ErrorIdentifier = 10060 + // JSStreamOfflineErr stream is offline + JSStreamOfflineErr ErrorIdentifier = 10118 + // JSStreamPurgeFailedF Generic stream purge failure error string ({err}) JSStreamPurgeFailedF ErrorIdentifier = 10110 @@ -390,19 +417,23 @@ var ( JSConsumerInvalidPolicyErrF: {Code: 400, ErrCode: 10094, Description: "{err}"}, JSConsumerInvalidSamplingErrF: {Code: 400, ErrCode: 10095, Description: "failed to parse consumer sampling configuration: {err}"}, JSConsumerMaxDeliverBackoffErr: {Code: 400, ErrCode: 10116, Description: "max deliver is required to be > length of backoff values"}, + JSConsumerMaxPendingAckExcessErrF: {Code: 400, ErrCode: 10121, Description: "consumer max ack pending exceeds system limit of {limit}"}, JSConsumerMaxPendingAckPolicyRequiredErr: {Code: 400, ErrCode: 10082, Description: "consumer requires ack policy for max ack pending"}, + JSConsumerMaxRequestBatchExceededF: {Code: 400, ErrCode: 10125, Description: "consumer max request batch exceeds server limit of {limit}"}, JSConsumerMaxRequestBatchNegativeErr: {Code: 400, ErrCode: 10114, Description: "consumer max request batch needs to be > 0"}, JSConsumerMaxRequestExpiresToSmall: {Code: 400, ErrCode: 10115, Description: "consumer max request expires needs to be >= 1ms"}, JSConsumerMaxWaitingNegativeErr: {Code: 400, ErrCode: 10087, Description: "consumer max waiting needs to be positive"}, JSConsumerNameExistErr: {Code: 400, ErrCode: 10013, Description: "consumer name already in use"}, JSConsumerNameTooLongErrF: {Code: 400, ErrCode: 10102, Description: "consumer name is too long, maximum allowed is {max}"}, JSConsumerNotFoundErr: {Code: 404, ErrCode: 10014, Description: "consumer not found"}, + JSConsumerOfflineErr: {Code: 500, ErrCode: 10119, Description: "consumer is offline"}, JSConsumerOnMappedErr: {Code: 400, ErrCode: 10092, Description: "consumer direct on a mapped consumer"}, JSConsumerPullNotDurableErr: {Code: 400, ErrCode: 10085, Description: "consumer in pull mode requires a durable name"}, JSConsumerPullRequiresAckErr: {Code: 400, ErrCode: 10084, Description: "consumer in pull mode requires ack policy"}, JSConsumerPullWithRateLimitErr: {Code: 400, ErrCode: 10086, Description: "consumer in pull mode can not have rate limit set"}, JSConsumerPushMaxWaitingErr: {Code: 400, ErrCode: 10080, Description: "consumer in push mode can not set max waiting"}, JSConsumerReplacementWithDifferentNameErr: {Code: 400, ErrCode: 10106, Description: "consumer replacement durable config not the same"}, + JSConsumerReplicasExceedsStream: {Code: 400, ErrCode: 10126, Description: 
"consumer config replica count exceeds parent stream"}, JSConsumerSmallHeartbeatErr: {Code: 400, ErrCode: 10083, Description: "consumer idle heartbeat needs to be >= 100ms"}, JSConsumerStoreFailedErrF: {Code: 500, ErrCode: 10104, Description: "error creating store for consumer: {err}"}, JSConsumerWQConsumerNotDeliverAllErr: {Code: 400, ErrCode: 10101, Description: "consumer must be deliver all on workqueue stream"}, @@ -420,8 +451,9 @@ var ( JSMirrorWithSourcesErr: {Code: 400, ErrCode: 10031, Description: "stream mirrors can not also contain other sources"}, JSMirrorWithStartSeqAndTimeErr: {Code: 400, ErrCode: 10032, Description: "stream mirrors can not have both start seq and start time configured"}, JSMirrorWithSubjectFiltersErr: {Code: 400, ErrCode: 10033, Description: "stream mirrors can not contain filtered subjects"}, - JSMirrorWithSubjectsErr: {Code: 400, ErrCode: 10034, Description: "stream mirrors can not also contain subjects"}, + JSMirrorWithSubjectsErr: {Code: 400, ErrCode: 10034, Description: "stream mirrors can not contain subjects"}, JSNoAccountErr: {Code: 503, ErrCode: 10035, Description: "account not found"}, + JSNoLimitsErr: {Code: 400, ErrCode: 10120, Description: "no JetStream default or applicable tiered limit present"}, JSNoMessageFoundErr: {Code: 404, ErrCode: 10037, Description: "no message found"}, JSNotEmptyRequestErr: {Code: 400, ErrCode: 10038, Description: "expected an empty request payload"}, JSNotEnabledErr: {Code: 503, ErrCode: 10076, Description: "JetStream not enabled"}, @@ -447,13 +479,17 @@ var ( JSStreamInvalidExternalDeliverySubjErrF: {Code: 400, ErrCode: 10024, Description: "stream external delivery prefix {prefix} must not contain wildcards"}, JSStreamLimitsErrF: {Code: 500, ErrCode: 10053, Description: "{err}"}, JSStreamMaxBytesRequired: {Code: 400, ErrCode: 10113, Description: "account requires a stream config to have max bytes set"}, + JSStreamMaxStreamBytesExceeded: {Code: 400, ErrCode: 10122, Description: "stream max bytes exceeds account limit max stream bytes"}, JSStreamMessageExceedsMaximumErr: {Code: 400, ErrCode: 10054, Description: "message size exceeds maximum allowed"}, - JSStreamMirrorNotUpdatableErr: {Code: 400, ErrCode: 10055, Description: "Mirror configuration can not be updated"}, + JSStreamMirrorNotUpdatableErr: {Code: 400, ErrCode: 10055, Description: "stream mirror configuration can not be updated"}, JSStreamMismatchErr: {Code: 400, ErrCode: 10056, Description: "stream name in subject does not match request"}, + JSStreamMoveAndScaleErr: {Code: 400, ErrCode: 10123, Description: "can not move and scale a stream in a single update"}, + JSStreamMoveInProgress: {Code: 400, ErrCode: 10124, Description: "stream move already in progress"}, JSStreamMsgDeleteFailedF: {Code: 500, ErrCode: 10057, Description: "{err}"}, JSStreamNameExistErr: {Code: 400, ErrCode: 10058, Description: "stream name already in use"}, JSStreamNotFoundErr: {Code: 404, ErrCode: 10059, Description: "stream not found"}, JSStreamNotMatchErr: {Code: 400, ErrCode: 10060, Description: "expected stream does not match"}, + JSStreamOfflineErr: {Code: 500, ErrCode: 10118, Description: "stream is offline"}, JSStreamPurgeFailedF: {Code: 500, ErrCode: 10110, Description: "{err}"}, JSStreamReplicasNotSupportedErr: {Code: 500, ErrCode: 10074, Description: "replicas > 1 not supported in non-clustered mode"}, JSStreamReplicasNotUpdatableErr: {Code: 400, ErrCode: 10061, Description: "Replicas configuration can not be updated"}, @@ -463,7 +499,7 @@ var ( JSStreamSequenceNotMatchErr: 
{Code: 503, ErrCode: 10063, Description: "expected stream sequence does not match"}, JSStreamSnapshotErrF: {Code: 500, ErrCode: 10064, Description: "snapshot failed: {err}"}, JSStreamStoreFailedF: {Code: 503, ErrCode: 10077, Description: "{err}"}, - JSStreamSubjectOverlapErr: {Code: 500, ErrCode: 10065, Description: "subjects overlap with an existing stream"}, + JSStreamSubjectOverlapErr: {Code: 400, ErrCode: 10065, Description: "subjects overlap with an existing stream"}, JSStreamTemplateCreateErrF: {Code: 500, ErrCode: 10066, Description: "{err}"}, JSStreamTemplateDeleteErrF: {Code: 500, ErrCode: 10067, Description: "{err}"}, JSStreamTemplateNotFoundErr: {Code: 404, ErrCode: 10068, Description: "template not found"}, @@ -861,6 +897,22 @@ func NewJSConsumerMaxDeliverBackoffError(opts ...ErrorOption) *ApiError { return ApiErrors[JSConsumerMaxDeliverBackoffErr] } +// NewJSConsumerMaxPendingAckExcessError creates a new JSConsumerMaxPendingAckExcessErrF error: "consumer max ack pending exceeds system limit of {limit}" +func NewJSConsumerMaxPendingAckExcessError(limit interface{}, opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + e := ApiErrors[JSConsumerMaxPendingAckExcessErrF] + args := e.toReplacerArgs([]interface{}{"{limit}", limit}) + return &ApiError{ + Code: e.Code, + ErrCode: e.ErrCode, + Description: strings.NewReplacer(args...).Replace(e.Description), + } +} + // NewJSConsumerMaxPendingAckPolicyRequiredError creates a new JSConsumerMaxPendingAckPolicyRequiredErr error: "consumer requires ack policy for max ack pending" func NewJSConsumerMaxPendingAckPolicyRequiredError(opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -871,6 +923,22 @@ func NewJSConsumerMaxPendingAckPolicyRequiredError(opts ...ErrorOption) *ApiErro return ApiErrors[JSConsumerMaxPendingAckPolicyRequiredErr] } +// NewJSConsumerMaxRequestBatchExceededError creates a new JSConsumerMaxRequestBatchExceededF error: "consumer max request batch exceeds server limit of {limit}" +func NewJSConsumerMaxRequestBatchExceededError(limit interface{}, opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + e := ApiErrors[JSConsumerMaxRequestBatchExceededF] + args := e.toReplacerArgs([]interface{}{"{limit}", limit}) + return &ApiError{ + Code: e.Code, + ErrCode: e.ErrCode, + Description: strings.NewReplacer(args...).Replace(e.Description), + } +} + // NewJSConsumerMaxRequestBatchNegativeError creates a new JSConsumerMaxRequestBatchNegativeErr error: "consumer max request batch needs to be > 0" func NewJSConsumerMaxRequestBatchNegativeError(opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -937,6 +1005,16 @@ func NewJSConsumerNotFoundError(opts ...ErrorOption) *ApiError { return ApiErrors[JSConsumerNotFoundErr] } +// NewJSConsumerOfflineError creates a new JSConsumerOfflineErr error: "consumer is offline" +func NewJSConsumerOfflineError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSConsumerOfflineErr] +} + // NewJSConsumerOnMappedError creates a new JSConsumerOnMappedErr error: "consumer direct on a mapped consumer" func NewJSConsumerOnMappedError(opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -997,6 +1075,16 @@ func NewJSConsumerReplacementWithDifferentNameError(opts ...ErrorOption) *ApiErr return ApiErrors[JSConsumerReplacementWithDifferentNameErr] } +// 
NewJSConsumerReplicasExceedsStreamError creates a new JSConsumerReplicasExceedsStream error: "consumer config replica count exceeds parent stream" +func NewJSConsumerReplicasExceedsStreamError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSConsumerReplicasExceedsStream] +} + // NewJSConsumerSmallHeartbeatError creates a new JSConsumerSmallHeartbeatErr error: "consumer idle heartbeat needs to be >= 100ms" func NewJSConsumerSmallHeartbeatError(opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -1179,7 +1267,7 @@ func NewJSMirrorWithSubjectFiltersError(opts ...ErrorOption) *ApiError { return ApiErrors[JSMirrorWithSubjectFiltersErr] } -// NewJSMirrorWithSubjectsError creates a new JSMirrorWithSubjectsErr error: "stream mirrors can not also contain subjects" +// NewJSMirrorWithSubjectsError creates a new JSMirrorWithSubjectsErr error: "stream mirrors can not contain subjects" func NewJSMirrorWithSubjectsError(opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) if ae, ok := eopts.err.(*ApiError); ok { @@ -1199,6 +1287,16 @@ func NewJSNoAccountError(opts ...ErrorOption) *ApiError { return ApiErrors[JSNoAccountErr] } +// NewJSNoLimitsError creates a new JSNoLimitsErr error: "no JetStream default or applicable tiered limit present" +func NewJSNoLimitsError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSNoLimitsErr] +} + // NewJSNoMessageFoundError creates a new JSNoMessageFoundErr error: "no message found" func NewJSNoMessageFoundError(opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -1527,6 +1625,16 @@ func NewJSStreamMaxBytesRequiredError(opts ...ErrorOption) *ApiError { return ApiErrors[JSStreamMaxBytesRequired] } +// NewJSStreamMaxStreamBytesExceededError creates a new JSStreamMaxStreamBytesExceeded error: "stream max bytes exceeds account limit max stream bytes" +func NewJSStreamMaxStreamBytesExceededError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSStreamMaxStreamBytesExceeded] +} + // NewJSStreamMessageExceedsMaximumError creates a new JSStreamMessageExceedsMaximumErr error: "message size exceeds maximum allowed" func NewJSStreamMessageExceedsMaximumError(opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -1537,7 +1645,7 @@ func NewJSStreamMessageExceedsMaximumError(opts ...ErrorOption) *ApiError { return ApiErrors[JSStreamMessageExceedsMaximumErr] } -// NewJSStreamMirrorNotUpdatableError creates a new JSStreamMirrorNotUpdatableErr error: "Mirror configuration can not be updated" +// NewJSStreamMirrorNotUpdatableError creates a new JSStreamMirrorNotUpdatableErr error: "stream mirror configuration can not be updated" func NewJSStreamMirrorNotUpdatableError(opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) if ae, ok := eopts.err.(*ApiError); ok { @@ -1557,6 +1665,26 @@ func NewJSStreamMismatchError(opts ...ErrorOption) *ApiError { return ApiErrors[JSStreamMismatchErr] } +// NewJSStreamMoveAndScaleError creates a new JSStreamMoveAndScaleErr error: "can not move and scale a stream in a single update" +func NewJSStreamMoveAndScaleError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSStreamMoveAndScaleErr] +} + +// NewJSStreamMoveInProgressError creates a new 
JSStreamMoveInProgress error: "stream move already in progress" +func NewJSStreamMoveInProgressError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSStreamMoveInProgress] +} + // NewJSStreamMsgDeleteFailedError creates a new JSStreamMsgDeleteFailedF error: "{err}" func NewJSStreamMsgDeleteFailedError(err error, opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -1603,6 +1731,16 @@ func NewJSStreamNotMatchError(opts ...ErrorOption) *ApiError { return ApiErrors[JSStreamNotMatchErr] } +// NewJSStreamOfflineError creates a new JSStreamOfflineErr error: "stream is offline" +func NewJSStreamOfflineError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSStreamOfflineErr] +} + // NewJSStreamPurgeFailedError creates a new JSStreamPurgeFailedF error: "{err}" func NewJSStreamPurgeFailedError(err error, opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_events.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_events.go index 6497b84d..2e1c7313 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_events.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_events.go @@ -110,6 +110,21 @@ type JSConsumerDeliveryExceededAdvisory struct { // JSConsumerDeliveryExceededAdvisoryType is the schema type for JSConsumerDeliveryExceededAdvisory const JSConsumerDeliveryExceededAdvisoryType = "io.nats.jetstream.advisory.v1.max_deliver" +// JSConsumerDeliveryNakAdvisory is an advisory informing that a message was +// naked by the consumer +type JSConsumerDeliveryNakAdvisory struct { + TypedEvent + Stream string `json:"stream"` + Consumer string `json:"consumer"` + ConsumerSeq uint64 `json:"consumer_seq"` + StreamSeq uint64 `json:"stream_seq"` + Deliveries uint64 `json:"deliveries"` + Domain string `json:"domain,omitempty"` +} + +// JSConsumerDeliveryNakAdvisoryType is the schema type for JSConsumerDeliveryNakAdvisory +const JSConsumerDeliveryNakAdvisoryType = "io.nats.jetstream.advisory.v1.nak" + // JSConsumerDeliveryTerminatedAdvisory is an advisory informing that a message was // terminated by the consumer, so might be a candidate for DLQ handling type JSConsumerDeliveryTerminatedAdvisory struct { diff --git a/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go b/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go index 40f068e1..78adef19 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go @@ -56,6 +56,10 @@ const leafNodeLoopDetectionSubjectPrefix = "$LDS." // LEAF connection as opposed to a CLIENT. const leafNodeWSPath = "/leafnode" +// This is the time the server will wait, when receiving a CONNECT, +// before closing the connection if the required minimum version is not met. +const leafNodeWaitBeforeClose = 5 * time.Second + type leaf struct { // We have any auth stuff here for solicited connections. remote *leafNodeCfg @@ -252,6 +256,18 @@ func validateLeafNode(o *Options) error { if o.LeafNode.Port == 0 { return nil } + + // If MinVersion is defined, check that it is valid. + if mv := o.LeafNode.MinVersion; mv != _EMPTY_ { + if err := checkLeafMinVersionConfig(mv); err != nil { + return err + } + } + + // The checks below will be done only when detecting that we are configured + // with gateways. 
So if an option validation needs to be done regardless, + // it MUST be done before this point! + if o.Gateway.Name == "" && o.Gateway.Port == 0 { return nil } @@ -266,6 +282,17 @@ func validateLeafNode(o *Options) error { return nil } +func checkLeafMinVersionConfig(mv string) error { + if ok, err := versionAtLeastCheckError(mv, 2, 8, 0); !ok || err != nil { + if err != nil { + return fmt.Errorf("invalid leafnode's minimum version: %v", err) + } else { + return fmt.Errorf("the minimum version should be at least 2.8.0") + } + } + return nil +} + // Used to validate user names in LeafNode configuration. // - rejects mix of single and multiple users. // - rejects duplicate user names. @@ -613,6 +640,7 @@ var credsRe = regexp.MustCompile(`\s*(?:(?:[-]{3,}[^\n]*[-]{3,}\n)(.+)(?:\n\s*[- func (c *client) sendLeafConnect(clusterName string, tlsRequired, headers bool) error { // We support basic user/pass and operator based user JWT with signatures. cinfo := leafConnectInfo{ + Version: VERSION, TLS: tlsRequired, ID: c.srv.info.ID, Domain: c.srv.info.Domain, @@ -1316,6 +1344,7 @@ func (s *Server) removeLeafNodeConnection(c *client) { // Connect information for solicited leafnodes. type leafConnectInfo struct { + Version string `json:"version,omitempty"` JWT string `json:"jwt,omitempty"` Sig string `json:"sig,omitempty"` User string `json:"user,omitempty"` @@ -1363,6 +1392,25 @@ func (c *client) processLeafNodeConnect(s *Server, arg []byte, lang string) erro return ErrWrongGateway } + if mv := s.getOpts().LeafNode.MinVersion; mv != _EMPTY_ { + major, minor, update, _ := versionComponents(mv) + if !versionAtLeast(proto.Version, major, minor, update) { + // We are going to send back an INFO because otherwise recent + // versions of the remote server would simply break the connection + // after 2 seconds if not receiving it. Instead, we want the + // other side to just "stall" until we finish waiting for the holding + // period and close the connection below. + s.sendPermsAndAccountInfo(c) + c.sendErrAndErr(fmt.Sprintf("connection rejected since minimum version required is %q", mv)) + select { + case <-c.srv.quitCh: + case <-time.After(leafNodeWaitBeforeClose): + } + c.closeConnection(MinimumVersionRequired) + return ErrMinimumVersionRequired + } + } + // Check if this server supports headers. supportHeaders := c.srv.supportsHeaders() diff --git a/vendor/github.com/nats-io/nats-server/v2/server/log.go b/vendor/github.com/nats-io/nats-server/v2/server/log.go index 2bfe25c2..4b9193c0 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/log.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/log.go @@ -14,9 +14,11 @@ package server import ( + "fmt" "io" "os" "sync/atomic" + "time" srvlog "github.com/nats-io/nats-server/v2/logger" ) @@ -204,6 +206,14 @@ func (s *Server) Warnf(format string, v ...interface{}) { }, format, v...) } +func (s *Server) RateLimitWarnf(format string, v ...interface{}) { + statement := fmt.Sprintf(format, v...) 
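// The LoadOrStore below is what makes this a one-shot per distinct statement: loaded is
// true when an identical formatted message was already recorded, so only the first
// occurrence reaches Warnf. A minimal usage sketch (the subject literal is hypothetical):
//
//	for i := 0; i < 100; i++ {
//		s.RateLimitWarnf("Internal subscription on %q took too long", "$JS.API.>")
//	}
//	// -> a single warning line; when the rateLimitLogging map is flushed so the
//	//    statement can fire again is handled elsewhere and is not shown in this hunk.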
+ if _, loaded := s.rateLimitLogging.LoadOrStore(statement, time.Now()); loaded { + return + } + s.Warnf("%s", statement) +} + // Fatalf logs a fatal error func (s *Server) Fatalf(format string, v ...interface{}) { s.executeLogCall(func(logger Logger, format string, v ...interface{}) { diff --git a/vendor/github.com/nats-io/nats-server/v2/server/memstore.go b/vendor/github.com/nats-io/nats-server/v2/server/memstore.go index 9e2b16c1..a6adb84f 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/memstore.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/memstore.go @@ -1,4 +1,4 @@ -// Copyright 2019-2021 The NATS Authors +// Copyright 2019-2022 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -26,7 +26,7 @@ type memStore struct { mu sync.RWMutex cfg StreamConfig state StreamState - msgs map[uint64]*storedMsg + msgs map[uint64]*StoreMsg fss map[string]*SimpleState maxp int64 scb StorageUpdateHandler @@ -34,14 +34,6 @@ type memStore struct { consumers int } -type storedMsg struct { - subj string - hdr []byte - msg []byte - seq uint64 - ts int64 // nanoseconds -} - func newMemStore(cfg *StreamConfig) (*memStore, error) { if cfg == nil { return nil, fmt.Errorf("config required") @@ -50,7 +42,7 @@ func newMemStore(cfg *StreamConfig) (*memStore, error) { return nil, fmt.Errorf("memStore requires memory storage type in config") } ms := &memStore{ - msgs: make(map[uint64]*storedMsg), + msgs: make(map[uint64]*StoreMsg), fss: make(map[string]*SimpleState), maxp: cfg.MaxMsgsPer, cfg: *cfg, @@ -147,7 +139,15 @@ func (ms *memStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts int hdr = copyBytes(hdr) } - ms.msgs[seq] = &storedMsg{subj, hdr, msg, seq, ts} + // FIXME(dlc) - Could pool at this level? + sm := &StoreMsg{subj, nil, nil, make([]byte, 0, len(hdr)+len(msg)), seq, ts} + sm.buf = append(sm.buf, hdr...) + sm.buf = append(sm.buf, msg...) + if len(hdr) > 0 { + sm.hdr = sm.buf[:len(hdr)] + } + sm.msg = sm.buf[len(hdr):] + ms.msgs[seq] = sm ms.state.Msgs++ ms.state.Bytes += memStoreMsgSize(subj, hdr, msg) ms.state.LastSeq = seq @@ -428,6 +428,10 @@ func (ms *memStore) expireMsgs() { // PurgeEx will remove messages based on subject filters, sequence and number of messages to keep. // Will return the number of purged messages. 
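// With the new ErrPurgeArgMismatch guard a caller picks either a sequence cutoff or a
// keep count, never both. A minimal sketch of the resulting semantics (the subject
// filter is hypothetical):
//
//	ms.PurgeEx("orders.*", 0, 10)    // keep only the 10 newest matching messages
//	ms.PurgeEx("orders.*", 1000, 0)  // purge matching messages below sequence 1000
//	ms.PurgeEx("orders.*", 1000, 10) // rejected with ErrPurgeArgMismatch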
func (ms *memStore) PurgeEx(subject string, sequence, keep uint64) (purged uint64, err error) { + if sequence > 1 && keep > 0 { + return 0, ErrPurgeArgMismatch + } + if subject == _EMPTY_ || subject == fwcs { if keep == 0 && (sequence == 0 || sequence == 1) { return ms.Purge() @@ -455,7 +459,7 @@ func (ms *memStore) PurgeEx(subject string, sequence, keep uint64) (purged uint6 ss.Msgs -= keep } last := ss.Last - if sequence > 0 { + if sequence > 1 { last = sequence - 1 } ms.mu.Lock() @@ -485,7 +489,7 @@ func (ms *memStore) Purge() (uint64, error) { ms.state.FirstTime = time.Time{} ms.state.Bytes = 0 ms.state.Msgs = 0 - ms.msgs = make(map[uint64]*storedMsg) + ms.msgs = make(map[uint64]*StoreMsg) ms.fss = make(map[string]*SimpleState) ms.mu.Unlock() @@ -536,7 +540,7 @@ func (ms *memStore) Compact(seq uint64) (uint64, error) { ms.state.FirstSeq = seq ms.state.FirstTime = time.Time{} ms.state.LastSeq = seq - 1 - ms.msgs = make(map[uint64]*storedMsg) + ms.msgs = make(map[uint64]*StoreMsg) } ms.mu.Unlock() @@ -592,7 +596,7 @@ func (ms *memStore) deleteFirstMsg() bool { } // LoadMsg will lookup the message by sequence number and return it if found. -func (ms *memStore) LoadMsg(seq uint64) (string, []byte, []byte, int64, error) { +func (ms *memStore) LoadMsg(seq uint64, smp *StoreMsg) (*StoreMsg, error) { ms.mu.RLock() sm, ok := ms.msgs[seq] last := ms.state.LastSeq @@ -603,15 +607,20 @@ func (ms *memStore) LoadMsg(seq uint64) (string, []byte, []byte, int64, error) { if seq <= last { err = ErrStoreMsgNotFound } - return _EMPTY_, nil, nil, 0, err + return nil, err + } + + if smp == nil { + smp = new(StoreMsg) } - return sm.subj, sm.hdr, sm.msg, sm.ts, nil + sm.copy(smp) + return smp, nil } // LoadLastMsg will return the last message we have that matches a given subject. // The subject can be a wildcard. -func (ms *memStore) LoadLastMsg(subject string) (subj string, seq uint64, hdr, msg []byte, ts int64, err error) { - var sm *storedMsg +func (ms *memStore) LoadLastMsg(subject string, smp *StoreMsg) (*StoreMsg, error) { + var sm *StoreMsg var ok bool ms.mu.RLock() @@ -623,14 +632,19 @@ func (ms *memStore) LoadLastMsg(subject string) (subj string, seq uint64, hdr, m sm, ok = ms.msgs[ss.Last] } if !ok || sm == nil { - return _EMPTY_, 0, nil, nil, 0, ErrStoreMsgNotFound + return nil, ErrStoreMsgNotFound + } + + if smp == nil { + smp = new(StoreMsg) } - return sm.subj, sm.seq, sm.hdr, sm.msg, sm.ts, nil + sm.copy(smp) + return smp, nil } // LoadNextMsg will find the next message matching the filter subject starting at the start sequence. // The filter subject can be a wildcard. -func (ms *memStore) LoadNextMsg(filter string, wc bool, start uint64) (subj string, seq uint64, hdr, msg []byte, ts int64, err error) { +func (ms *memStore) LoadNextMsg(filter string, wc bool, start uint64, smp *StoreMsg) (*StoreMsg, uint64, error) { ms.mu.RLock() defer ms.mu.RUnlock() @@ -640,7 +654,7 @@ func (ms *memStore) LoadNextMsg(filter string, wc bool, start uint64) (subj stri // If past the end no results. 
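// The Load* methods now fill an optional caller-supplied *StoreMsg so hot paths can avoid
// a per-message allocation, and LoadNextMsg reports the sequence it stopped at even on
// ErrStoreEOF. A minimal caller sketch (the filter is hypothetical):
//
//	var smv StoreMsg
//	sm, nseq, err := ms.LoadNextMsg("orders.*", true, start, &smv)
//	if err == ErrStoreEOF {
//		// nothing matches at or after start; nseq holds the stream's last sequence
//	}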
if start > ms.state.LastSeq { - return _EMPTY_, ms.state.LastSeq, nil, nil, 0, ErrStoreEOF + return nil, ms.state.LastSeq, ErrStoreEOF } isAll := filter == _EMPTY_ || filter == fwcs @@ -677,10 +691,14 @@ func (ms *memStore) LoadNextMsg(filter string, wc bool, start uint64) (subj stri for nseq := fseq; nseq <= lseq; nseq++ { if sm, ok := ms.msgs[nseq]; ok && (isAll || eq(sm.subj, filter)) { - return sm.subj, nseq, sm.hdr, sm.msg, sm.ts, nil + if smp == nil { + smp = new(StoreMsg) + } + sm.copy(smp) + return smp, nseq, nil } } - return _EMPTY_, ms.state.LastSeq, nil, nil, 0, ErrStoreEOF + return nil, ms.state.LastSeq, ErrStoreEOF } // RemoveMsg will remove the message from this store. @@ -707,7 +725,7 @@ func (ms *memStore) updateFirstSeq(seq uint64) { // Interior delete. return } - var nsm *storedMsg + var nsm *StoreMsg var ok bool for nseq := ms.state.FirstSeq + 1; nseq <= ms.state.LastSeq; nseq++ { if nsm, ok = ms.msgs[nseq]; ok { @@ -739,7 +757,12 @@ func (ms *memStore) removeSeqPerSubject(subj string, seq uint64) { if seq != ss.First { return } - // TODO(dlc) - Might want to optimize this. + // If we know we only have 1 msg left don't need to search for next first. + if ss.Msgs == 1 { + ss.First = ss.Last + return + } + // TODO(dlc) - Might want to optimize this longer term. for tseq := seq + 1; tseq <= ss.Last; tseq++ { if sm := ms.msgs[tseq]; sm != nil && sm.subj == subj { ss.First = tseq @@ -866,55 +889,312 @@ func (ms *memStore) Stop() error { return nil } -func (ms *memStore) incConsumers() { +func (ms *memStore) isClosed() bool { + ms.mu.RLock() + defer ms.mu.RUnlock() + return ms.msgs == nil +} + +type consumerMemStore struct { + mu sync.Mutex + ms StreamStore + cfg ConsumerConfig + state ConsumerState + closed bool +} + +func (ms *memStore) ConsumerStore(name string, cfg *ConsumerConfig) (ConsumerStore, error) { + if ms == nil { + return nil, fmt.Errorf("memstore is nil") + } + if ms.isClosed() { + return nil, ErrStoreClosed + } + if cfg == nil || name == _EMPTY_ { + return nil, fmt.Errorf("bad consumer config") + } + o := &consumerMemStore{ms: ms, cfg: *cfg} + ms.AddConsumer(o) + return o, nil +} + +func (ms *memStore) AddConsumer(o ConsumerStore) error { ms.mu.Lock() ms.consumers++ ms.mu.Unlock() + return nil } -func (ms *memStore) decConsumers() { +func (ms *memStore) RemoveConsumer(o ConsumerStore) error { ms.mu.Lock() if ms.consumers > 0 { ms.consumers-- } ms.mu.Unlock() + return nil } -type consumerMemStore struct { - ms *memStore +func (ms *memStore) Snapshot(_ time.Duration, _, _ bool) (*SnapshotResult, error) { + return nil, fmt.Errorf("no impl") +} + +func (o *consumerMemStore) Update(state *ConsumerState) error { + // Sanity checks. + if state.AckFloor.Consumer > state.Delivered.Consumer { + return fmt.Errorf("bad ack floor for consumer") + } + if state.AckFloor.Stream > state.Delivered.Stream { + return fmt.Errorf("bad ack floor for stream") + } + + // Copy to our state. 
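// Taken together, the sanity checks above and the per-sequence range check in the copy
// loop below mean that, for example, a state with
//
//	Delivered: {Consumer: 10, Stream: 20}, AckFloor: {Consumer: 7, Stream: 15}
//
// may only carry Pending entries for stream sequences 16..20; an entry at 15 (already at
// or below the ack floor) or at 21 (never delivered) is rejected before the state is
// swapped in.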
+ var pending map[uint64]*Pending + var redelivered map[uint64]uint64 + if len(state.Pending) > 0 { + pending = make(map[uint64]*Pending, len(state.Pending)) + for seq, p := range state.Pending { + pending[seq] = &Pending{p.Sequence, p.Timestamp} + } + for seq := range pending { + if seq <= state.AckFloor.Stream || seq > state.Delivered.Stream { + return fmt.Errorf("bad pending entry, sequence [%d] out of range", seq) + } + } + } + if len(state.Redelivered) > 0 { + redelivered = make(map[uint64]uint64, len(state.Redelivered)) + for seq, dc := range state.Redelivered { + redelivered[seq] = dc + } + } + + // Replace our state. + o.mu.Lock() + + // Check to see if this is an outdated update. + if state.Delivered.Consumer < o.state.Delivered.Consumer { + o.mu.Unlock() + return fmt.Errorf("old update ignored") + } + + o.state.Delivered = state.Delivered + o.state.AckFloor = state.AckFloor + o.state.Pending = pending + o.state.Redelivered = redelivered + o.mu.Unlock() + + return nil } -func (ms *memStore) ConsumerStore(_ string, _ *ConsumerConfig) (ConsumerStore, error) { - ms.incConsumers() - return &consumerMemStore{ms}, nil +// SetStarting sets our starting stream sequence. +func (o *consumerMemStore) SetStarting(sseq uint64) error { + o.mu.Lock() + o.state.Delivered.Stream = sseq + o.mu.Unlock() + return nil } -func (ms *memStore) Snapshot(_ time.Duration, _, _ bool) (*SnapshotResult, error) { - return nil, fmt.Errorf("no impl") +// HasState returns if this store has a recorded state. +func (o *consumerMemStore) HasState() bool { + return false } -// No-ops. -func (os *consumerMemStore) Update(_ *ConsumerState) error { return nil } -func (os *consumerMemStore) UpdateDelivered(_, _, _ uint64, _ int64) error { return nil } -func (os *consumerMemStore) UpdateAcks(_, _ uint64) error { return nil } -func (os *consumerMemStore) UpdateConfig(_ *ConsumerConfig) error { return nil } +func (o *consumerMemStore) UpdateDelivered(dseq, sseq, dc uint64, ts int64) error { + o.mu.Lock() + defer o.mu.Unlock() + + if dc != 1 && o.cfg.AckPolicy == AckNone { + return ErrNoAckPolicy + } -func (os *consumerMemStore) Stop() error { - os.ms.decConsumers() + if dseq <= o.state.AckFloor.Consumer { + return nil + } + + // See if we expect an ack for this. + if o.cfg.AckPolicy != AckNone { + // Need to create pending records here. + if o.state.Pending == nil { + o.state.Pending = make(map[uint64]*Pending) + } + var p *Pending + // Check for an update to a message already delivered. + if sseq <= o.state.Delivered.Stream { + if p = o.state.Pending[sseq]; p != nil { + p.Sequence, p.Timestamp = dseq, ts + } + } else { + // Add to pending. + o.state.Pending[sseq] = &Pending{dseq, ts} + } + // Update delivered as needed. + if dseq > o.state.Delivered.Consumer { + o.state.Delivered.Consumer = dseq + } + if sseq > o.state.Delivered.Stream { + o.state.Delivered.Stream = sseq + } + + if dc > 1 { + if o.state.Redelivered == nil { + o.state.Redelivered = make(map[uint64]uint64) + } + o.state.Redelivered[sseq] = dc - 1 + } + } else { + // For AckNone just update delivered and ackfloor at the same time. 
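// With AckPolicy AckNone no Pending entry is ever created, so the ack floor simply tracks
// delivery. For example, starting from an empty state,
//
//	o.UpdateDelivered(5, 12, 1, time.Now().UnixNano())
//
// leaves both Delivered and AckFloor at {Consumer: 5, Stream: 12}.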
+ o.state.Delivered.Consumer = dseq + o.state.Delivered.Stream = sseq + o.state.AckFloor.Consumer = dseq + o.state.AckFloor.Stream = sseq + } + + return nil +} + +func (o *consumerMemStore) UpdateAcks(dseq, sseq uint64) error { + o.mu.Lock() + defer o.mu.Unlock() + + if o.cfg.AckPolicy == AckNone { + return ErrNoAckPolicy + } + if len(o.state.Pending) == 0 || o.state.Pending[sseq] == nil { + return ErrStoreMsgNotFound + } + + // On restarts the old leader may get a replay from the raft logs that are old. + if dseq <= o.state.AckFloor.Consumer { + return nil + } + + // Check for AckAll here. + if o.cfg.AckPolicy == AckAll { + sgap := sseq - o.state.AckFloor.Stream + o.state.AckFloor.Consumer = dseq + o.state.AckFloor.Stream = sseq + for seq := sseq; seq > sseq-sgap; seq-- { + delete(o.state.Pending, seq) + if len(o.state.Redelivered) > 0 { + delete(o.state.Redelivered, seq) + } + } + return nil + } + + // AckExplicit + + // First delete from our pending state. + if p, ok := o.state.Pending[sseq]; ok { + delete(o.state.Pending, sseq) + dseq = p.Sequence // Use the original. + } + // Now remove from redelivered. + if len(o.state.Redelivered) > 0 { + delete(o.state.Redelivered, sseq) + } + + if len(o.state.Pending) == 0 { + o.state.AckFloor.Consumer = o.state.Delivered.Consumer + o.state.AckFloor.Stream = o.state.Delivered.Stream + } else if dseq == o.state.AckFloor.Consumer+1 { + first := o.state.AckFloor.Consumer == 0 + o.state.AckFloor.Consumer = dseq + o.state.AckFloor.Stream = sseq + + if !first && o.state.Delivered.Consumer > dseq { + for ss := sseq + 1; ss < o.state.Delivered.Stream; ss++ { + if p, ok := o.state.Pending[ss]; ok { + if p.Sequence > 0 { + o.state.AckFloor.Consumer = p.Sequence - 1 + o.state.AckFloor.Stream = ss - 1 + } + break + } + } + } + } + + return nil +} + +func (o *consumerMemStore) UpdateConfig(cfg *ConsumerConfig) error { + o.mu.Lock() + defer o.mu.Unlock() + + // This is mostly unchecked here. We are assuming the upper layers have done sanity checking. + o.cfg = *cfg return nil } -func (os *consumerMemStore) Delete() error { - return os.Stop() +func (o *consumerMemStore) Stop() error { + o.mu.Lock() + o.closed = true + ms := o.ms + o.mu.Unlock() + ms.RemoveConsumer(o) + return nil +} + +func (o *consumerMemStore) Delete() error { + return o.Stop() +} + +func (o *consumerMemStore) StreamDelete() error { + return o.Stop() +} + +func (o *consumerMemStore) State() (*ConsumerState, error) { + o.mu.Lock() + defer o.mu.Unlock() + + if o.closed { + return nil, ErrStoreClosed + } + + state := &ConsumerState{} + + state.Delivered = o.state.Delivered + state.AckFloor = o.state.AckFloor + if len(o.state.Pending) > 0 { + state.Pending = o.copyPending() + } + if len(o.state.Redelivered) > 0 { + state.Redelivered = o.copyRedelivered() + } + return state, nil } -func (os *consumerMemStore) StreamDelete() error { - return os.Stop() + +// EncodeState for this consumer store. 
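// EncodedState() below hands back the binary form of Delivered/AckFloor/Pending/Redelivered
// produced by encodeConsumerState(), presumably the same encoding the file-based consumer
// store uses, so it can feed snapshots and replication. A minimal sketch:
//
//	buf, err := o.EncodedState()
//	// buf is decoded back into a *ConsumerState by the matching decoder, which lives
//	// elsewhere in the server and is not shown in this hunk.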
+func (o *consumerMemStore) EncodedState() ([]byte, error) { + o.mu.Lock() + defer o.mu.Unlock() + + if o.closed { + return nil, ErrStoreClosed + } + + return encodeConsumerState(&o.state), nil } -func (os *consumerMemStore) State() (*ConsumerState, error) { return nil, nil } +func (o *consumerMemStore) copyPending() map[uint64]*Pending { + pending := make(map[uint64]*Pending, len(o.state.Pending)) + for seq, p := range o.state.Pending { + pending[seq] = &Pending{p.Sequence, p.Timestamp} + } + return pending +} + +func (o *consumerMemStore) copyRedelivered() map[uint64]uint64 { + redelivered := make(map[uint64]uint64, len(o.state.Redelivered)) + for seq, dc := range o.state.Redelivered { + redelivered[seq] = dc + } + return redelivered +} // Type returns the type of the underlying store. -func (os *consumerMemStore) Type() StorageType { return MemoryStorage } +func (o *consumerMemStore) Type() StorageType { return MemoryStorage } // Templates type templateMemStore struct{} diff --git a/vendor/github.com/nats-io/nats-server/v2/server/monitor.go b/vendor/github.com/nats-io/nats-server/v2/server/monitor.go index 25b7c21c..9c257911 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/monitor.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/monitor.go @@ -749,10 +749,14 @@ type RouteInfo struct { IsConfigured bool `json:"is_configured"` IP string `json:"ip"` Port int `json:"port"` + Start time.Time `json:"start"` + LastActivity time.Time `json:"last_activity"` + RTT string `json:"rtt,omitempty"` + Uptime string `json:"uptime"` + Idle string `json:"idle"` Import *SubjectPermission `json:"import,omitempty"` Export *SubjectPermission `json:"export,omitempty"` Pending int `json:"pending_size"` - RTT string `json:"rtt,omitempty"` InMsgs int64 `json:"in_msgs"` OutMsgs int64 `json:"out_msgs"` InBytes int64 `json:"in_bytes"` @@ -799,6 +803,10 @@ func (s *Server) Routez(routezOpts *RoutezOptions) (*Routez, error) { Import: r.opts.Import, Export: r.opts.Export, RTT: r.getRTT().String(), + Start: r.start, + LastActivity: r.last, + Uptime: myUptime(rs.Now.Sub(r.start)), + Idle: myUptime(rs.Now.Sub(r.last)), } if len(r.subs) > 0 { @@ -1077,6 +1085,38 @@ func (s *Server) HandleStacksz(w http.ResponseWriter, r *http.Request) { ResponseHandler(w, r, buf[:n]) } +type monitorIPQueue struct { + Pending int `json:"pending"` + InProgress int `json:"in_progress,omitempty"` +} + +func (s *Server) HandleIPQueuesz(w http.ResponseWriter, r *http.Request) { + all, err := decodeBool(w, r, "all") + if err != nil { + return + } + qfilter := r.URL.Query().Get("queues") + + queues := map[string]monitorIPQueue{} + + s.ipQueues.Range(func(k, v interface{}) bool { + name := k.(string) + queue := v.(*ipQueue) + pending := queue.len() + inProgress := int(queue.inProgress()) + if !all && (pending == 0 && inProgress == 0) { + return true + } else if qfilter != _EMPTY_ && !strings.Contains(name, qfilter) { + return true + } + queues[name] = monitorIPQueue{Pending: pending, InProgress: inProgress} + return true + }) + + b, _ := json.MarshalIndent(queues, "", " ") + ResponseHandler(w, r, b) +} + // Varz will output server information on the monitoring port at /varz. 
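// HandleIPQueuesz above reports the registered in-process queues as JSON, honoring the
// "all" and "queues" query parameters. Roughly (the mount path and the queue name shown
// are assumptions; the route registration is not part of this hunk):
//
//	GET /ipqueuesz?all=1&queues=sync
//	{
//	  "$SYS.stream.sync.ipq": { "pending": 3, "in_progress": 1 }
//	}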
type Varz struct { ID string `json:"server_id"` @@ -1108,6 +1148,8 @@ type Varz struct { Cluster ClusterOptsVarz `json:"cluster,omitempty"` Gateway GatewayOptsVarz `json:"gateway,omitempty"` LeafNode LeafNodeOptsVarz `json:"leaf,omitempty"` + MQTT MQTTOptsVarz `json:"mqtt,omitempty"` + Websocket WebsocketOptsVarz `json:"websocket,omitempty"` JetStream JetStreamVarz `json:"jetstream,omitempty"` TLSTimeout float64 `json:"tls_timeout"` WriteDeadline time.Duration `json:"write_deadline"` @@ -1204,6 +1246,37 @@ type RemoteLeafOptsVarz struct { Deny *DenyRules `json:"deny,omitempty"` } +// MQTTOptsVarz contains monitoring MQTT information +type MQTTOptsVarz struct { + Host string `json:"host,omitempty"` + Port int `json:"port,omitempty"` + NoAuthUser string `json:"no_auth_user,omitempty"` + AuthTimeout float64 `json:"auth_timeout,omitempty"` + TLSMap bool `json:"tls_map,omitempty"` + TLSTimeout float64 `json:"tls_timeout,omitempty"` + TLSPinnedCerts []string `json:"tls_pinned_certs,omitempty"` + JsDomain string `json:"js_domain,omitempty"` + AckWait time.Duration `json:"ack_wait,omitempty"` + MaxAckPending uint16 `json:"max_ack_pending,omitempty"` +} + +// WebsocketOptsVarz contains monitoring websocket information +type WebsocketOptsVarz struct { + Host string `json:"host,omitempty"` + Port int `json:"port,omitempty"` + Advertise string `json:"advertise,omitempty"` + NoAuthUser string `json:"no_auth_user,omitempty"` + JWTCookie string `json:"jwt_cookie,omitempty"` + HandshakeTimeout time.Duration `json:"handshake_timeout,omitempty"` + AuthTimeout float64 `json:"auth_timeout,omitempty"` + NoTLS bool `json:"no_tls,omitempty"` + TLSMap bool `json:"tls_map,omitempty"` + TLSPinnedCerts []string `json:"tls_pinned_certs,omitempty"` + SameOrigin bool `json:"same_origin,omitempty"` + AllowedOrigins []string `json:"allowed_origins,omitempty"` + Compression bool `json:"compression,omitempty"` +} + // VarzOptions are the options passed to Varz(). // Currently, there are no options defined. type VarzOptions struct{} @@ -1241,20 +1314,36 @@ func (s *Server) HandleRoot(w http.ResponseWriter, r *http.Request) { s.mu.Lock() s.httpReqStats[RootPath]++ s.mu.Unlock() + + // Calculate source url. If git set go directly to that tag, otherwise just main. + var srcUrl string + if gitCommit == _EMPTY_ { + srcUrl = "https://github.com/nats-io/nats-server" + } else { + srcUrl = fmt.Sprintf("https://github.com/nats-io/nats-server/tree/%s", gitCommit) + } + fmt.Fprintf(w, ` -