@@ -14,84 +14,9 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

-/*
-Counting number of items in garbage collection index
-
-The number of items in garbage collection index is not the same as the number of
-chunks in retrieval index (total number of stored chunks). Chunk can be garbage
-collected only when it is set to a synced state by ModSetSync, and only then can
-be counted into garbage collection size, which determines whether a number of
-chunk should be removed from the storage by the garbage collection. This opens a
-possibility that the storage size exceeds the limit if files are locally
-uploaded and the node is not connected to other nodes or there is a problem with
-syncing.
-
-Tracking of garbage collection size (gcSize) is focused on performance. Key
-points:
-
- 1. counting the number of key/value pairs in LevelDB takes around 0.7s for 1e6
-    on a very fast ssd (unacceptable long time in reality)
- 2. locking leveldb batch writes with a global mutex (serial batch writes) is
-    not acceptable, we should use locking per chunk address
-
-Because of point 1. we cannot count the number of items in garbage collection
-index in New constructor as it could last very long for realistic scenarios
-where limit is 5e6 and nodes are running on slower hdd disks or cloud providers
-with low IOPS.
-
-Point 2. is a performance optimization to allow parallel batch writes with
-getters, putters and setters. Every single batch that they create contain only
-information related to a single chunk, no relations with other chunks or shared
-statistical data (like gcSize). This approach avoids race conditions on writing
-batches in parallel, but creates a problem of synchronizing statistical data
-values like gcSize. With global mutex lock, any data could be written by any
-batch, but would not use utilize the full potential of leveldb parallel writes.
-
-To mitigate this two problems, the implementation of counting and persisting
-gcSize is split into two parts. One is the in-memory value (gcSize) that is fast
-to read and write with a dedicated mutex (gcSizeMu) if the batch which adds or
-removes items from garbage collection index is successful. The second part is
-the reliable persistence of this value to leveldb database, as storedGCSize
-field. This database field is saved by writeGCSizeWorker and writeGCSize
-functions when in-memory gcSize variable is changed, but no too often to avoid
-very frequent database writes. This database writes are triggered by
-writeGCSizeTrigger when a call is made to function incGCSize. Trigger ensures
-that no database writes are done only when gcSize is changed (contrary to a
-simpler periodic writes or checks). A backoff of 10s in writeGCSizeWorker
-ensures that no frequent batch writes are made. Saving the storedGCSize on
-database Close function ensures that in-memory gcSize is persisted when database
-is closed.
-
-This persistence must be resilient to failures like panics. For this purpose, a
-collection of hashes that are added to the garbage collection index, but still
-not persisted to storedGCSize, must be tracked to count them in when DB is
-constructed again with New function after the failure (swarm node restarts). On
-every batch write that adds a new item to garbage collection index, the same
-hash is added to gcUncountedHashesIndex. This ensures that there is a persisted
-information which hashes were added to the garbage collection index. But, when
-the storedGCSize is saved by writeGCSize function, this values are removed in
-the same batch in which storedGCSize is changed to ensure consistency. When the
-panic happen, or database Close method is not saved. The database storage
-contains all information to reliably and efficiently get the correct number of
-items in garbage collection index. This is performed in the New function when
-all hashes in gcUncountedHashesIndex are counted, added to the storedGCSize and
-saved to the disk before the database is constructed again. Index
-gcUncountedHashesIndex is acting as dirty bit for recovery that provides
-information what needs to be corrected. With a simple dirty bit, the whole
-garbage collection index should me counted on recovery instead only the items in
-gcUncountedHashesIndex. Because of the triggering mechanizm of writeGCSizeWorker
-and relatively short backoff time, the number of hashes in
-gcUncountedHashesIndex should be low and it should take a very short time to
-recover from the previous failure. If there was no failure and
-gcUncountedHashesIndex is empty, which is the usual case, New function will take
-the minimal time to return.
-*/
-
 package localstore

 import (
-	"time"
-
 	"github.com/ethereum/go-ethereum/log"
 	"github.com/ethereum/go-ethereum/swarm/shed"
 	"github.com/syndtr/goleveldb/leveldb"
@@ -109,7 +34,7 @@ var (
 	gcTargetRatio = 0.9
 	// gcBatchSize limits the number of chunks in a single
 	// leveldb batch on garbage collection.
-	gcBatchSize int64 = 1000
+	gcBatchSize uint64 = 1000
 )

 // collectGarbageWorker is a long running function that waits for
@@ -149,20 +74,21 @@ func (db *DB) collectGarbageWorker() {
 // is false, another call to this function is needed to collect
 // the rest of the garbage as the batch size limit is reached.
 // This function is called in collectGarbageWorker.
-func (db *DB) collectGarbage() (collectedCount int64, done bool, err error) {
+func (db *DB) collectGarbage() (collectedCount uint64, done bool, err error) {
 	batch := new(leveldb.Batch)
 	target := db.gcTarget()

+	// protect database from changing indexes and gcSize
+	db.batchMu.Lock()
+	defer db.batchMu.Unlock()
+
+	gcSize, err := db.gcSize.Get()
+	if err != nil {
+		return 0, true, err
+	}
+
 	done = true
 	err = db.gcIndex.Iterate(func(item shed.Item) (stop bool, err error) {
-		// protect parallel updates
-		unlock, err := db.lockAddr(item.Address)
-		if err != nil {
-			return false, err
-		}
-		defer unlock()
-
-		gcSize := db.getGCSize()
 		if gcSize-collectedCount <= target {
 			return true, nil
 		}
@@ -184,49 +110,19 @@ func (db *DB) collectGarbage() (collectedCount int64, done bool, err error) {
 		return 0, false, err
 	}

+	db.gcSize.PutInBatch(batch, gcSize-collectedCount)
+
 	err = db.shed.WriteBatch(batch)
 	if err != nil {
 		return 0, false, err
 	}
-	// batch is written, decrement gcSize
-	db.incGCSize(-collectedCount)
 	return collectedCount, done, nil
 }

 // gcTrigger retruns the absolute value for garbage collection
 // target value, calculated from db.capacity and gcTargetRatio.
-func (db *DB) gcTarget() (target int64) {
-	return int64(float64(db.capacity) * gcTargetRatio)
-}
-
-// incGCSize increments gcSize by the provided number.
-// If count is negative, it will decrement gcSize.
-func (db *DB) incGCSize(count int64) {
-	if count == 0 {
-		return
-	}
-
-	db.gcSizeMu.Lock()
-	new := db.gcSize + count
-	db.gcSize = new
-	db.gcSizeMu.Unlock()
-
-	select {
-	case db.writeGCSizeTrigger <- struct{}{}:
-	default:
-	}
-	if new >= db.capacity {
-		db.triggerGarbageCollection()
-	}
-}
-
-// getGCSize returns gcSize value by locking it
-// with gcSizeMu mutex.
-func (db *DB) getGCSize() (count int64) {
-	db.gcSizeMu.RLock()
-	count = db.gcSize
-	db.gcSizeMu.RUnlock()
-	return count
+func (db *DB) gcTarget() (target uint64) {
+	return uint64(float64(db.capacity) * gcTargetRatio)
 }

 // triggerGarbageCollection signals collectGarbageWorker
@@ -239,68 +135,41 @@ func (db *DB) triggerGarbageCollection() {
 	}
 }

-// writeGCSizeWorker writes gcSize on trigger event
-// and waits writeGCSizeDelay after each write.
-// It implements a linear backoff with delay of
-// writeGCSizeDelay duration to avoid very frequent
-// database operations.
-func (db *DB) writeGCSizeWorker() {
-	defer close(db.writeGCSizeWorkerDone)
+// incGCSizeInBatch changes gcSize field value
+// by change which can be negative. This function
+// must be called under batchMu lock.
+func (db *DB) incGCSizeInBatch(batch *leveldb.Batch, change int64) (err error) {
+	if change == 0 {
+		return nil
+	}
+	gcSize, err := db.gcSize.Get()
+	if err != nil {
+		return err
+	}

-	for {
-		select {
-		case <-db.writeGCSizeTrigger:
-			err := db.writeGCSize(db.getGCSize())
-			if err != nil {
-				log.Error("localstore write gc size", "err", err)
-			}
-			// Wait some time before writing gc size in the next
-			// iteration. This prevents frequent I/O operations.
-			select {
-			case <-time.After(10 * time.Second):
-			case <-db.close:
-				return
-			}
-		case <-db.close:
-			return
+	var new uint64
+	if change > 0 {
+		new = gcSize + uint64(change)
+	} else {
+		// 'change' is an int64 and is negative
+		// a conversion is needed with correct sign
+		c := uint64(-change)
+		if c > gcSize {
+			// protect uint64 underflow
+			return nil
 		}
+		new = gcSize - c
 	}
-}
-
-// writeGCSize stores the number of items in gcIndex.
-// It removes all hashes from gcUncountedHashesIndex
-// not to include them on the next DB initialization
-// (New function) when gcSize is counted.
-func (db *DB) writeGCSize(gcSize int64) (err error) {
-	const maxBatchSize = 1000
-
-	batch := new(leveldb.Batch)
-	db.storedGCSize.PutInBatch(batch, uint64(gcSize))
-	batchSize := 1
+	db.gcSize.PutInBatch(batch, new)

-	// use only one iterator as it acquires its snapshot
-	// not to remove hashes from index that are added
-	// after stored gc size is written
-	err = db.gcUncountedHashesIndex.Iterate(func(item shed.Item) (stop bool, err error) {
-		db.gcUncountedHashesIndex.DeleteInBatch(batch, item)
-		batchSize++
-		if batchSize >= maxBatchSize {
-			err = db.shed.WriteBatch(batch)
-			if err != nil {
-				return false, err
-			}
-			batch.Reset()
-			batchSize = 0
-		}
-		return false, nil
-	}, nil)
-	if err != nil {
-		return err
+	// trigger garbage collection if we reached the capacity
+	if new >= db.capacity {
+		db.triggerGarbageCollection()
 	}
-	return db.shed.WriteBatch(batch)
+	return nil
 }

 // testHookCollectGarbage is a hook that can provide
 // information when a garbage collection run is done
 // and how many items it removed.
-var testHookCollectGarbage func(collectedCount int64)
+var testHookCollectGarbage func(collectedCount uint64)