/* * Copyright 2016 Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include #include #include #include #include #include #include #include // Ignore shadowing warnings within this file, so includers can use -Wshadow. #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" namespace folly { namespace detail { template struct IndexedMemPoolRecycler; } /// Instances of IndexedMemPool dynamically allocate and then pool their /// element type (T), returning 4-byte integer indices that can be passed /// to the pool's operator[] method to access or obtain pointers to the /// actual elements. The memory backing items returned from the pool /// will always be readable, even if items have been returned to the pool. /// These two features are useful for lock-free algorithms. The indexing /// behavior makes it easy to build tagged pointer-like-things, since /// a large number of elements can be managed using fewer bits than a /// full pointer. The access-after-free behavior makes it safe to read /// from T-s even after they have been recycled, since it is guaranteed /// that the memory won't have been returned to the OS and unmapped /// (the algorithm must still use a mechanism to validate that the read /// was correct, but it doesn't have to worry about page faults), and if /// the elements use internal sequence numbers it can be guaranteed that /// there won't be an ABA match due to the element being overwritten with /// a different type that has the same bit pattern. /// /// IndexedMemPool has two object lifecycle strategies. The first /// is to construct objects when they are allocated from the pool and /// destroy them when they are recycled. In this mode allocIndex and /// allocElem have emplace-like semantics. In the second mode, objects /// are default-constructed the first time they are removed from the pool, /// and deleted when the pool itself is deleted. By default the first /// mode is used for non-trivial T, and the second is used for trivial T. /// /// IMPORTANT: Space for extra elements is allocated to account for those /// that are inaccessible because they are in other local lists, so the /// actual number of items that can be allocated ranges from capacity to /// capacity + (NumLocalLists_-1)*LocalListLimit_. This is important if /// you are trying to maximize the capacity of the pool while constraining /// the bit size of the resulting pointers, because the pointers will /// actually range up to the boosted capacity. See maxIndexForCapacity /// and capacityForMaxIndex. /// /// To avoid contention, NumLocalLists_ free lists of limited (less than /// or equal to LocalListLimit_) size are maintained, and each thread /// retrieves and returns entries from its associated local list. If the /// local list becomes too large then elements are placed in bulk in a /// global free list. This allows items to be efficiently recirculated /// from consumers to producers. AccessSpreader is used to access the /// local lists, so there is no performance advantage to having more /// local lists than L1 caches. /// /// The pool mmap-s the entire necessary address space when the pool is /// constructed, but delays element construction. This means that only /// elements that are actually returned to the caller get paged into the /// process's resident set (RSS). template class Atom = std::atomic, bool EagerRecycleWhenTrivial = false, bool EagerRecycleWhenNotTrivial = true> struct IndexedMemPool : boost::noncopyable { typedef T value_type; typedef std::unique_ptr> UniquePtr; static_assert(LocalListLimit_ <= 255, "LocalListLimit must fit in 8 bits"); enum { NumLocalLists = NumLocalLists_, LocalListLimit = LocalListLimit_ }; static constexpr bool eagerRecycle() { return std::is_trivial::value ? EagerRecycleWhenTrivial : EagerRecycleWhenNotTrivial; } // these are public because clients may need to reason about the number // of bits required to hold indices from a pool, given its capacity static constexpr uint32_t maxIndexForCapacity(uint32_t capacity) { // index of uint32_t(-1) == UINT32_MAX is reserved for isAllocated tracking return std::min(uint64_t(capacity) + (NumLocalLists - 1) * LocalListLimit, uint64_t(uint32_t(-1) - 1)); } static constexpr uint32_t capacityForMaxIndex(uint32_t maxIndex) { return maxIndex - (NumLocalLists - 1) * LocalListLimit; } /// Constructs a pool that can allocate at least _capacity_ elements, /// even if all the local lists are full explicit IndexedMemPool(uint32_t capacity) : actualCapacity_(maxIndexForCapacity(capacity)) , size_(0) , globalHead_(TaggedPtr{}) { const size_t needed = sizeof(Slot) * (actualCapacity_ + 1); size_t pagesize = sysconf(_SC_PAGESIZE); mmapLength_ = ((needed - 1) & ~(pagesize - 1)) + pagesize; assert(needed <= mmapLength_ && mmapLength_ < needed + pagesize); assert((mmapLength_ % pagesize) == 0); slots_ = static_cast(mmap(nullptr, mmapLength_, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); if (slots_ == MAP_FAILED) { assert(errno == ENOMEM); throw std::bad_alloc(); } } /// Destroys all of the contained elements ~IndexedMemPool() { if (!eagerRecycle()) { for (size_t i = size_; i > 0; --i) { slots_[i].~Slot(); } } munmap(slots_, mmapLength_); } /// Returns a lower bound on the number of elements that may be /// simultaneously allocated and not yet recycled. Because of the /// local lists it is possible that more elements than this are returned /// successfully size_t capacity() { return capacityForMaxIndex(actualCapacity_); } /// Finds a slot with a non-zero index, emplaces a T there if we're /// using the eager recycle lifecycle mode, and returns the index, /// or returns 0 if no elements are available. template uint32_t allocIndex(Args&&... args) { static_assert(sizeof...(Args) == 0 || eagerRecycle(), "emplace-style allocation requires eager recycle, " "which is defaulted only for non-trivial types"); auto idx = localPop(localHead()); if (idx != 0 && eagerRecycle()) { T* ptr = &slot(idx).elem; new (ptr) T(std::forward(args)...); } return idx; } /// If an element is available, returns a std::unique_ptr to it that will /// recycle the element to the pool when it is reclaimed, otherwise returns /// a null (falsy) std::unique_ptr template UniquePtr allocElem(Args&&... args) { auto idx = allocIndex(std::forward(args)...); T* ptr = idx == 0 ? nullptr : &slot(idx).elem; return UniquePtr(ptr, typename UniquePtr::deleter_type(this)); } /// Gives up ownership previously granted by alloc() void recycleIndex(uint32_t idx) { assert(isAllocated(idx)); if (eagerRecycle()) { slot(idx).elem.~T(); } localPush(localHead(), idx); } /// Provides access to the pooled element referenced by idx T& operator[](uint32_t idx) { return slot(idx).elem; } /// Provides access to the pooled element referenced by idx const T& operator[](uint32_t idx) const { return slot(idx).elem; } /// If elem == &pool[idx], then pool.locateElem(elem) == idx. Also, /// pool.locateElem(nullptr) == 0 uint32_t locateElem(const T* elem) const { if (!elem) { return 0; } static_assert(std::is_standard_layout::value, "offsetof needs POD"); auto slot = reinterpret_cast( reinterpret_cast(elem) - offsetof(Slot, elem)); auto rv = slot - slots_; // this assert also tests that rv is in range assert(elem == &(*this)[rv]); return rv; } /// Returns true iff idx has been alloc()ed and not recycleIndex()ed bool isAllocated(uint32_t idx) const { return slot(idx).localNext == uint32_t(-1); } private: ///////////// types struct Slot { T elem; uint32_t localNext; uint32_t globalNext; Slot() : localNext{}, globalNext{} {} }; struct TaggedPtr { uint32_t idx; // size is bottom 8 bits, tag in top 24. g++'s code generation for // bitfields seems to depend on the phase of the moon, plus we can // do better because we can rely on other checks to avoid masking uint32_t tagAndSize; enum : uint32_t { SizeBits = 8, SizeMask = (1U << SizeBits) - 1, TagIncr = 1U << SizeBits, }; uint32_t size() const { return tagAndSize & SizeMask; } TaggedPtr withSize(uint32_t repl) const { assert(repl <= LocalListLimit); return TaggedPtr{ idx, (tagAndSize & ~SizeMask) | repl }; } TaggedPtr withSizeIncr() const { assert(size() < LocalListLimit); return TaggedPtr{ idx, tagAndSize + 1 }; } TaggedPtr withSizeDecr() const { assert(size() > 0); return TaggedPtr{ idx, tagAndSize - 1 }; } TaggedPtr withIdx(uint32_t repl) const { return TaggedPtr{ repl, tagAndSize + TagIncr }; } TaggedPtr withEmpty() const { return withIdx(0).withSize(0); } }; struct FOLLY_ALIGN_TO_AVOID_FALSE_SHARING LocalList { AtomicStruct head; LocalList() : head(TaggedPtr{}) {} }; ////////// fields /// the actual number of slots that we will allocate, to guarantee /// that we will satisfy the capacity requested at construction time. /// They will be numbered 1..actualCapacity_ (note the 1-based counting), /// and occupy slots_[1..actualCapacity_]. size_t actualCapacity_; /// the number of bytes allocated from mmap, which is a multiple of /// the page size of the machine size_t mmapLength_; /// this records the number of slots that have actually been constructed. /// To allow use of atomic ++ instead of CAS, we let this overflow. /// The actual number of constructed elements is min(actualCapacity_, /// size_) Atom size_; /// raw storage, only 1..min(size_,actualCapacity_) (inclusive) are /// actually constructed. Note that slots_[0] is not constructed or used FOLLY_ALIGN_TO_AVOID_FALSE_SHARING Slot* slots_; /// use AccessSpreader to find your list. We use stripes instead of /// thread-local to avoid the need to grow or shrink on thread start /// or join. These are heads of lists chained with localNext LocalList local_[NumLocalLists]; /// this is the head of a list of node chained by globalNext, that are /// themselves each the head of a list chained by localNext FOLLY_ALIGN_TO_AVOID_FALSE_SHARING AtomicStruct globalHead_; ///////////// private methods size_t slotIndex(uint32_t idx) const { assert(0 < idx && idx <= actualCapacity_ && idx <= size_.load(std::memory_order_acquire)); return idx; } Slot& slot(uint32_t idx) { return slots_[slotIndex(idx)]; } const Slot& slot(uint32_t idx) const { return slots_[slotIndex(idx)]; } // localHead references a full list chained by localNext. s should // reference slot(localHead), it is passed as a micro-optimization void globalPush(Slot& s, uint32_t localHead) { while (true) { TaggedPtr gh = globalHead_.load(std::memory_order_acquire); s.globalNext = gh.idx; if (globalHead_.compare_exchange_strong(gh, gh.withIdx(localHead))) { // success return; } } } // idx references a single node void localPush(AtomicStruct& head, uint32_t idx) { Slot& s = slot(idx); TaggedPtr h = head.load(std::memory_order_acquire); while (true) { s.localNext = h.idx; if (h.size() == LocalListLimit) { // push will overflow local list, steal it instead if (head.compare_exchange_strong(h, h.withEmpty())) { // steal was successful, put everything in the global list globalPush(s, idx); return; } } else { // local list has space if (head.compare_exchange_strong(h, h.withIdx(idx).withSizeIncr())) { // success return; } } // h was updated by failing CAS } } // returns 0 if empty uint32_t globalPop() { while (true) { TaggedPtr gh = globalHead_.load(std::memory_order_acquire); if (gh.idx == 0 || globalHead_.compare_exchange_strong( gh, gh.withIdx(slot(gh.idx).globalNext))) { // global list is empty, or pop was successful return gh.idx; } } } // returns 0 if allocation failed uint32_t localPop(AtomicStruct& head) { while (true) { TaggedPtr h = head.load(std::memory_order_acquire); if (h.idx != 0) { // local list is non-empty, try to pop Slot& s = slot(h.idx); if (head.compare_exchange_strong( h, h.withIdx(s.localNext).withSizeDecr())) { // success s.localNext = uint32_t(-1); return h.idx; } continue; } uint32_t idx = globalPop(); if (idx == 0) { // global list is empty, allocate and construct new slot if (size_.load(std::memory_order_relaxed) >= actualCapacity_ || (idx = ++size_) > actualCapacity_) { // allocation failed return 0; } // default-construct it now if we aren't going to construct and // destroy on each allocation if (!eagerRecycle()) { T* ptr = &slot(idx).elem; new (ptr) T(); } slot(idx).localNext = uint32_t(-1); return idx; } Slot& s = slot(idx); if (head.compare_exchange_strong( h, h.withIdx(s.localNext).withSize(LocalListLimit))) { // global list moved to local list, keep head for us s.localNext = uint32_t(-1); return idx; } // local bulk push failed, return idx to the global list and try again globalPush(s, idx); } } AtomicStruct& localHead() { auto stripe = detail::AccessSpreader::current(NumLocalLists); return local_[stripe].head; } }; namespace detail { /// This is a stateful Deleter functor, which allows std::unique_ptr /// to track elements allocated from an IndexedMemPool by tracking the /// associated pool. See IndexedMemPool::allocElem. template struct IndexedMemPoolRecycler { Pool* pool; explicit IndexedMemPoolRecycler(Pool* pool) : pool(pool) {} IndexedMemPoolRecycler(const IndexedMemPoolRecycler& rhs) = default; IndexedMemPoolRecycler& operator= (const IndexedMemPoolRecycler& rhs) = default; void operator()(typename Pool::value_type* elem) const { pool->recycleIndex(pool->locateElem(elem)); } }; } } // namespace folly # pragma GCC diagnostic pop