
Linux kernel memory allocation

The big picture... (following only the path that allocates pages for an initial slab)


kmem_cache_alloc() -> 

slab_alloc() ->

__slab_alloc() ->

new_slab_objects() ->

new_slab() ->

allocate_slab() ->


// The page order is decided here (s->oo).

static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	struct page *page;
	struct kmem_cache_order_objects oo = s->oo;
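For reference, s->oo packs both the slab's page order and its object count into a single word; mm/slub.c unpacks it with small helpers roughly like the ones below (a sketch from memory of the 3.x sources, not a verbatim quote):

#define OO_SHIFT	16
#define OO_MASK		((1 << OO_SHIFT) - 1)

struct kmem_cache_order_objects {
	unsigned long x;
};

/* upper bits: the page order used for one slab */
static inline int oo_order(struct kmem_cache_order_objects x)
{
	return x.x >> OO_SHIFT;
}

/* lower bits: how many objects fit in such a slab */
static inline int oo_objects(struct kmem_cache_order_objects x)
{
	return x.x & OO_MASK;
}

Continuing down the call chain from allocate_slab():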


alloc_slab_page() ->

alloc_pages() -> (this is a macro; from here the flow crosses over into mm/page_alloc.c)

Following it down, this ends up calling __alloc_pages_nodemask below (in the non-NUMA case).
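For the non-NUMA case the macro chain in include/linux/gfp.h looks roughly like this (a sketch, not the exact 3.8.1 text):

#define alloc_pages(gfp_mask, order) \
		alloc_pages_node(numa_node_id(), gfp_mask, order)

static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
						unsigned int order)
{
	/* unknown node means the current node */
	if (nid < 0)
		nid = numa_node_id();

	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}

static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist)
{
	return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}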


/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
			struct zonelist *zonelist, nodemask_t *nodemask)
{
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	struct zone *preferred_zone;
	struct page *page = NULL;
	int migratetype = allocflags_to_migratetype(gfp_mask);


... gfp_zone() picks the zone index from the GFP flags, and the migrate type is also decided here.
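The migrate type comes purely from the __GFP_MOVABLE / __GFP_RECLAIMABLE bits (gfp_zone() likewise maps the __GFP_DMA / __GFP_DMA32 / __GFP_HIGHMEM / __GFP_MOVABLE bits to a zone index through GFP_ZONE_TABLE). allocflags_to_migratetype() is roughly the following sketch:

static inline int allocflags_to_migratetype(gfp_t gfp_flags)
{
	if (unlikely(page_group_by_mobility_disabled))
		return MIGRATE_UNMOVABLE;

	/* group by mobility: 0 = UNMOVABLE, 1 = RECLAIMABLE, 2 = MOVABLE */
	return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
		((gfp_flags & __GFP_RECLAIMABLE) != 0);
}

A plain GFP_KERNEL slab allocation sets neither bit, so it lands on the MIGRATE_UNMOVABLE free lists.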


	/* First allocation attempt */
	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
			zonelist, high_zoneidx, alloc_flags,
			preferred_zone, migratetype);
	if (unlikely(!page))
		page = __alloc_pages_slowpath(gfp_mask, order,
				zonelist, high_zoneidx, nodemask,
				preferred_zone, migratetype);



A page is either pulled straight off a particular zone's free lists here, or fetched in __alloc_pages_slowpath.


__alloc_pages_nodemask calls get_page_from_freelist:


get_page_from_freelist() ->

	...
try_this_zone:
		page = buffered_rmqueue(preferred_zone, zone, order,
						gfp_mask, migratetype);
		if (page)
			break;
	...

It walks the zonelist and, as above, pulls a page out via buffered_rmqueue.
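The loop around try_this_zone is the usual zonelist walk plus watermark checks; heavily abridged it is something like this sketch (classzone_idx is zone_idx(preferred_zone) in the real code):

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
						high_zoneidx, nodemask) {
		unsigned long mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];

		/* zones that would fall below their watermark are skipped
		 * (after optionally trying zone_reclaim()) */
		if (!zone_watermark_ok(zone, order, mark,
				classzone_idx, alloc_flags))
			continue;

try_this_zone:
		page = buffered_rmqueue(preferred_zone, zone, order,
						gfp_mask, migratetype);
		if (page)
			break;
	}

buffered_rmqueue() itself: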


struct page *buffered_rmqueue(struct zone *preferred_zone,
			struct zone *zone, int order, gfp_t gfp_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;
	int cold = !!(gfp_flags & __GFP_COLD);

again:
	if (likely(order == 0)) {
		struct per_cpu_pages *pcp;
		struct list_head *list;

		local_irq_save(flags);
		pcp = &this_cpu_ptr(zone->pageset)->pcp;
		list = &pcp->lists[migratetype];
		if (list_empty(list)) {
			pcp->count += rmqueue_bulk(zone, 0,
					pcp->batch, list,
					migratetype, cold);	// rmqueue!
			if (unlikely(list_empty(list)))
				goto failed;
		}

		if (cold)
			page = list_entry(list->prev, struct page, lru);
		else
			page = list_entry(list->next, struct page, lru);

		list_del(&page->lru);
		pcp->count--;
	} else {
		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
			/*
			 * __GFP_NOFAIL is not to be used in new code.
			 *
			 * All __GFP_NOFAIL callers should be fixed so that they
			 * properly detect and handle allocation failures.
			 *
			 * We most definitely don't want callers attempting to
			 * allocate greater than order-1 page units with
			 * __GFP_NOFAIL.
			 */
			WARN_ON_ONCE(order > 1);
		}
		spin_lock_irqsave(&zone->lock, flags);
		page = __rmqueue(zone, order, migratetype);	// rmqueue!
		spin_unlock(&zone->lock);
		if (!page)
			goto failed;


For order 0 a single page is taken off the per-CPU list, which rmqueue_bulk refills in batches;
for order >= 1, __rmqueue takes the requested 2^order pages directly from the zone's free lists.
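rmqueue_bulk() is essentially a loop over __rmqueue() under zone->lock that moves up to pcp->batch pages onto the per-CPU list in one go; the shape is roughly this sketch:

static int rmqueue_bulk(struct zone *zone, unsigned int order,
			unsigned long count, struct list_head *list,
			int migratetype, int cold)
{
	int i;

	spin_lock(&zone->lock);
	for (i = 0; i < count; ++i) {
		struct page *page = __rmqueue(zone, order, migratetype);

		if (unlikely(page == NULL))
			break;

		/* hot pages go to the front of the pcp list, cold to the back */
		if (likely(!cold))
			list_add(&page->lru, list);
		else
			list_add_tail(&page->lru, list);
	}
	spin_unlock(&zone->lock);
	return i;		/* number of pages actually pulled */
}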



Following __rmqueue down, it keeps trying to pull something out via __rmqueue_smallest, as shown below (falling back to __rmqueue_fallback if that fails).


/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
						int migratetype)
{
	struct page *page;

retry_reserve:
	page = __rmqueue_smallest(zone, order, migratetype);

	if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
		page = __rmqueue_fallback(zone, order, migratetype);



__rmqueue_smallest is shown below.
Presumably, when the slab is being allocated for the first time but free pages are still plentiful, the flow ends up here.
I planted a few debug messages and ran a quick test, as below.


/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area * area;
	struct page *page;

	// KIMON DEBUG (note: %p for the zone pointer)
	printk("zone:%p, zonename:%s, order:%x, migratetype:%x\n",
			zone, zone->name, order, migratetype);

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {

		// KIMON DEBUG
		printk("current order : %d\n", current_order);

		area = &(zone->free_area[current_order]);
		if (list_empty(&area->free_list[migratetype]))
			continue;

		// think through..!
		page = list_entry(area->free_list[migratetype].next,
							struct page, lru);
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		expand(zone, page, order, current_order, area, migratetype);
		return page;
	}

	return NULL;
}



The result: the zone name printed is always Normal, so everything is being allocated from ZONE_NORMAL...
Most requests are order 0, but current_order sometimes climbs up to about 4~5.
Does it really make sense that an order-0 request so often fails to find a free page at the low orders...? Something seems off.
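For what it's worth, current_order climbing above the requested order just means the free lists below that order were empty for that migratetype, so the loop walks up to the first non-empty order and expand() splits the larger block, handing the unused buddy halves back to the lower-order lists. expand() is roughly (sketch, not verbatim):

static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;
		high--;
		size >>= 1;
		/* give the upper half back as a free block of the next lower order */
		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}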



Incidentally, zones are arch dependent, so the related code turned out to live in init.c under arch/x86's mm directory...
It seems the code that sets up the zone sizes and types at boot runs in an arch-dependent way, all the page structs are laid out up front, and the memory allocator simply picks them up later...


root@ubuntu:/var/www/work/kimon/linux-3.8.1# grep -R 'zone_sizes_init' * -n

arch/m32r/mm/discontig.c:132:unsigned long __init zone_sizes_init(void)

arch/m32r/mm/init.c:63:unsigned long __init zone_sizes_init(void)

arch/m32r/mm/init.c:93:extern unsigned long zone_sizes_init(void);

arch/m32r/mm/init.c:111: hole_pages = zone_sizes_init();

arch/openrisc/mm/init.c:51:static void __init zone_sizes_init(void)

arch/openrisc/mm/init.c:155: zone_sizes_init();

arch/arm64/mm/init.c:72:static void __init zone_sizes_init(unsigned long min, unsigned long max)

arch/arm64/mm/init.c:194: zone_sizes_init(min, max);

arch/tile/kernel/setup.c:660:static void __init zone_sizes_init(void)

arch/tile/kernel/setup.c:1395: zone_sizes_init();

arch/x86/include/asm/init.h:8:extern void __init zone_sizes_init(void);

arch/x86/mm/init.c:407:void __init zone_sizes_init(void)

arch/x86/mm/init_32.c:697: zone_sizes_init();

arch/x86/mm/init_64.c:637: zone_sizes_init();

^C

root@ubuntu:/var/www/work/kimon/linux-3.8.1# 

root@ubuntu:/var/www/work/kimon/linux-3.8.1# grep -R 'MAX_DMA32_PFN' * -n

arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h:88:#define MAX_DMA32_PFN (((1ULL << 32) - (1ULL << 28)) >> PAGE_SHIFT)

arch/mips/include/asm/dma.h:94:#ifndef MAX_DMA32_PFN

arch/mips/include/asm/dma.h:95:#define MAX_DMA32_PFN (1UL << (32 - PAGE_SHIFT))

arch/mips/mm/init.c:343: max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;

arch/arm64/mm/init.c:70:#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)

arch/arm64/mm/init.c:82: max_dma32 = max(min, min(max, MAX_DMA32_PFN));

arch/x86/kernel/pci-swiotlb.c:81: if (!no_iommu && max_pfn > MAX_DMA32_PFN)

arch/x86/kernel/amd_gart_64.c:756:    (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||

arch/x86/kernel/amd_gart_64.c:759: if (max_pfn > MAX_DMA32_PFN) {

arch/x86/kernel/early-quirks.c:51: if ((max_pfn > MAX_DMA32_PFN ||  force_iommu) &&

arch/x86/kernel/aperture_64.c:422:    max_pfn > MAX_DMA32_PFN &&

arch/x86/kernel/aperture_64.c:463: } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||

arch/x86/include/asm/dma.h:76:#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)

arch/x86/mm/init.c:417: max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;

arch/x86/mm/numa_emulation.c:128: u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);

arch/x86/mm/numa_emulation.c:243: u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);

arch/x86/xen/pci-swiotlb-xen.c:66: if (max_pfn > MAX_DMA32_PFN)

drivers/iommu/amd_iommu.c:3173: if (unhandled && max_pfn > MAX_DMA32_PFN) {

mm/bootmem.c:787:#ifdef MAX_DMA32_PFN

mm/bootmem.c:793: /* update goal according ...MAX_DMA32_PFN */

mm/bootmem.c:796: if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&

mm/bootmem.c:797:    (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {

mm/bootmem.c:801: new_goal = MAX_DMA32_PFN << PAGE_SHIFT;




I stumbled on a function called zone_sizes_init and found that each architecture fills the zone configuration into a local array there, then hands it to the function below as a parameter.
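On x86 (arch/x86/mm/init.c, per the grep above) it looks roughly like this sketch; the exact #ifdef set depends on the kernel config:

void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));

#ifdef CONFIG_ZONE_DMA
	max_zone_pfns[ZONE_DMA]		= MAX_DMA_PFN;
#endif
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32]	= MAX_DMA32_PFN;
#endif
	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM]	= max_pfn;
#endif

	free_area_init_nodes(max_zone_pfns);
}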

/**
 * free_area_init_nodes - Initialise all pg_data_t and zone data
 * @max_zone_pfn: an array of max PFNs for each zone
 *
 * This will call free_area_init_node() for each active node in the system.
 * Using the page ranges provided by add_active_range(), the size of each
 * zone in each node and their holes is calculated. If the maximum PFN
 * between two adjacent zones match, it is assumed that the zone is empty.
 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
 * starts where the previous one ended. For example, ZONE_DMA32 starts
 * at arch_max_dma_pfn.
 */
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
	unsigned long start_pfn, end_pfn;
	int i, nid;

	/* Record where the zone boundaries are */
	memset(arch_zone_lowest_possible_pfn, 0,
				sizeof(arch_zone_lowest_possible_pfn));
	memset(arch_zone_highest_possible_pfn, 0,
				sizeof(arch_zone_highest_possible_pfn));
	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
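Following further down (a sketch of the call chain from memory, not verified line by line):

free_area_init_nodes() ->

free_area_init_node() ->    // once per node

alloc_node_mem_map() / free_area_init_core() ->

...which is where the node's struct page array and each zone's free_area lists get set up, consistent with the guess above that everything is laid out at boot and the allocators only consume it afterwards.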

