
Linux kernel memory allocation

The big picture... (following only the path that allocates pages for an initial slab)


kmem_cache_alloc() -> 

slab_alloc() ->

__slab_alloc() ->

new_slab_objects() ->

new_slab() ->

allocate_slab() ->


// The page order is decided here (s->oo).

static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	struct page *page;
	struct kmem_cache_order_objects oo = s->oo;
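For reference, s->oo packs both the slab's page order and its object count into a single word; mm/slub.c unpacks it with small helpers roughly like the ones below (a sketch from memory of the 3.x sources, not a verbatim quote):

#define OO_SHIFT	16
#define OO_MASK		((1 << OO_SHIFT) - 1)

struct kmem_cache_order_objects {
	unsigned long x;
};

/* upper bits: the page order used for one slab */
static inline int oo_order(struct kmem_cache_order_objects x)
{
	return x.x >> OO_SHIFT;
}

/* lower bits: how many objects fit in such a slab */
static inline int oo_objects(struct kmem_cache_order_objects x)
{
	return x.x & OO_MASK;
}

Continuing down the call chain from allocate_slab():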


alloc_slab_page() ->

alloc_pages() -> (this is a macro; from here the flow crosses over into mm/page_alloc.c)

Following it down, this ends up calling __alloc_pages_nodemask below (in the non-NUMA case).
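For the non-NUMA case the macro chain in include/linux/gfp.h looks roughly like this (a sketch, not the exact 3.8.1 text):

#define alloc_pages(gfp_mask, order) \
		alloc_pages_node(numa_node_id(), gfp_mask, order)

static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
						unsigned int order)
{
	/* unknown node means the current node */
	if (nid < 0)
		nid = numa_node_id();

	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}

static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist)
{
	return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}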


/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
			struct zonelist *zonelist, nodemask_t *nodemask)
{
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	struct zone *preferred_zone;
	struct page *page = NULL;
	int migratetype = allocflags_to_migratetype(gfp_mask);


... gfp_zone() picks the zone index from the GFP flags, and the migrate type is also decided here.
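The migrate type comes purely from the __GFP_MOVABLE / __GFP_RECLAIMABLE bits (gfp_zone() likewise maps the __GFP_DMA / __GFP_DMA32 / __GFP_HIGHMEM / __GFP_MOVABLE bits to a zone index through GFP_ZONE_TABLE). allocflags_to_migratetype() is roughly the following sketch:

static inline int allocflags_to_migratetype(gfp_t gfp_flags)
{
	if (unlikely(page_group_by_mobility_disabled))
		return MIGRATE_UNMOVABLE;

	/* group by mobility: 0 = UNMOVABLE, 1 = RECLAIMABLE, 2 = MOVABLE */
	return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
		((gfp_flags & __GFP_RECLAIMABLE) != 0);
}

A plain GFP_KERNEL slab allocation sets neither bit, so it lands on the MIGRATE_UNMOVABLE free lists.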


	/* First allocation attempt */
	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
			zonelist, high_zoneidx, alloc_flags,
			preferred_zone, migratetype);
	if (unlikely(!page))
		page = __alloc_pages_slowpath(gfp_mask, order,
				zonelist, high_zoneidx, nodemask,
				preferred_zone, migratetype);



A page is either pulled straight off a particular zone's free lists here, or fetched in __alloc_pages_slowpath.


__alloc_pages_nodemask calls get_page_from_freelist:


get_page_from_freelist() ->

	...
try_this_zone:
		page = buffered_rmqueue(preferred_zone, zone, order,
						gfp_mask, migratetype);
		if (page)
			break;
	...

It walks the zonelist and, as above, pulls a page out via buffered_rmqueue.
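The loop around try_this_zone is the usual zonelist walk plus watermark checks; heavily abridged it is something like this sketch (classzone_idx is zone_idx(preferred_zone) in the real code):

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
						high_zoneidx, nodemask) {
		unsigned long mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];

		/* zones that would fall below their watermark are skipped
		 * (after optionally trying zone_reclaim()) */
		if (!zone_watermark_ok(zone, order, mark,
				classzone_idx, alloc_flags))
			continue;

try_this_zone:
		page = buffered_rmqueue(preferred_zone, zone, order,
						gfp_mask, migratetype);
		if (page)
			break;
	}

buffered_rmqueue() itself: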


struct page *buffered_rmqueue(struct zone *preferred_zone,
			struct zone *zone, int order, gfp_t gfp_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;
	int cold = !!(gfp_flags & __GFP_COLD);

again:
	if (likely(order == 0)) {
		struct per_cpu_pages *pcp;
		struct list_head *list;

		local_irq_save(flags);
		pcp = &this_cpu_ptr(zone->pageset)->pcp;
		list = &pcp->lists[migratetype];
		if (list_empty(list)) {
			pcp->count += rmqueue_bulk(zone, 0,
					pcp->batch, list,
					migratetype, cold);	// rmqueue!
			if (unlikely(list_empty(list)))
				goto failed;
		}

		if (cold)
			page = list_entry(list->prev, struct page, lru);
		else
			page = list_entry(list->next, struct page, lru);

		list_del(&page->lru);
		pcp->count--;
	} else {
		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
			/*
			 * __GFP_NOFAIL is not to be used in new code.
			 *
			 * All __GFP_NOFAIL callers should be fixed so that they
			 * properly detect and handle allocation failures.
			 *
			 * We most definitely don't want callers attempting to
			 * allocate greater than order-1 page units with
			 * __GFP_NOFAIL.
			 */
			WARN_ON_ONCE(order > 1);
		}
		spin_lock_irqsave(&zone->lock, flags);
		page = __rmqueue(zone, order, migratetype);	// rmqueue!
		spin_unlock(&zone->lock);
		if (!page)
			goto failed;


For order 0 a single page is taken off the per-CPU list, which rmqueue_bulk refills in batches;
for order >= 1, __rmqueue takes the requested 2^order pages directly from the zone's free lists.
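rmqueue_bulk() is essentially a loop over __rmqueue() under zone->lock that moves up to pcp->batch pages onto the per-CPU list in one go; the shape is roughly this sketch:

static int rmqueue_bulk(struct zone *zone, unsigned int order,
			unsigned long count, struct list_head *list,
			int migratetype, int cold)
{
	int i;

	spin_lock(&zone->lock);
	for (i = 0; i < count; ++i) {
		struct page *page = __rmqueue(zone, order, migratetype);

		if (unlikely(page == NULL))
			break;

		/* hot pages go to the front of the pcp list, cold to the back */
		if (likely(!cold))
			list_add(&page->lru, list);
		else
			list_add_tail(&page->lru, list);
	}
	spin_unlock(&zone->lock);
	return i;		/* number of pages actually pulled */
}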



Following __rmqueue down, it keeps trying to pull something out via __rmqueue_smallest, as shown below (falling back to __rmqueue_fallback if that fails).


/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
						int migratetype)
{
	struct page *page;

retry_reserve:
	page = __rmqueue_smallest(zone, order, migratetype);

	if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
		page = __rmqueue_fallback(zone, order, migratetype);



__rmqueue_smallest is shown below.
Presumably, when the slab is being allocated for the first time but free pages are still plentiful, the flow ends up here.
I planted a few debug messages and ran a quick test, as below.


/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area * area;
	struct page *page;

	// KIMON DEBUG (note: %p for the zone pointer)
	printk("zone:%p, zonename:%s, order:%x, migratetype:%x\n",
			zone, zone->name, order, migratetype);

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {

		// KIMON DEBUG
		printk("current order : %d\n", current_order);

		area = &(zone->free_area[current_order]);
		if (list_empty(&area->free_list[migratetype]))
			continue;

		// think through..!
		page = list_entry(area->free_list[migratetype].next,
							struct page, lru);
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		expand(zone, page, order, current_order, area, migratetype);
		return page;
	}

	return NULL;
}



The result: the zone name printed is always Normal, so everything is being allocated from ZONE_NORMAL...
Most requests are order 0, but current_order sometimes climbs up to about 4~5.
Does it really make sense that an order-0 request so often fails to find a free page at the low orders...? Something seems off.
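For what it's worth, current_order climbing above the requested order just means the free lists below that order were empty for that migratetype, so the loop walks up to the first non-empty order and expand() splits the larger block, handing the unused buddy halves back to the lower-order lists. expand() is roughly (sketch, not verbatim):

static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;
		high--;
		size >>= 1;
		/* give the upper half back as a free block of the next lower order */
		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}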



Incidentally, zones are arch dependent, so the related code turned out to live in init.c under arch/x86's mm directory...
It seems the code that sets up the zone sizes and types at boot runs in an arch-dependent way, all the page structs are laid out up front, and the memory allocator simply picks them up later...


root@ubuntu:/var/www/work/kimon/linux-3.8.1# grep -R 'zone_sizes_init' * -n

arch/m32r/mm/discontig.c:132:unsigned long __init zone_sizes_init(void)

arch/m32r/mm/init.c:63:unsigned long __init zone_sizes_init(void)

arch/m32r/mm/init.c:93:extern unsigned long zone_sizes_init(void);

arch/m32r/mm/init.c:111: hole_pages = zone_sizes_init();

arch/openrisc/mm/init.c:51:static void __init zone_sizes_init(void)

arch/openrisc/mm/init.c:155: zone_sizes_init();

arch/arm64/mm/init.c:72:static void __init zone_sizes_init(unsigned long min, unsigned long max)

arch/arm64/mm/init.c:194: zone_sizes_init(min, max);

arch/tile/kernel/setup.c:660:static void __init zone_sizes_init(void)

arch/tile/kernel/setup.c:1395: zone_sizes_init();

arch/x86/include/asm/init.h:8:extern void __init zone_sizes_init(void);

arch/x86/mm/init.c:407:void __init zone_sizes_init(void)

arch/x86/mm/init_32.c:697: zone_sizes_init();

arch/x86/mm/init_64.c:637: zone_sizes_init();

^C

root@ubuntu:/var/www/work/kimon/linux-3.8.1# 

root@ubuntu:/var/www/work/kimon/linux-3.8.1# grep -R 'MAX_DMA32_PFN' * -n

arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h:88:#define MAX_DMA32_PFN (((1ULL << 32) - (1ULL << 28)) >> PAGE_SHIFT)

arch/mips/include/asm/dma.h:94:#ifndef MAX_DMA32_PFN

arch/mips/include/asm/dma.h:95:#define MAX_DMA32_PFN (1UL << (32 - PAGE_SHIFT))

arch/mips/mm/init.c:343: max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;

arch/arm64/mm/init.c:70:#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)

arch/arm64/mm/init.c:82: max_dma32 = max(min, min(max, MAX_DMA32_PFN));

arch/x86/kernel/pci-swiotlb.c:81: if (!no_iommu && max_pfn > MAX_DMA32_PFN)

arch/x86/kernel/amd_gart_64.c:756:    (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||

arch/x86/kernel/amd_gart_64.c:759: if (max_pfn > MAX_DMA32_PFN) {

arch/x86/kernel/early-quirks.c:51: if ((max_pfn > MAX_DMA32_PFN ||  force_iommu) &&

arch/x86/kernel/aperture_64.c:422:    max_pfn > MAX_DMA32_PFN &&

arch/x86/kernel/aperture_64.c:463: } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||

arch/x86/include/asm/dma.h:76:#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)

arch/x86/mm/init.c:417: max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;

arch/x86/mm/numa_emulation.c:128: u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);

arch/x86/mm/numa_emulation.c:243: u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);

arch/x86/xen/pci-swiotlb-xen.c:66: if (max_pfn > MAX_DMA32_PFN)

drivers/iommu/amd_iommu.c:3173: if (unhandled && max_pfn > MAX_DMA32_PFN) {

mm/bootmem.c:787:#ifdef MAX_DMA32_PFN

mm/bootmem.c:793: /* update goal according ...MAX_DMA32_PFN */

mm/bootmem.c:796: if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&

mm/bootmem.c:797:    (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {

mm/bootmem.c:801: new_goal = MAX_DMA32_PFN << PAGE_SHIFT;




I stumbled on a function called zone_sizes_init and found that each architecture fills the zone configuration into a local array there, then hands it to the function below as a parameter.
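On x86 (arch/x86/mm/init.c, per the grep above) it looks roughly like this sketch; the exact #ifdef set depends on the kernel config:

void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));

#ifdef CONFIG_ZONE_DMA
	max_zone_pfns[ZONE_DMA]		= MAX_DMA_PFN;
#endif
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32]	= MAX_DMA32_PFN;
#endif
	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM]	= max_pfn;
#endif

	free_area_init_nodes(max_zone_pfns);
}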

/**
 * free_area_init_nodes - Initialise all pg_data_t and zone data
 * @max_zone_pfn: an array of max PFNs for each zone
 *
 * This will call free_area_init_node() for each active node in the system.
 * Using the page ranges provided by add_active_range(), the size of each
 * zone in each node and their holes is calculated. If the maximum PFN
 * between two adjacent zones match, it is assumed that the zone is empty.
 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
 * starts where the previous one ended. For example, ZONE_DMA32 starts
 * at arch_max_dma_pfn.
 */
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
	unsigned long start_pfn, end_pfn;
	int i, nid;

	/* Record where the zone boundaries are */
	memset(arch_zone_lowest_possible_pfn, 0,
				sizeof(arch_zone_lowest_possible_pfn));
	memset(arch_zone_highest_possible_pfn, 0,
				sizeof(arch_zone_highest_possible_pfn));
	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
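Following further down (a sketch of the call chain from memory, not verified line by line):

free_area_init_nodes() ->

free_area_init_node() ->    // once per node

alloc_node_mem_map() / free_area_init_core() ->

...which is where the node's struct page array and each zone's free_area lists get set up, consistent with the guess above that everything is laid out at boot and the allocators only consume it afterwards.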

