文章目录
内核版本:linux-2.4.22
1、初始化管理区
1.1、函数:setup_memory()
- 初始化低端内存PFN的起点和终点,高端内存PFN的起点和终点,以及系统最后一页的PFN。
- 初始化bootmem_data结构,并声明可能被引导内存分配器(boot memory allocator)用到的页面。
- 标记所有系统可用的页面为空闲,然后保留页面位图本身所占用的页面。
- 在配置了SMP或initrd镜像存在时,为它们保留页面。
// arch/i386/kernel/setup.c
/*
 * setup_memory - boot-time set-up of the low-memory boot allocator.
 *
 * Steps, in order:
 *  - compute start_pfn, the first whole page frame after _end;
 *  - call find_max_pfn() and find_max_low_pfn() to obtain the memory limits;
 *  - with CONFIG_HIGHMEM, set highstart_pfn/highend_pfn and print the
 *    amount of high memory;
 *  - initialise the bootmem allocator for [start_pfn, max_low_pfn) and
 *    register the low pages with it;
 *  - reserve the bootmem bitmap, physical page 0, and (depending on the
 *    configuration) pages for SMP, ACPI sleep, the SMP configuration
 *    and the initrd image.
 *
 * Returns max_low_pfn.
 */
static unsigned long __init setup_memory(void)
{
unsigned long bootmap_size, start_pfn, max_low_pfn;
/*
* partially used pages are not usable - thus
* we are rounding upwards:
*/
start_pfn = PFN_UP(__pa(&_end)); // first whole page frame after _end. NOTE(review): _end is presumably the symbol marking the END of the loaded kernel image - confirm against the linker script
find_max_pfn(); // presumably scans the e820 map for the highest usable PFN (max_pfn is read below) - helper not shown here
max_low_pfn = find_max_low_pfn(); // highest usable page frame of low memory (ZONE_NORMAL), going by the helper's name - helper not shown here
#ifdef CONFIG_HIGHMEM
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn) {
highstart_pfn = max_low_pfn;
}
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
#endif
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(max_low_pfn));
/*
* Initialize the boot-time allocator (with low memory only):
*/
bootmap_size = init_bootmem(start_pfn, max_low_pfn);
register_bootmem_low_pages(max_low_pfn);
/*
* Reserve the bootmem bitmap itself as well. We do this in two
* steps (first step was init_bootmem()) because this catches
* the (very unlikely) case of us accidentally initializing the
* bootmem allocator with an invalid RAM area.
*/
reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +
bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); // reserves everything from HIGH_MEMORY up to the end of the bootmem bitmap, i.e. the pages that hold the bitmap describing the pages
/*
* reserve physical page 0 - it's a special BIOS page on many boxes,
* enabling clean reboots, SMP operation, laptop functions.
*/
reserve_bootmem(0, PAGE_SIZE); // page 0 stays reserved (special BIOS page, see above)
#ifdef CONFIG_SMP
/*
* But first pinch a few for the stack/trampoline stuff
* FIXME: Don't need the extra page at 4K, but need to fix
* trampoline before removing it. (see the GDT stuff)
*/
reserve_bootmem(PAGE_SIZE, PAGE_SIZE); // one extra page at physical address PAGE_SIZE for the stack/trampoline code mentioned above
#endif
#ifdef CONFIG_ACPI_SLEEP
/*
* Reserve low memory region for sleep support.
*/
acpi_reserve_bootmem();
#endif
#ifdef CONFIG_X86_LOCAL_APIC
/*
* Find and reserve possible boot-time SMP configuration.
*/
find_smp_config();
#endif
#ifdef CONFIG_BLK_DEV_INITRD // initrd: a small filesystem image used while booting the system
if (LOADER_TYPE && INITRD_START) {
/* Only accept an initrd that lies entirely inside low memory. */
if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
reserve_bootmem(INITRD_START, INITRD_SIZE);
initrd_start =
INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
initrd_end = initrd_start+INITRD_SIZE;
}
else {
printk(KERN_ERR "initrd extends beyond end of memory "
"(0x%08lx > 0x%08lx)\ndisabling initrd\n",
INITRD_START + INITRD_SIZE,
max_low_pfn << PAGE_SHIFT);
initrd_start = 0;
}
}
#endif
return max_low_pfn; // upper PFN limit of low memory (ZONE_NORMAL)
}
1.2、函数:zone_sizes_init()
初始化各管理区的高层函数。该函数填充一个记录各管理区大小的数组,并把它传给free_area_init()。
// arch/i386/mm/init.c
/*
 * zone_sizes_init - fill in the per-zone size array and hand it to
 * free_area_init().
 *
 * zones_size[] is indexed by ZONE_DMA / ZONE_NORMAL / ZONE_HIGHMEM and
 * starts out as all zeroes. The values stored are differences of page
 * frame numbers.
 */
static void __init zone_sizes_init(void)
{
unsigned long zones_size[MAX_NR_ZONES] = {
0, 0, 0};
unsigned int max_dma, high, low;
/* Page frame number corresponding to MAX_DMA_ADDRESS. */
max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
low = max_low_pfn;
high = highend_pfn;
/* All of low memory lies below the DMA limit: it all goes to ZONE_DMA. */
if (low < max_dma)
zones_size[ZONE_DMA] = low;
else {
/* ZONE_DMA covers [0, max_dma), ZONE_NORMAL the rest of low memory. */
zones_size[ZONE_DMA] = max_dma;
zones_size[ZONE_NORMAL] = low - max_dma;
#ifdef CONFIG_HIGHMEM
/* Everything between the low-memory limit and highend_pfn. */
zones_size[ZONE_HIGHMEM] = high - low;
#endif
}
free_area_init(zones_size);
}
1.3、函数:free_area_init()
// mm/page_alloc.c
/*
 * free_area_init - thin wrapper around free_area_init_core().
 *
 * @zones_size: per-zone sizes, forwarded unchanged.
 *
 * Calls free_area_init_core() with node id 0, the contig_page_data node
 * and the address of the global mem_map. The remaining three arguments
 * (zone_start_paddr, zholes_size, lmem_map) are passed as 0.
 */
void __init free_area_init(unsigned long *zones_size)
{
free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
}
1.4、函数:free_area_init_node()
初始化系统中的每个pgdat。如果调用者希望针对特定体系结构优化mem_map的位置,可以自行分配mem_map的本地部分并作为参数传给这个函数;否则,mem_map[]的这一部分会由free_area_init_core()分配。
// mm/numa.c
/*
* Nodes can be initialized parallely, in no particular order.
*
* free_area_init_node - initialise one node (pgdat).
*
* @nid:              node id, stored in pgdat->node_id
* @pgdat:            node descriptor to initialise
* @pmap:             passed to free_area_init_core() as its lmem_map argument
* @zones_size:       per-zone sizes, MAX_NR_ZONES entries
* @zone_start_paddr: forwarded to free_area_init_core()
* @zholes_size:      forwarded to free_area_init_core()
*
* After free_area_init_core() returns, a zero-filled bitmap is allocated
* for pgdat->valid_addr_bitmap, sized from the sum of zones_size[].
*/
void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
unsigned long *zones_size, unsigned long zone_start_paddr,
unsigned long *zholes_size)
{
int i, size = 0;
struct page *discard; /* receives free_area_init_core()'s gmap output; never read here */
if (mem_map == (mem_map_t *)NULL) // if the global mem_map is still unset, point it at PAGE_OFFSET
mem_map = (mem_map_t *)PAGE_OFFSET;
free_area_init_core(nid, pgdat, &discard, zones_size, zone_start_paddr,
zholes_size, pmap);
pgdat->node_id = nid; // record the node id in the pgdat
/*
* Get space for the valid bitmap.
*/
for (i = 0; i < MAX_NR_ZONES; i++) // total size of the node: sum over all zones
size += zones_size[i];
size = LONG_ALIGN((size + 7) >> 3); // convert to bytes at one bit per counted unit (rounded up to whole bytes), then LONG_ALIGN
// Allocate the bitmap. NOTE(review): the original annotation says it marks valid areas of the node and is only really used on Sparc - not verifiable from this file.
pgdat->valid_addr_bitmap = (unsigned long *)alloc_bootmem_node(pgdat, size);
memset(pgdat->valid_addr_bitmap, 0, size); // every bit starts cleared. NOTE(review): the original annotation says Sparc's mem_init() sets the valid bits and other architectures ignore the bitmap - confirm
}
1.5、函数:free_area_init_core()
该函数负责初始化节点中的所有管理区,并分配节点的局部lmem_map。在UMA体系结构中,调用这个函数来初始化全局mem_map[];在NUMA体系结构中,mem_map[]被看作是一个稀疏分布的虚拟数组。
// mm/page_alloc.c
/*
* Set up the zone data structures:
* - mark all pages reserved
* - mark all memory queues empty
* - clear the memory bitmaps
*
* @nid:              node id; used in the printk and to index zone_table[]
* @pgdat:            node descriptor to fill in
* @gmap:             output: receives the address of the node's page array
* @zones_size:       per-zone sizes in pages, MAX_NR_ZONES entries
* @zone_start_paddr: physical start address of the first zone; must be
*                    page aligned, otherwise BUG() is called
* @zholes_size:      optional per-zone hole sizes (may be NULL); they are
*                    subtracted when computing the "real" sizes
* @lmem_map:         page array to use, or NULL to have one allocated here
*/
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
unsigned long *zones_size, unsigned long zone_start_paddr,
unsigned long *zholes_size, struct page *lmem_map)
{
unsigned long i, j;
unsigned long map_size;
unsigned long totalpages, offset, realtotalpages;
const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
if (zone_start_paddr & ~PAGE_MASK)
BUG();
/* totalpages: sum of all zone sizes, holes included. */
totalpages = 0;
for (i = 0; i < MAX_NR_ZONES; i++) {
unsigned long size = zones_size[i];
totalpages += size;
}
/* realtotalpages: the same sum with the holes subtracted. */
realtotalpages = totalpages;
if (zholes_size)
for (i = 0; i < MAX_NR_ZONES; i++)
realtotalpages -= zholes_size[i];
printk("On node %d totalpages: %lu\n", nid, realtotalpages);
上述这部分代码计算并记录节点部分信息。
/*
* Some architectures (with lots of mem and discontinous memory
* maps) have to search for a good mem_map area:
* For discontigmem, the conceptual mem map array starts from
* PAGE_OFFSET, we need to align the actual array onto a mem map
* boundary, so that MAP_NR works.
*/
map_size = (totalpages + 1)*sizeof(struct page); // bytes needed for the node's page array: one struct page per page plus one spare entry
if (lmem_map == (struct page *)0) {
lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
lmem_map = (struct page *)(PAGE_OFFSET +
MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET)); // MAP_ALIGN() is applied to the offset from PAGE_OFFSET; presumably it rounds to a struct-page boundary so that MAP_NR() works (see the comment above) - macro not shown here
}
*gmap = pgdat->node_mem_map = lmem_map;
pgdat->node_size = totalpages;
pgdat->node_start_paddr = zone_start_paddr;
pgdat->node_start_mapnr = (lmem_map - mem_map);
pgdat->nr_zones = 0;
/* Index of this node's first page within the global mem_map. */
offset = lmem_map - mem_map;
for (j = 0; j < MAX_NR_ZONES; j++) {
// initialise each zone_t of the node
zone_t *zone = pgdat->node_zones + j;
unsigned long mask;
unsigned long size, realsize;
zone_table[nid * MAX_NR_ZONES + j] = zone;
realsize = size = zones_size[j];
if (zholes_size)
realsize -= zholes_size[j];
printk("zone(%lu): %lu pages.\n", j, size);
zone->size = size;
zone->name = zone_names[j];
zone->lock = SPIN_LOCK_UNLOCKED;
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
zone->need_balance = 0;
/* Empty zone: only the basic fields above are set. */
if (!size)
continue;
循环初始化节点中的每一个zone_t
中一些基本字段的值。
/*
* The per-page waitqueue mechanism uses hashed waitqueues
* per zone.
*/
zone->wait_table_size = wait_table_size(size); // wait_table_size() picks the hash-table size for this zone. NOTE(review): the original annotation says the table never exceeds 4KB - helper not shown here
zone->wait_table_shift =
BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
zone->wait_table = (wait_queue_head_t *)
alloc_bootmem_node(pgdat, zone->wait_table_size
* sizeof(wait_queue_head_t));
for(i = 0; i < zone->wait_table_size; ++i)
init_waitqueue_head(zone->wait_table + i);
上述这部分代码初始化管理区的等待队列。
pgdat->nr_zones = j+1; // this zone is non-empty, so the node's zone count is extended to include it
// Watermarks: start from the zone's real size (holes excluded) divided by its balance ratio, then clamp to [zone_balance_min[j], zone_balance_max[j]].
mask = (realsize / zone_balance_ratio[j]); // the clamped value becomes pages_min
if (mask < zone_balance_min[j])
mask = zone_balance_min[j];
else if (mask > zone_balance_max[j])
mask = zone_balance_max[j];
zone->pages_min = mask;
zone->pages_low = mask*2;
zone->pages_high = mask*3;
zone->zone_mem_map = mem_map + offset;
zone->zone_start_mapnr = offset;
zone->zone_start_paddr = zone_start_paddr;
if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1)) // the zone's first PFN must be a multiple of 2^(MAX_ORDER-1) pages; per the original annotation this alignment is needed by the buddy allocator
printk("BUG: wrong zone alignment, it will crash\n");
上述部分代码计算管理区极值并记录管理区地址。
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
for (i = 0; i < size; i++) {
struct page *page = mem_map + offset + i;
set_page_zone(page, nid * MAX_NR_ZONES + j);
set_page_count(page, 0);
SetPageReserved(page);
INIT_LIST_HEAD(&page->list);
if (j != ZONE_HIGHMEM)
set_page_address(page, __va(zone_start_paddr));
zone_start_paddr += PAGE_SIZE; // advance to the next page's physical address; when the loop ends this is the start address used for the next zone
}
初始化时,管理区中所有的页面都标记为保留
,因为没有办法知道引导内存分配器使用了哪些页面。引导内存分配起在free_all_bootmem()
中回收时,未使用的页面中的PG_reserved
会被清除。
offset += size; // the next zone's pages follow directly after this zone's in mem_map
// Initialise the free list of every order and allocate a bitmap for every order except the last (MAX_ORDER-1), whose map is NULL.
for (i = 0; ; i++) {
unsigned long bitmap_size;
INIT_LIST_HEAD(&zone->free_area[i].free_list);
if (i == MAX_ORDER-1) {
zone->free_area[i].map = NULL;
break;
}
/*
* Page buddy system uses "index >> (i+1)",
* where "index" is at most "size-1".
*
* The extra "+3" is to round down to byte
* size (8 bits per byte assumption). Thus
* we get "(size-1) >> (i+4)" as the last byte
* we can access.
*
* The "+1" is because we want to round the
* byte allocation up rather than down. So
* we should have had a "+7" before we shifted
* down by three. Also, we have to add one as
* we actually _use_ the last bit (it's [0,n]
* inclusive, not [0,n[).
*
* So we actually had +7+1 before we shift
* down by 3. But (n+8) >> 3 == (n >> 3) + 1
* (modulo overflows, which we do not have).
*
* Finally, we LONG_ALIGN because all bitmap
* operations are on longs.
*/
bitmap_size = (size-1) >> (i+4); // index of the last bitmap byte needed for order i (derivation above); each bit stands for one pair of buddies of 2^i pages
bitmap_size = LONG_ALIGN(bitmap_size+1);
zone->free_area[i].map =
(unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
}
}
build_zonelists(pgdat); // per the original annotation: builds the node's zone fallback lists - helper not shown here
}
上述代码初始化管理区的空闲链表,并且分配位图,伙伴分配器用这些位图记录页面伙伴对的状态(the state of page buddies)。
综合图示
文中图示均为个人理解
参考文献:
[1] 白洛. 深入理解Linux虚拟内存管理. 2006-1
[2] Mel Gorman. Understanding the Linux Virtual Memory Manager. 2004-5-9