1、初始化管理区

1.1、函数：setup_memory()

初始化低端内存PFN的起点和终点，高端内存PFN的起点和终点，以及系统最后一页的PFN。
初始化bootmem_data结构以及声明可能被boot memory allocator用到的页面。
标记所有系统可用的页面为空闲，然后保留用于存放页面位图的那些页面。
在配置了SMP或initrd镜像存在时，为它们保留页面。

// arch/i386/kernel/setup.c
/*
 * setup_memory - boot-time physical memory setup for i386.
 *
 * Works out the first page frame above the kernel image (start_pfn), the
 * low-memory limit (max_low_pfn) and, with CONFIG_HIGHMEM, the
 * highstart_pfn/highend_pfn pair. It then initializes the boot memory
 * allocator over low memory only and reserves the ranges that must not be
 * handed out: the range ending at the bootmem bitmap, physical page 0,
 * and -- depending on the configuration -- one extra page for SMP, the
 * ACPI sleep region, the SMP configuration and the initrd image.
 *
 * Returns max_low_pfn.
 */
static unsigned long __init setup_memory(void)
{
   
	unsigned long bootmap_size, start_pfn, max_low_pfn;

	/*
	 * partially used pages are not usable - thus
	 * we are rounding upwards:
	 */
	/*
	 * NOTE(review): _end is presumably the linker symbol marking the end
	 * of the loaded kernel image, which makes start_pfn the first page
	 * frame above the kernel -- confirm against the linker script.
	 */
	start_pfn = PFN_UP(__pa(&_end));

	/* Original annotation: walks the e820 map to find the highest usable PFN. */
	find_max_pfn();

	/* Original annotation: highest usable page frame inside ZONE_NORMAL. */
	max_low_pfn = find_max_low_pfn();

#ifdef CONFIG_HIGHMEM
	/* Default to an empty high-memory range; widen it only if RAM exceeds low memory. */
	highstart_pfn = highend_pfn = max_pfn;
	if (max_pfn > max_low_pfn) {
   
		highstart_pfn = max_low_pfn;
	}
	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
		pages_to_mb(highend_pfn - highstart_pfn));
#endif
	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
			pages_to_mb(max_low_pfn));
	/*
	 * Initialize the boot-time allocator (with low memory only):
	 */
	bootmap_size = init_bootmem(start_pfn, max_low_pfn);

	register_bootmem_low_pages(max_low_pfn);

	/*
	 * Reserve the bootmem bitmap itself as well. We do this in two
	 * steps (first step was init_bootmem()) because this catches
	 * the (very unlikely) case of us accidentally initializing the
	 * bootmem allocator with an invalid RAM area.
	 */
	/*
	 * The reserved range starts at HIGH_MEMORY and ends at
	 * PFN_PHYS(start_pfn) + bootmap_size (the PAGE_SIZE-1 term is
	 * rounding slack), so it covers the pages holding the page bitmap.
	 */
	reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +
			 bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));

	/*
	 * reserve physical page 0 - it's a special BIOS page on many boxes,
	 * enabling clean reboots, SMP operation, laptop functions.
	 */
	reserve_bootmem(0, PAGE_SIZE);

#ifdef CONFIG_SMP
	/*
	 * But first pinch a few for the stack/trampoline stuff
	 * FIXME: Don't need the extra page at 4K, but need to fix
	 * trampoline before removing it. (see the GDT stuff)
	 */
	/* Reserves exactly one page, at physical address PAGE_SIZE. */
	reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
#endif
#ifdef CONFIG_ACPI_SLEEP
	/*
	 * Reserve low memory region for sleep support.
	 */
	acpi_reserve_bootmem();
#endif
#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Find and reserve possible boot-time SMP configuration.
	 */
	find_smp_config();
#endif
	/* Original annotation: initrd provides a small filesystem image used to boot the system. */
#ifdef CONFIG_BLK_DEV_INITRD
	if (LOADER_TYPE && INITRD_START) {
   
		/* The image is only usable if it lies entirely below the low-memory limit. */
		if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
   
			reserve_bootmem(INITRD_START, INITRD_SIZE);
			/* Record the image bounds offset by PAGE_OFFSET. */
			initrd_start =
				INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
			initrd_end = initrd_start+INITRD_SIZE;
		}
		else {
   
			printk(KERN_ERR "initrd extends beyond end of memory "
			    "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
			    INITRD_START + INITRD_SIZE,
			    max_low_pfn << PAGE_SHIFT);
			initrd_start = 0;
		}
	}
#endif

	/* The value obtained from find_max_low_pfn() above. */
	return max_low_pfn;
}

在这里插入图片描述

1.2、函数：zone_sizes_init()

初始化各管理区的高层函数。该函数填充一个记录管理区大小的数组，并把它传给free_area_init()。

// arch/i386/mm/init.c
/*
 * zone_sizes_init - fill in the per-zone page counts and pass them on.
 *
 * Splits the low-memory page frames between ZONE_DMA and ZONE_NORMAL at
 * the page frame number derived from MAX_DMA_ADDRESS, adds the high-memory
 * range when CONFIG_HIGHMEM is set, and hands the resulting array to
 * free_area_init().
 */
static void __init zone_sizes_init(void)
{
	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
	/* First page frame number that lies above MAX_DMA_ADDRESS. */
	unsigned int dma_limit = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
	unsigned int low_end = max_low_pfn;
	unsigned int high_end = highend_pfn;

	if (low_end >= dma_limit) {
		/* Low memory reaches past the DMA limit: split it in two. */
		zones_size[ZONE_DMA] = dma_limit;
		zones_size[ZONE_NORMAL] = low_end - dma_limit;
#ifdef CONFIG_HIGHMEM
		zones_size[ZONE_HIGHMEM] = high_end - low_end;
#endif
	} else {
		/* All of low memory fits below the DMA limit. */
		zones_size[ZONE_DMA] = low_end;
	}

	free_area_init(zones_size);
}

在这里插入图片描述

1.3、函数：free_area_init()

// mm/page_alloc.c
/*
 * free_area_init - initialize the zones of the single contiguous node.
 *
 * zones_size: per-zone page counts, indexed by zone number.
 *
 * Forwards to free_area_init_core() for node 0, using contig_page_data as
 * the node descriptor and the global mem_map as the map pointer to fill
 * in. The start address, hole sizes and local map arguments are all zero.
 */
void __init free_area_init(unsigned long *zones_size)
{
	pg_data_t *node = &contig_page_data;

	free_area_init_core(0, node, &mem_map, zones_size, 0, 0, 0);
}

在这里插入图片描述

1.4、函数：free_area_init_node()

初始化系统中的每个pgdat。如果希望在特定的体系结构中对它们的位置进行调优，调用者可以选择性地自行分配mem_map，并将其作为参数传递给这个函数；否则，mem_map[]部分会由free_area_init_core()分配。

// mm/numa.c
/*
 * free_area_init_node - initialize one node and its valid-address bitmap.
 *
 * nid:              node number, stored in pgdat->node_id
 * pgdat:            node descriptor to initialize
 * pmap:             caller-supplied page map, forwarded to
 *                   free_area_init_core() as its local map argument
 * zones_size:       per-zone page counts (MAX_NR_ZONES entries)
 * zone_start_paddr: physical start address, forwarded unchanged
 * zholes_size:      per-zone hole sizes, forwarded unchanged
 *
 * Nodes can be initialized parallely, in no particular order.
 */
void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
	unsigned long *zones_size, unsigned long zone_start_paddr,
	unsigned long *zholes_size)
{
	struct page *discard;
	int total_pages = 0;
	int bitmap_bytes;
	int zone;

	/* Point the global mem_map at PAGE_OFFSET if nobody has set it yet. */
	if (!mem_map)
		mem_map = (mem_map_t *)PAGE_OFFSET;

	free_area_init_core(nid, pgdat, &discard, zones_size, zone_start_paddr,
					zholes_size, pmap);
	pgdat->node_id = nid;

	/*
	 * Get space for the valid bitmap: add up the pages of every zone,
	 * then convert to the number of bytes needed for one bit per page,
	 * rounded up to a long boundary.
	 */
	for (zone = 0; zone < MAX_NR_ZONES; zone++)
		total_pages += zones_size[zone];
	bitmap_bytes = LONG_ALIGN((total_pages + 7) >> 3);

	/*
	 * The bitmap starts out all zero. Per the original annotation it is
	 * only consumed on Sparc, where mem_init() marks the valid areas.
	 */
	pgdat->valid_addr_bitmap = (unsigned long *)alloc_bootmem_node(pgdat, bitmap_bytes);
	memset(pgdat->valid_addr_bitmap, 0, bitmap_bytes);
}

在这里插入图片描述

1.5、函数：free_area_init_core()

该函数负责初始化所有的区域，并在节点中分配它们的局部lmem_map。在UMA体系结构中，调用这个函数初始化全局mem_map[]。在NUMA体系结构中，mem_map[]被看作是一个稀疏分布的虚拟数组。

// mm/page_alloc.c
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * nid:              node number; used in the log output and as the base
 *                   of this node's slots in zone_table[]
 * pgdat:            node descriptor whose fields and zones are filled in
 * gmap:             out-parameter; receives the node's page map pointer
 * zones_size:       page count of each zone (MAX_NR_ZONES entries)
 * zone_start_paddr: physical start address; must be page aligned or the
 *                   function hits BUG()
 * zholes_size:      optional per-zone hole sizes, may be NULL
 * lmem_map:         page map supplied by the caller, or 0 to have one
 *                   allocated here
 */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
	unsigned long *zones_size, unsigned long zone_start_paddr, 
	unsigned long *zholes_size, struct page *lmem_map)
{
   
	unsigned long i, j;
	unsigned long map_size;
	unsigned long totalpages, offset, realtotalpages;
	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);

	/* The start address must not have any bits set below the page size. */
	if (zone_start_paddr & ~PAGE_MASK)
		BUG();

	/* totalpages: sum of all zone sizes, holes included. */
	totalpages = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
   
		unsigned long size = zones_size[i];
		totalpages += size;
	}
	/* realtotalpages: the same sum with the holes subtracted. */
	realtotalpages = totalpages;
	if (zholes_size)
		for (i = 0; i < MAX_NR_ZONES; i++)
			realtotalpages -= zholes_size[i];
			
	printk("On node %d totalpages: %lu\n", nid, realtotalpages);

上述这部分代码计算并记录节点部分信息。

	/*
	 * Some architectures (with lots of mem and discontinous memory
	 * maps) have to search for a good mem_map area:
	 * For discontigmem, the conceptual mem map array starts from 
	 * PAGE_OFFSET, we need to align the actual array onto a mem map 
	 * boundary, so that MAP_NR works.
	 */
	/*
	 * Amount of memory needed for the page map.
	 * NOTE(review): the extra entry is presumably slack for the
	 * MAP_ALIGN() adjustment below -- confirm.
	 */
	map_size = (totalpages + 1)*sizeof(struct page);
	if (lmem_map == (struct page *)0) {
   
		/* No map was supplied: allocate one and align it with MAP_ALIGN(). */
		lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
		lmem_map = (struct page *)(PAGE_OFFSET + 
			MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
	}
	/* Publish the map through gmap and record the node-level fields. */
	*gmap = pgdat->node_mem_map = lmem_map;
	pgdat->node_size = totalpages;
	pgdat->node_start_paddr = zone_start_paddr;
	pgdat->node_start_mapnr = (lmem_map - mem_map);
	pgdat->nr_zones = 0;

	/* offset: index of the current zone's first page within mem_map. */
	offset = lmem_map - mem_map;	
	for (j = 0; j < MAX_NR_ZONES; j++) {
		/* Initialize each zone_t of the node. */
		zone_t *zone = pgdat->node_zones + j;
		unsigned long mask;
		unsigned long size, realsize;

		zone_table[nid * MAX_NR_ZONES + j] = zone;
		/* size includes holes; realsize excludes them. */
		realsize = size = zones_size[j];
		if (zholes_size)
			realsize -= zholes_size[j];

		printk("zone(%lu): %lu pages.\n", j, size);
		zone->size = size;
		zone->name = zone_names[j];
		zone->lock = SPIN_LOCK_UNLOCKED;
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;
		zone->need_balance = 0;
		/* An empty zone gets only the basic fields set above. */
		if (!size)
			continue;

循环初始化节点中的每一个zone_t中一些基本字段的值。
在这里插入图片描述

		/*
		 * The per-page waitqueue mechanism uses hashed waitqueues
		 * per zone.
		 */
		/*
		 * Table size comes from wait_table_size(). Original annotation:
		 * the hash table never exceeds 4KB (not verifiable from here).
		 */
		zone->wait_table_size = wait_table_size(size);
		zone->wait_table_shift =
			BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
		zone->wait_table = (wait_queue_head_t *)
			alloc_bootmem_node(pgdat, zone->wait_table_size
						* sizeof(wait_queue_head_t));

		/* Initialize every wait queue head in the table. */
		for(i = 0; i < zone->wait_table_size; ++i)
			init_waitqueue_head(zone->wait_table + i);

上述这部分代码初始化管理区的等待队列。
在这里插入图片描述

		/* A non-empty zone was found: the node now has j+1 zones. */
		pgdat->nr_zones = j+1;
		/*
		 * Watermarks: the hole-free zone size divided by the zone's
		 * balance ratio, clamped to the range
		 * [zone_balance_min[j], zone_balance_max[j]]. pages_min is that
		 * value, pages_low twice it and pages_high three times it.
		 */
		mask = (realsize / zone_balance_ratio[j]);
		if (mask < zone_balance_min[j])
			mask = zone_balance_min[j];
		else if (mask > zone_balance_max[j])
			mask = zone_balance_max[j];
		zone->pages_min = mask;
		zone->pages_low = mask*2;
		zone->pages_high = mask*3;

		/* Record where this zone starts in the map and in physical memory. */
		zone->zone_mem_map = mem_map + offset;
		zone->zone_start_mapnr = offset;
		zone->zone_start_paddr = zone_start_paddr;

		/*
		 * The zone's first page frame must be a multiple of
		 * 2^(MAX_ORDER-1) pages (original annotation: required by the
		 * buddy allocator). A violation is only reported, not stopped.
		 */
		if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
			printk("BUG: wrong zone alignment, it will crash\n");

上述部分代码计算管理区极值并记录管理区地址。
在这里插入图片描述

		/*
		 * Initially all pages are reserved - free ones are freed
		 * up by free_all_bootmem() once the early boot process is
		 * done. Non-atomic initialization, single-pass.
		 */
		for (i = 0; i < size; i++) {
   
			struct page *page = mem_map + offset + i;
			set_page_zone(page, nid * MAX_NR_ZONES + j);
			set_page_count(page, 0);
			SetPageReserved(page);
			INIT_LIST_HEAD(&page->list);
			/* Only pages outside ZONE_HIGHMEM get an address set here. */
			if (j != ZONE_HIGHMEM)
				set_page_address(page, __va(zone_start_paddr));
			/*
			 * Advance the running physical address by one page. It
			 * feeds the next page's set_page_address() call and,
			 * after the loop, the next zone's zone_start_paddr.
			 */
			zone_start_paddr += PAGE_SIZE;
		}

初始化时，管理区中所有的页面都标记为保留，因为没有办法知道引导内存分配器使用了哪些页面。引导内存分配起在free_all_bootmem()中回收时，未使用的页面中的PG_reserved会被清除。


		/* Step the map index past this zone's pages. */
		offset += size;
		/*
		 * Initialize the zone's free lists, one per order, and
		 * allocate a bitmap for every order except the highest.
		 * Original annotation: the buddy allocator uses these bitmaps
		 * to track the state of page buddies.
		 */
		for (i = 0; ; i++) {
   
			unsigned long bitmap_size;

			INIT_LIST_HEAD(&zone->free_area[i].free_list);
			/* The highest order gets no bitmap and ends the loop. */
			if (i == MAX_ORDER-1) {
   
				zone->free_area[i].map = NULL;
				break;
			}

			/*
			 * Page buddy system uses "index >> (i+1)",
			 * where "index" is at most "size-1".
			 *
			 * The extra "+3" is to round down to byte
			 * size (8 bits per byte assumption). Thus
			 * we get "(size-1) >> (i+4)" as the last byte
			 * we can access.
			 *
			 * The "+1" is because we want to round the
			 * byte allocation up rather than down. So
			 * we should have had a "+7" before we shifted
			 * down by three. Also, we have to add one as
			 * we actually _use_ the last bit (it's [0,n]
			 * inclusive, not [0,n[).
			 *
			 * So we actually had +7+1 before we shift
			 * down by 3. But (n+8) >> 3 == (n >> 3) + 1
			 * (modulo overflows, which we do not have).
			 *
			 * Finally, we LONG_ALIGN because all bitmap
			 * operations are on longs.
			 */
			/*
			 * Bytes needed to hold the whole bitmap for this order.
			 * Original annotation: each bit stands for one buddy
			 * pair of 2^i pages.
			 */
			bitmap_size = (size-1) >> (i+4);
			bitmap_size = LONG_ALIGN(bitmap_size+1);
			zone->free_area[i].map = 
			  (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
		}
	}
	/* Original annotation: builds the node's zone fallback lists. */
	build_zonelists(pgdat);
}

上述代码初始化管理区的空闲链表，并且分配一个位图，该位图被伙伴分配器用于记录伙伴页面对（page buddies）的状态。
在这里插入图片描述

综合图示

在这里插入图片描述

文中图示均为个人理解

参考文献：
[1] 白洛. 深入理解Linux虚拟内存管理. 2006-1
[2] Mel Gorman. Understanding the Linux Virtual Memory Manager. 2004-5-9