/*
 * kexec.c - kexec system call
 * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>

#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/system.h>
#include <asm/semaphore.h>

/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t* crash_notes;

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

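/*
 * Decide whether a dying task should trigger a switch to the loaded
 * crash kernel: crash if the oops happened in interrupt context, in the
 * idle task (pid 0), in init (pid 1), or if panic_on_oops is set.
 */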
int kexec_should_crash(struct task_struct *p)
{
	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
		return 1;
	return 0;
}

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial, and easy.  For
 * others it is still a simple predictable page table to setup.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the new
 * kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */

/*
 * KIMAGE_NO_DEST is an impossible destination address, used when
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image,
				       gfp_t gfp_mask,
				       unsigned long dest);

static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
			   unsigned long nr_segments,
			   struct kexec_segment __user *segments)
{
	size_t segment_bytes;
	struct kimage *image;
	unsigned long i;
	int result;

	/* Allocate a controlling structure */
	result = -ENOMEM;
	image = kmalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		goto out;

	memset(image, 0, sizeof(*image));
	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->start = entry;
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unuseable pages */
	INIT_LIST_HEAD(&image->unuseable_pages);

	/* Read in the segments */
	image->nr_segments = nr_segments;
	segment_bytes = nr_segments * sizeof(*segments);
	result = copy_from_user(image->segment, segments, segment_bytes);
	if (result)
		goto out;

	/*
	 * Verify we have good destination addresses.  The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM.  This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks, ensure
	 * the destination addresses are page aligned.  Too many
	 * special cases crop up when we don't do this.  The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			goto out;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			goto out;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through, very weird things can happen with no
	 * easy explanation as one segment stops on another.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;
			pstart = image->segment[j].mem;
			pend   = pstart + image->segment[j].memsz;
			/* Do the segments overlap ? */
			if ((mend > pstart) && (mstart < pend))
				goto out;
		}
	}

	/* Ensure our buffer sizes do not exceed
	 * our memory sizes.  This should always be the case,
	 * and it is easier to check up front than to be surprised
	 * later on.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;

}

static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
				unsigned long nr_segments,
				struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;

	/* Allocate and initialize a controlling structure */
	image = NULL;
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	*rimage = image;

	/*
	 * Find a location for the control code buffer, and add it to
	 * the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					   get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
 out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}

static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
				unsigned long nr_segments,
				struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;
	unsigned long i;

	image = NULL;
	/* Verify we have a valid entry point */
	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
		result = -EADDRNOTAVAIL;
		goto out;
	}

	/* Allocate and initialize a controlling structure */
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	/* Enable the special crash kernel control page
	 * allocation policy.
	 */
	image->control_page = crashk_res.start;
	image->type = KEXEC_TYPE_CRASH;

	/*
	 * Verify we have good destination addresses.  Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM.  But crash kernels are preloaded into a
	 * reserved area of RAM.  We must ensure the addresses
	 * are in the reserved area, otherwise preloading the
	 * kernel could corrupt things.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz - 1;
		/* Ensure we are within the crash kernel limits */
		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
			goto out;
	}

	/*
	 * Find a location for the control code buffer, and add it to
	 * the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					   get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}

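/*
 * Return 1 if any part of [start, end) overlaps the destination range
 * of one of the image's segments, 0 otherwise.
 */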
static int kimage_is_destination_range(struct kimage *image,
					unsigned long start,
					unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((end > mstart) && (start < mend))
			return 1;
	}

	return 0;
}

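/*
 * Allocate a block of 2^order pages for kexec's own use.  The order is
 * stashed in page_private() so that kimage_free_pages() knows how much
 * to free, and every page is marked reserved while kexec owns it.
 */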
static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *pages;

	pages = alloc_pages(gfp_mask, order);
	if (pages) {
		unsigned int count, i;
		pages->mapping = NULL;
		set_page_private(pages, order);
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);
	}

	return pages;
}

static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page_private(page);
	count = 1 << order;
	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
	__free_pages(page, order);
}

static void kimage_free_page_list(struct list_head *list)
{
	struct list_head *pos, *next;

	list_for_each_safe(pos, next, list) {
		struct page *page;

		page = list_entry(pos, struct page, lru);
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
							unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(GFP_KERNEL, order);
		if (!pages)
			break;
		pfn   = page_to_pfn(pages);
		epfn  = pfn + count;
		addr  = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
			      kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address.  Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}

static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel.  All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
	hole_end   = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
			break;
		if (hole_end > crashk_res.end)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend   = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = (mend + (size - 1)) & ~(size - 1);
				hole_end   = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			break;
		}
	}
	if (pages)
		image->control_page = hole_end;

	return pages;
}


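/*
 * Allocate control pages for this image.  Normal images take pages from
 * anywhere outside the destination ranges; crash images carve them out
 * of the reserved crashk_res region instead.
 */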
struct page *kimage_alloc_control_pages(struct kimage *image,
					 unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
	}

	return pages;
}

static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0)
		image->entry++;

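	/* If we have reached the last slot of the current indirection page,
	 * allocate a fresh page and chain to it with an IND_INDIRECTION
	 * entry; the final slot of every indirection page is reserved for
	 * that link.
	 */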
	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}

static int kimage_set_destination(struct kimage *image,
				   unsigned long destination)
{
	int result;

	destination &= PAGE_MASK;
	result = kimage_add_entry(image, destination | IND_DESTINATION);
	if (result == 0)
		image->destination = destination;

	return result;
}


static int kimage_add_page(struct kimage *image, unsigned long page)
{
	int result;

	page &= PAGE_MASK;
	result = kimage_add_entry(image, page | IND_SOURCE);
	if (result == 0)
		image->destination += PAGE_SIZE;

	return result;
}


static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unuseable pages I have cached */
	kimage_free_page_list(&image->unuseable_pages);

}

static int kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;

	return 0;
}

#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION)? \
			phys_to_virt((entry & PAGE_MASK)): ptr +1)

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

static void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

	kimage_free_extra_pages(image);
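	/* Walk the entry list freeing every source page.  Each indirection
	 * page is only freed after we have walked past it, since it is the
	 * page currently being iterated over.
	 */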
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		}
		else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);
	kfree(image);
}

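/*
 * Find the IND_SOURCE entry whose destination is @page, if any.  The
 * destination of a source entry is implicit: it starts at the preceding
 * IND_DESTINATION entry and advances one page per source entry.
 */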
static kimage_entry_t *kimage_dst_used(struct kimage *image,
					unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
					gfp_t gfp_mask,
					unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will occur is trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time.   If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used, file it away */
		if (page_to_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unuseable_pages);
			continue;
		}
		addr = page_to_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want, use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						  addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page.  And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it.
			 */
			addr = old_addr;
			page = old_page;
			break;
		} else {
			/* Place the page on the destination list; I
			 * will use it later.
			 */
			list_add(&page->lru, &image->dest_pages);
		}
	}

	return page;
}

static int kimage_load_normal_segment(struct kimage *image,
					 struct kexec_segment *segment)
{
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_pfn(page)
								<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap(page);
		/* Start with a clear page */
		memset(ptr, 0, PAGE_SIZE);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes)
			uchunk = ubytes;

		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_crash_segment(struct kimage *image,
					struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
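	/* The destination pages sit in the reserved crash region, so copy
	 * straight into them one page at a time, zero filling whatever the
	 * user buffer does not cover.
	 */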
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		ptr = kmap(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes) {
			uchunk = ubytes;
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}
		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_segment(struct kimage *image,
				struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
	}

	return result;
}

/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down, preventing on-going DMAs and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number
 *   and then copies the image to its final destination and
 *   jumps into the image at entry.
 *
 * kexec does not sync, or unmount filesystems so if you need
 * that to happen you need to do that yourself.
 */
struct kimage *kexec_image = NULL;
static struct kimage *kexec_crash_image = NULL;
/*
 * A home grown binary mutex.
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock = 0;

asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
				struct kexec_segment __user *segments,
				unsigned long flags)
{
	struct kimage **dest_image, *image;
	int locked;
	int result;

	/* We only trust the superuser with rebooting the system. */
	if (!capable(CAP_SYS_BOOT))
		return -EPERM;

	/*
	 * Verify we have a legal set of flags
	 * This leaves us room for future extensions.
	 */
	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
		return -EINVAL;

	/* Verify we are on the appropriate architecture */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
		return -EINVAL;

	/* Put an artificial cap on the number
	 * of segments passed to kexec_load.
	 */
	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	image = NULL;
	result = 0;

	/* Because we write directly to the reserved memory
	 * region when loading crash kernels we need a mutex here to
	 * prevent multiple crash kernels from attempting to load
	 * simultaneously, and to prevent a crash kernel from loading
	 * over the top of an in-use crash kernel.
	 *
	 * KISS: always take the mutex.
	 */
	locked = xchg(&kexec_lock, 1);
	if (locked)
		return -EBUSY;

	dest_image = &kexec_image;
	if (flags & KEXEC_ON_CRASH)
		dest_image = &kexec_crash_image;
	if (nr_segments > 0) {
		unsigned long i;

		/* Loading another kernel to reboot into */
		if ((flags & KEXEC_ON_CRASH) == 0)
			result = kimage_normal_alloc(&image, entry,
							nr_segments, segments);
		/* Loading another kernel to switch to if this one crashes */
		else if (flags & KEXEC_ON_CRASH) {
			/* Free any current crash dump kernel before
			 * we corrupt it.
			 */
			kimage_free(xchg(&kexec_crash_image, NULL));
			result = kimage_crash_alloc(&image, entry,
						     nr_segments, segments);
		}
		if (result)
			goto out;

		result = machine_kexec_prepare(image);
		if (result)
			goto out;

		for (i = 0; i < nr_segments; i++) {
			result = kimage_load_segment(image, &image->segment[i]);
			if (result)
				goto out;
		}
		result = kimage_terminate(image);
		if (result)
			goto out;
	}
	/* Install the new kernel and uninstall the old. */
	image = xchg(dest_image, image);

out:
	xchg(&kexec_lock, 0); /* Release the mutex */
	kimage_free(image);

	return result;
}

#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_kexec_load(unsigned long entry,
				unsigned long nr_segments,
				struct compat_kexec_segment __user *segments,
				unsigned long flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment out, __user *ksegments;
	unsigned long i, result;

	/* Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
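	/* Convert each 32-bit segment descriptor into the native layout in
	 * a scratch user-space buffer, then hand the whole array to
	 * sys_kexec_load().
	 */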
	for (i = 0; i < nr_segments; i++) {
		result = copy_from_user(&in, &segments[i], sizeof(in));
		if (result)
			return -EFAULT;

		out.buf   = compat_ptr(in.buf);
		out.bufsz = in.bufsz;
		out.mem   = in.mem;
		out.memsz = in.memsz;

		result = copy_to_user(&ksegments[i], &out, sizeof(out));
		if (result)
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif

void crash_kexec(struct pt_regs *regs)
{
	struct kimage *image;
	int locked;


	/* Take the kexec_lock here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient.  But since I reuse the memory...
	 */
	locked = xchg(&kexec_lock, 1);
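	/* xchg() acts as a trylock: if kexec_lock was already set, another
	 * CPU is inside sys_kexec_load() or crash_kexec(), so do nothing.
	 */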
	if (!locked) {
		image = xchg(&kexec_crash_image, NULL);
		if (image) {
			struct pt_regs fixed_regs;
			crash_setup_regs(&fixed_regs, regs);
			machine_crash_shutdown(&fixed_regs);
			machine_kexec(image);
		}
		xchg(&kexec_lock, 0);
	}
}

static int __init crash_notes_memory_init(void)
{
	/* Allocate memory for saving cpu registers. */
	crash_notes = alloc_percpu(note_buf_t);
	if (!crash_notes) {
		printk(KERN_ERR "Kexec: Memory allocation for saving cpu register"
		" states failed\n");
		return -ENOMEM;
	}
	return 0;
}
module_init(crash_notes_memory_init)