Mailing List Archive

[xen-unstable] merge with xen-unstable.hg
# HG changeset patch
# User Isaku Yamahata <yamahata@valinux.co.jp>
# Date 1221198460 -32400
# Node ID ec8eaab557d867dca3e8cbb3e0384d797929102a
# Parent 4ddd63b4be9be2440d213da60b10c20327e5c515
# Parent 346c073ed6a4f0debca36588039d649e2efd93c3
merge with xen-unstable.hg
---
.hgignore | 1
Config.mk | 4
docs/misc/vtd.txt | 27
docs/src/user.tex | 4
stubdom/README | 8
tools/examples/init.d/xendomains | 6
tools/examples/xend-config.sxp | 4
tools/examples/xmexample.hvm | 2
tools/examples/xmexample.hvm-stubdom | 2
tools/flask/policy/Makefile | 234 +++++
tools/flask/policy/Rules.modular | 166 +++
tools/flask/policy/Rules.monolithic | 196 ++++
tools/flask/policy/policy/constraints | 27
tools/flask/policy/policy/flask/Makefile | 41
tools/flask/policy/policy/flask/access_vectors | 166 +++
tools/flask/policy/policy/flask/initial_sids | 17
tools/flask/policy/policy/flask/mkaccess_vector.sh | 227 +++++
tools/flask/policy/policy/flask/mkflask.sh | 95 ++
tools/flask/policy/policy/flask/security_classes | 20
tools/flask/policy/policy/global_booleans | 5
tools/flask/policy/policy/global_tunables | 6
tools/flask/policy/policy/mcs | 324 +++++++
tools/flask/policy/policy/mls | 354 ++++++++
tools/flask/policy/policy/modules.conf | 21
tools/flask/policy/policy/modules/xen/xen.if | 1
tools/flask/policy/policy/modules/xen/xen.te | 135 +++
tools/flask/policy/policy/support/loadable_module.spt | 166 +++
tools/flask/policy/policy/support/misc_macros.spt | 32
tools/flask/policy/policy/systemuser | 19
tools/flask/policy/policy/users | 39
tools/ioemu/hw/cirrus_vga.c | 3
tools/ioemu/hw/pass-through.c | 146 +++
tools/ioemu/hw/pass-through.h | 15
tools/ioemu/hw/pci.c | 5
tools/ioemu/hw/pt-msi.c | 2
tools/ioemu/hw/vga.c | 8
tools/ioemu/hw/xen_machine_fv.c | 4
tools/ioemu/vl.h | 2
tools/libxc/ia64/xc_ia64_linux_save.c | 6
tools/libxc/xc_domain_save.c | 65 -
tools/libxc/xc_evtchn.c | 15
tools/libxc/xc_private.c | 10
tools/libxc/xenctrl.h | 6
tools/libxc/xenguest.h | 2
tools/python/Makefile | 26
tools/python/xen/util/xsconstants.py | 6
tools/python/xen/util/xsm/flask/flask.py | 8
tools/python/xen/util/xsm/xsm.py | 20
tools/python/xen/xend/XendConfig.py | 2
tools/python/xen/xend/XendDomainInfo.py | 6
tools/python/xen/xend/XendOptions.py | 8
tools/python/xen/xend/server/blkif.py | 2
tools/python/xen/xend/server/netif.py | 2
tools/python/xen/xend/server/pciif.py | 2
tools/python/xen/xm/create.py | 6
tools/python/xen/xm/main.py | 2
tools/xcutils/lsevtchn.c | 48 -
tools/xcutils/xc_save.c | 117 +-
tools/xenstore/xs.c | 7
tools/xentrace/formats | 149 ++-
tools/xentrace/xentrace.c | 399 ++++++++-
xen/arch/x86/acpi/Makefile | 2
xen/arch/x86/acpi/cpu_idle.c | 434 ++-------
xen/arch/x86/acpi/cpufreq/cpufreq.c | 26
xen/arch/x86/acpi/cpufreq/powernow.c | 4
xen/arch/x86/acpi/cpuidle_menu.c | 132 +++
xen/arch/x86/domain.c | 24
xen/arch/x86/domain_build.c | 1
xen/arch/x86/domctl.c | 47 -
xen/arch/x86/hpet.c | 30
xen/arch/x86/hvm/hvm.c | 5
xen/arch/x86/hvm/svm/intr.c | 4
xen/arch/x86/hvm/svm/svm.c | 36
xen/arch/x86/hvm/vmx/intr.c | 2
xen/arch/x86/hvm/vmx/vmx.c | 49 -
xen/arch/x86/io_apic.c | 13
xen/arch/x86/irq.c | 23
xen/arch/x86/mm.c | 783 +++++++++++-------
xen/arch/x86/mm/hap/hap.c | 1
xen/arch/x86/mm/shadow/common.c | 71 +
xen/arch/x86/mm/shadow/multi.c | 210 ++++
xen/arch/x86/mm/shadow/private.h | 43
xen/arch/x86/physdev.c | 80 -
xen/arch/x86/platform_hypercall.c | 16
xen/arch/x86/smpboot.c | 40
xen/arch/x86/time.c | 7
xen/arch/x86/traps.c | 45 +
xen/common/domain.c | 4
xen/common/domctl.c | 19
xen/common/event_channel.c | 21
xen/common/rangeset.c | 9
xen/common/sched_credit.c | 5
xen/common/schedule.c | 123 ++
xen/common/sysctl.c | 12
xen/common/trace.c | 45 -
xen/drivers/acpi/hwregs.c | 2
xen/drivers/passthrough/iommu.c | 4
xen/drivers/passthrough/vtd/iommu.c | 22
xen/include/asm-ia64/shadow.h | 2
xen/include/asm-x86/bitops.h | 4
xen/include/asm-x86/guest_access.h | 6
xen/include/asm-x86/hvm/trace.h | 49 -
xen/include/asm-x86/io_apic.h | 2
xen/include/asm-x86/mm.h | 38
xen/include/asm-x86/msr-index.h | 12
xen/include/asm-x86/shadow.h | 2
xen/include/public/trace.h | 51 -
xen/include/xen/cpuidle.h | 82 +
xen/include/xen/iommu.h | 1
xen/include/xen/sched.h | 22
xen/include/xen/trace.h | 2
xen/include/xsm/xsm.h | 148 ++-
xen/xsm/dummy.c | 130 ++
xen/xsm/flask/hooks.c | 318 ++++++-
xen/xsm/flask/include/av_perm_to_string.h | 21
xen/xsm/flask/include/av_permissions.h | 63 -
xen/xsm/flask/include/flask.h | 11
xen/xsm/flask/include/initial_sid_to_string.h | 3
xen/xsm/flask/include/security.h | 6
xen/xsm/flask/ss/policydb.h | 13
xen/xsm/flask/ss/services.c | 40
121 files changed, 5439 insertions(+), 1429 deletions(-)

diff -r 4ddd63b4be9b -r ec8eaab557d8 .hgignore
--- a/.hgignore Fri Sep 12 14:32:45 2008 +0900
+++ b/.hgignore Fri Sep 12 14:47:40 2008 +0900
@@ -185,7 +185,6 @@
^tools/misc/xenperf$
^tools/pygrub/build/.*$
^tools/python/build/.*$
-^tools/python/xen/util/xsm/xsm\.py$
^tools/security/secpol_tool$
^tools/security/xen/.*$
^tools/security/xensec_tool$
diff -r 4ddd63b4be9b -r ec8eaab557d8 Config.mk
--- a/Config.mk Fri Sep 12 14:32:45 2008 +0900
+++ b/Config.mk Fri Sep 12 14:47:40 2008 +0900
@@ -86,11 +86,7 @@ QEMU_REMOTE=http://xenbits.xensource.com
# Mercurial in-tree version, or a local directory, or a git URL.
# CONFIG_QEMU ?= ioemu
# CONFIG_QEMU ?= ../qemu-xen.git
-ifeq ($(XEN_TARGET_ARCH),ia64)
-CONFIG_QEMU ?= ioemu
-else
CONFIG_QEMU ?= $(QEMU_REMOTE)
-endif

# Optional components
XENSTAT_XENTOP ?= y
diff -r 4ddd63b4be9b -r ec8eaab557d8 docs/misc/vtd.txt
--- a/docs/misc/vtd.txt Fri Sep 12 14:32:45 2008 +0900
+++ b/docs/misc/vtd.txt Fri Sep 12 14:47:40 2008 +0900
@@ -1,8 +1,9 @@ Title : How to do PCI Passthrough with
Title : How to do PCI Passthrough with VT-d
Authors : Allen Kay <allen.m.kay@intel.com>
Weidong Han <weidong.han@intel.com>
+ Yuji Shimada <shimada-yxb@necst.nec.co.jp>
Created : October-24-2007
-Updated : August-06-2008
+Updated : September-09-2008

How to turn on VT-d in Xen
--------------------------
@@ -106,3 +107,27 @@ http://h10010.www1.hp.com/wwpc/us/en/en/

For more information, pls refer to http://wiki.xensource.com/xenwiki/VTdHowTo.

+
+Assigning devices to HVM domains
+--------------------------------
+
+Most device types such as NIC, HBA, EHCI and UHCI can be assigned to
+an HVM domain.
+
+But some devices have design features which make them unsuitable for
+assignment to an HVM domain. Examples include:
+
+ * Device has an internal resource, such as private memory, which is
+ mapped to memory address space with BAR (Base Address Register).
+ * Driver submits command with a pointer to a buffer within internal
+ resource. Device decodes the pointer (address), and accesses to the
+ buffer.
+
+In an HVM domain, the BAR is virtualized, and host-BAR value and
+guest-BAR value are different. The addresses of internal resource from
+device's view and driver's view are different. Similarly, the
+addresses of buffer within internal resource from device's view and
+driver's view are different. As a result, device can't access to the
+buffer specified by driver.
+
+Such devices assigned to HVM domain currently do not work.
diff -r 4ddd63b4be9b -r ec8eaab557d8 docs/src/user.tex
--- a/docs/src/user.tex Fri Sep 12 14:32:45 2008 +0900
+++ b/docs/src/user.tex Fri Sep 12 14:47:40 2008 +0900
@@ -4252,7 +4252,7 @@ directory of the Xen source distribution
\section{Online References}

The official Xen web site can be found at:
-\begin{quote} {\tt http://www.xensource.com}
+\begin{quote} {\tt http://www.xen.org}
\end{quote}

This contains links to the latest versions of all online
@@ -4282,7 +4282,7 @@ mailing lists and subscription informati
Subscribe at: \\
{\small {\tt http://lists.xensource.com/xen-announce}}
\item[xen-changelog@lists.xensource.com] Changelog feed
- from the unstable and 2.0 trees - developer oriented. Subscribe at: \\
+ from the unstable and 3.x trees - developer oriented. Subscribe at: \\
{\small {\tt http://lists.xensource.com/xen-changelog}}
\end{description}

diff -r 4ddd63b4be9b -r ec8eaab557d8 stubdom/README
--- a/stubdom/README Fri Sep 12 14:32:45 2008 +0900
+++ b/stubdom/README Fri Sep 12 14:47:40 2008 +0900
@@ -27,7 +27,7 @@ device_model = '/usr/lib/xen/bin/stubdom
- disable anything related to dom0, like pty serial assignments


-Create /etc/xen/stubdom-hvmconfig (where "hvmconfig" is the name of your HVM
+Create /etc/xen/hvmconfig-dm (where "hvmconfig" is the name of your HVM
guest) with

kernel = "/usr/lib/xen/boot/ioemu-stubdom.gz"
@@ -52,7 +52,7 @@ vnc = 0
vnc = 0
sdl = 0

- - In stubdom-hvmconfig, set an sdl vfb:
+ - In hvmconfig-dm, set an sdl vfb:

vfb = [ 'type=sdl' ]

@@ -65,7 +65,7 @@ vnc = 1
vnc = 1
vnclisten = "172.30.206.1"

- - In stubdom-hvmconfig, fill the reserved vif with the same IP, for instance:
+ - In hvmconfig-dm, fill the reserved vif with the same IP, for instance:

vif = [ 'ip=172.30.206.1', 'ip=10.0.1.1,mac=aa:00:00:12:23:34']

@@ -76,7 +76,7 @@ vnc = 0
vnc = 0
sdl = 0

- - In stubdom-hvmconfig, set a vnc vfb:
+ - In hvmconfig-dm, set a vnc vfb:

vfb = [ 'type=vnc' ]

diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/examples/init.d/xendomains
--- a/tools/examples/init.d/xendomains Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/examples/init.d/xendomains Fri Sep 12 14:47:40 2008 +0900
@@ -327,15 +327,17 @@ stop()
if test $id = 0; then continue; fi
echo -n " $name"
if test "$XENDOMAINS_AUTO_ONLY" = "true"; then
- case $name in
+ eval "
+ case \"\$name\" in
($NAMES)
# nothing
;;
(*)
- echo -n "(skip)"
+ echo -n '(skip)'
continue
;;
esac
+ "
fi
# XENDOMAINS_SYSRQ chould be something like just "s"
# or "s e i u" or even "s e s i u o"
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/examples/xend-config.sxp
--- a/tools/examples/xend-config.sxp Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/examples/xend-config.sxp Fri Sep 12 14:47:40 2008 +0900
@@ -14,6 +14,10 @@
#(logfile /var/log/xen/xend.log)
#(loglevel DEBUG)

+# Uncomment the line below. Set the value to flask, acm, or dummy to
+# select a security module.
+
+#(xsm_module_name dummy)

# The Xen-API server configuration.
#
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/examples/xmexample.hvm
--- a/tools/examples/xmexample.hvm Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/examples/xmexample.hvm Fri Sep 12 14:47:40 2008 +0900
@@ -220,7 +220,7 @@ serial='pty'
# Configure guest CPUID responses:
#
#cpuid=[ '1:ecx=xxxxxxxxxxx00xxxxxxxxxxxxxxxxxxx,
-# eax=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' ]
+# eax=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' ]
# - Unset the SSE4 features (CPUID.1[ECX][20-19])
# - Default behaviour for all other bits in ECX And EAX registers.
#
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/examples/xmexample.hvm-stubdom
--- a/tools/examples/xmexample.hvm-stubdom Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/examples/xmexample.hvm-stubdom Fri Sep 12 14:47:40 2008 +0900
@@ -236,7 +236,7 @@ stdvga=0
# Configure guest CPUID responses:
#
#cpuid=[ '1:ecx=xxxxxxxxxxx00xxxxxxxxxxxxxxxxxxx,
-# eax=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' ]
+# eax=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' ]
# - Unset the SSE4 features (CPUID.1[ECX][20-19])
# - Default behaviour for all other bits in ECX And EAX registers.
#
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/Makefile Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,234 @@
+#
+# Makefile for the security policy.
+#
+# Targets:
+#
+# install - compile and install the policy configuration, and context files.
+# load - compile, install, and load the policy configuration.
+# reload - compile, install, and load/reload the policy configuration.
+# policy - compile the policy configuration locally for testing/development.
+#
+# The default target is 'policy'.
+#
+
+########################################
+#
+# Configurable portions of the Makefile
+#
+
+# Policy version
+# By default, checkpolicy will create the highest
+# version policy it supports. Setting this will
+# override the version.
+OUTPUT_POLICY = 20
+
+# Policy Type
+# strict, targeted,
+# strict-mls, targeted-mls,
+# strict-mcs, targeted-mcs
+TYPE = strict
+
+# Policy Name
+# If set, this will be used as the policy
+# name. Otherwise the policy type will be
+# used for the name.
+NAME = xenrefpolicy
+
+# Distribution
+# Some distributions have portions of policy
+# for programs or configurations specific to the
+# distribution. Setting this will enable options
+# for the distribution.
+# redhat, gentoo, debian, and suse are current options.
+# Fedora users should enable redhat.
+#DISTRO =
+
+# Build monolithic policy. Putting n here
+# will build a loadable module policy.
+MONOLITHIC=y
+
+# Uncomment this to disable command echoing
+#QUIET:=@
+
+########################################
+#
+# NO OPTIONS BELOW HERE
+#
+
+# executable paths
+PREFIX := /usr
+BINDIR := $(PREFIX)/bin
+SBINDIR := $(PREFIX)/sbin
+CHECKPOLICY := $(BINDIR)/checkpolicy
+CHECKMODULE := $(BINDIR)/checkmodule
+SEMOD_PKG := $(BINDIR)/semodule_package
+LOADPOLICY := $(SBINDIR)/flask-loadpolicy
+
+CFLAGS := -Wall
+
+# policy source layout
+POLDIR := policy
+MODDIR := $(POLDIR)/modules
+FLASKDIR := $(POLDIR)/flask
+SECCLASS := $(FLASKDIR)/security_classes
+ISIDS := $(FLASKDIR)/initial_sids
+AVS := $(FLASKDIR)/access_vectors
+
+#policy building support tools
+SUPPORT := support
+FCSORT := tmp/fc_sort
+
+# config file paths
+GLOBALTUN := $(POLDIR)/global_tunables
+GLOBALBOOL := $(POLDIR)/global_booleans
+MOD_CONF := $(POLDIR)/modules.conf
+TUNABLES := $(POLDIR)/tunables.conf
+BOOLEANS := $(POLDIR)/booleans.conf
+
+# install paths
+TOPDIR = $(DESTDIR)/etc/xen/
+INSTALLDIR = $(TOPDIR)/$(NAME)
+SRCPATH = $(INSTALLDIR)/src
+USERPATH = $(INSTALLDIR)/users
+CONTEXTPATH = $(INSTALLDIR)/contexts
+
+# enable MLS if requested.
+ifneq ($(findstring -mls,$(TYPE)),)
+ override M4PARAM += -D enable_mls
+ CHECKPOLICY += -M
+ CHECKMODULE += -M
+endif
+
+# enable MLS if MCS requested.
+ifneq ($(findstring -mcs,$(TYPE)),)
+ override M4PARAM += -D enable_mcs
+ CHECKPOLICY += -M
+ CHECKMODULE += -M
+endif
+
+# compile targeted policy if requested.
+ifneq ($(findstring targeted,$(TYPE)),)
+ override M4PARAM += -D targeted_policy
+endif
+
+# enable distribution-specific policy
+ifneq ($(DISTRO),)
+ override M4PARAM += -D distro_$(DISTRO)
+endif
+
+ifneq ($(OUTPUT_POLICY),)
+ CHECKPOLICY += -c $(OUTPUT_POLICY)
+endif
+
+ifeq ($(NAME),)
+ NAME := $(TYPE)
+endif
+
+# determine the policy version and current kernel version if possible
+PV := $(shell $(CHECKPOLICY) -V |cut -f 1 -d ' ')
+KV := $(shell cat /selinux/policyvers)
+
+# dont print version warnings if we are unable to determine
+# the currently running kernel's policy version
+ifeq ($(KV),)
+ KV := $(PV)
+endif
+
+FC := file_contexts
+POLVER := policy.$(PV)
+
+M4SUPPORT = $(wildcard $(POLDIR)/support/*.spt)
+
+APPCONF := config/appconfig-$(TYPE)
+APPDIR := $(CONTEXTPATH)
+APPFILES := $(INSTALLDIR)/booleans
+CONTEXTFILES += $(wildcard $(APPCONF)/*_context*) $(APPCONF)/media
+USER_FILES := $(POLDIR)/systemuser $(POLDIR)/users
+
+ALL_LAYERS := $(filter-out $(MODDIR)/CVS,$(shell find $(wildcard $(MODDIR)/*) -maxdepth 0 -type d))
+
+GENERATED_TE := $(basename $(foreach dir,$(ALL_LAYERS),$(wildcard $(dir)/*.te.in)))
+GENERATED_IF := $(basename $(foreach dir,$(ALL_LAYERS),$(wildcard $(dir)/*.if.in)))
+GENERATED_FC := $(basename $(foreach dir,$(ALL_LAYERS),$(wildcard $(dir)/*.fc.in)))
+
+# sort here since it removes duplicates, which can happen
+# when a generated file is already generated
+DETECTED_MODS := $(sort $(foreach dir,$(ALL_LAYERS),$(wildcard $(dir)/*.te)) $(GENERATED_TE))
+
+# modules.conf setting for base module
+MODBASE := base
+
+# modules.conf setting for module
+MODMOD := module
+
+# extract settings from modules.conf
+BASE_MODS := $(foreach mod,$(shell awk '/^[[:blank:]]*[[:alpha:]]/{ if ($$3 == "$(MODBASE)") print $$1 }' $(MOD_CONF) 2> /dev/null),$(subst ./,,$(shell find -iname $(mod).te)))
+MOD_MODS := $(foreach mod,$(shell awk '/^[[:blank:]]*[[:alpha:]]/{ if ($$3 == "$(MODMOD)") print $$1 }' $(MOD_CONF) 2> /dev/null),$(subst ./,,$(shell find -iname $(mod).te)))
+
+HOMEDIR_TEMPLATE = tmp/homedir_template
+
+########################################
+#
+# Load appropriate rules
+#
+
+ifeq ($(MONOLITHIC),y)
+ include Rules.monolithic
+else
+ include Rules.modular
+endif
+
+########################################
+#
+# Create config files
+#
+conf: $(MOD_CONF) $(BOOLEANS) $(GENERATED_TE) $(GENERATED_IF) $(GENERATED_FC)
+
+$(MOD_CONF) $(BOOLEANS): $(POLXML)
+ @echo "Updating $(MOD_CONF) and $(BOOLEANS)"
+ $(QUIET) cd $(DOCS) && ../$(GENDOC) -t ../$(BOOLEANS) -m ../$(MOD_CONF) -x ../$(POLXML)
+
+########################################
+#
+# Appconfig files
+#
+install-appconfig: $(APPFILES)
+
+$(INSTALLDIR)/booleans: $(BOOLEANS)
+ @mkdir -p $(INSTALLDIR)
+ $(QUIET) egrep '^[[:blank:]]*[[:alpha:]]' $(BOOLEANS) \
+ | sed -e 's/false/0/g' -e 's/true/1/g' > tmp/booleans
+ $(QUIET) install -m 644 tmp/booleans $@
+
+########################################
+#
+# Install policy sources
+#
+install-src:
+ rm -rf $(SRCPATH)/policy.old
+ -mv $(SRCPATH)/policy $(SRCPATH)/policy.old
+ mkdir -p $(SRCPATH)/policy
+ cp -R . $(SRCPATH)/policy
+
+########################################
+#
+# Clean everything
+#
+bare: clean
+ rm -f $(POLXML)
+ rm -f $(SUPPORT)/*.pyc
+ rm -f $(FCSORT)
+ rm -f $(MOD_CONF)
+ rm -f $(BOOLEANS)
+ rm -fR $(HTMLDIR)
+ifneq ($(GENERATED_TE),)
+ rm -f $(GENERATED_TE)
+endif
+ifneq ($(GENERATED_IF),)
+ rm -f $(GENERATED_IF)
+endif
+ifneq ($(GENERATED_FC),)
+ rm -f $(GENERATED_FC)
+endif
+
+.PHONY: install-src install-appconfig conf html bare
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/Rules.modular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/Rules.modular Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,166 @@
+########################################
+#
+# Rules and Targets for building modular policies
+#
+
+ALL_MODULES := $(filter $(BASE_MODS) $(MOD_MODS),$(DETECTED_MODS))
+ALL_INTERFACES := $(ALL_MODULES:.te=.if)
+
+BASE_PKG := base.pp
+BASE_FC := base.fc
+
+BASE_SECTIONS := tmp/pre_te_files.conf tmp/generated_definitions.conf tmp/all_interfaces.conf tmp/all_attrs_types.conf $(GLOBALBOOL) $(GLOBALTUN) tmp/only_te_rules.conf tmp/all_post.conf
+
+BASE_PRE_TE_FILES := $(SECCLASS) $(ISIDS) $(AVS) $(M4SUPPORT) $(POLDIR)/mls $(POLDIR)/mcs
+BASE_TE_FILES := $(BASE_MODS)
+BASE_POST_TE_FILES := $(POLDIR)/systemuser $(POLDIR)/constraints
+BASE_FC_FILES := $(BASE_MODS:.te=.fc)
+
+MOD_MODULES := $(MOD_MODS:.te=.mod)
+MOD_PKGS := $(notdir $(MOD_MODS:.te=.pp))
+
+# search layer dirs for source files
+vpath %.te $(ALL_LAYERS)
+vpath %.if $(ALL_LAYERS)
+vpath %.fc $(ALL_LAYERS)
+
+########################################
+#
+# default action: create all module packages
+#
+default: base
+
+base: $(BASE_PKG)
+
+modules: $(MOD_PKGS)
+
+#policy: $(POLVER)
+#install: $(LOADPATH) $(FCPATH) $(APPFILES) $(USERPATH)/local.users
+#load: tmp/load
+
+########################################
+#
+# Create a base module package
+#
+$(BASE_PKG): tmp/base.mod $(BASE_FC)
+ @echo "Creating $(NAME) base module package"
+ $(QUIET) $(SEMOD_PKG) $@ $^
+
+########################################
+#
+# Compile a base module
+#
+tmp/base.mod: base.conf
+ @echo "Compiling $(NAME) base module"
+ $(QUIET) $(CHECKMODULE) $^ -o $@
+
+########################################
+#
+# Construct a base module policy.conf
+#
+base.conf: $(BASE_SECTIONS)
+ @echo "Creating $(NAME) base module policy.conf"
+# checkpolicy can use the #line directives provided by -s for error reporting:
+ $(QUIET) m4 -D self_contained_policy $(M4PARAM) -s $^ > tmp/$@.tmp
+ $(QUIET) sed -e /^portcon/d -e /^nodecon/d -e /^netifcon/d < tmp/$@.tmp > $@
+# the ordering of these ocontexts matters:
+ $(QUIET) grep ^portcon tmp/$@.tmp >> $@ || true
+ $(QUIET) grep ^netifcon tmp/$@.tmp >> $@ || true
+ $(QUIET) grep ^nodecon tmp/$@.tmp >> $@ || true
+
+tmp/pre_te_files.conf: $(BASE_PRE_TE_FILES)
+ @test -d tmp || mkdir -p tmp
+ $(QUIET) cat $^ > $@
+
+tmp/generated_definitions.conf: $(ALL_LAYERS) $(BASE_TE_FILES)
+ @test -d tmp || mkdir -p tmp
+# define all available object classes
+ $(QUIET) $(GENPERM) $(AVS) $(SECCLASS) > $@
+# per-userdomain templates
+ $(QUIET) echo "define(\`per_userdomain_templates',\`" >> $@
+ $(QUIET) for i in $(patsubst %.te,%,$(notdir $(ALL_MODULES))); do \
+ echo "ifdef(\`""$$i""_per_userdomain_template',\`""$$i""_per_userdomain_template("'$$*'")')" \
+ >> $@ ;\
+ done
+ $(QUIET) echo "')" >> $@
+# define foo.te
+ $(QUIET) for i in $(notdir $(BASE_TE_FILES)); do \
+ echo "define(\`$$i')" >> $@ ;\
+ done
+ $(QUIET) $(SETTUN) $(BOOLEANS) >> $@
+
+tmp/all_interfaces.conf: $(M4SUPPORT) $(ALL_INTERFACES)
+ifeq ($(ALL_INTERFACES),)
+ $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf")
+endif
+ @test -d tmp || mkdir -p tmp
+ $(QUIET) m4 $^ | sed -e s/dollarsstar/\$$\*/g > $@
+
+tmp/all_te_files.conf: $(BASE_TE_FILES)
+ifeq ($(BASE_TE_FILES),)
+ $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf")
+endif
+ @test -d tmp || mkdir -p tmp
+ $(QUIET) cat $^ > $@
+
+tmp/post_te_files.conf: $(BASE_POST_TE_FILES)
+ @test -d tmp || mkdir -p tmp
+ $(QUIET) cat $^ > $@
+
+# extract attributes and put them first. extract post te stuff
+# like genfscon and put last. portcon, nodecon, and netifcon
+# is delayed since they are generated by m4
+tmp/all_attrs_types.conf tmp/only_te_rules.conf tmp/all_post.conf: tmp/all_te_files.conf tmp/post_te_files.conf
+ $(QUIET) grep ^attribute tmp/all_te_files.conf > tmp/all_attrs_types.conf || true
+ $(QUIET) grep '^type ' tmp/all_te_files.conf >> tmp/all_attrs_types.conf
+ $(QUIET) cat tmp/post_te_files.conf > tmp/all_post.conf
+ $(QUIET) grep '^sid ' tmp/all_te_files.conf >> tmp/all_post.conf || true
+ $(QUIET) egrep '^fs_use_(xattr|task|trans)' tmp/all_te_files.conf >> tmp/all_post.conf || true
+ $(QUIET) grep ^genfscon tmp/all_te_files.conf >> tmp/all_post.conf || true
+ $(QUIET) sed -r -e /^attribute/d -e '/^type /d' -e /^genfscon/d \
+ -e '/^sid /d' -e '/^fs_use_(xattr|task|trans)/d' \
+ < tmp/all_te_files.conf > tmp/only_te_rules.conf
+
+########################################
+#
+# Construct base module file contexts
+#
+$(BASE_FC): $(M4SUPPORT) tmp/generated_definitions.conf $(BASE_FC_FILES) $(FCSORT)
+ifeq ($(BASE_FC_FILES),)
+ $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf")
+endif
+ @echo "Creating $(NAME) base module file contexts."
+ @test -d tmp || mkdir -p tmp
+ $(QUIET) m4 $(M4PARAM) $(M4SUPPORT) tmp/generated_definitions.conf $(BASE_FC_FILES) > tmp/$@.tmp
+ $(QUIET) grep -e HOME -e ROLE tmp/$@.tmp > $(HOMEDIR_TEMPLATE)
+ $(QUIET) sed -i -e /HOME/d -e /ROLE/d tmp/$@.tmp
+ $(QUIET) $(FCSORT) tmp/$@.tmp $@
+
+########################################
+#
+# Build module packages
+#
+tmp/%.mod: $(M4SUPPORT) tmp/generated_definitions.conf tmp/all_interfaces.conf %.te
+ @if test -z "$(filter $^,$(MOD_MODS))"; then \
+ echo "The $(notdir $(basename $@)) module is not configured to be compiled as a lodable module." ;\
+ false ;\
+ fi
+ @echo "Compliling $(NAME) $(@F) module"
+ $(QUIET) m4 $(M4PARAM) -s $^ > $(@:.mod=.tmp)
+ $(QUIET) $(CHECKMODULE) -m $(@:.mod=.tmp) -o $@
+
+%.pp: tmp/%.mod %.fc
+ @echo "Creating $(NAME) $(@F) policy package"
+ $(QUIET) $(SEMOD_PKG) $@ $^
+
+########################################
+#
+# Clean the sources
+#
+clean:
+ rm -fR tmp
+ rm -f base.conf
+ rm -f *.pp
+ rm -f $(BASE_FC)
+
+.PHONY: default base modules clean
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/Rules.monolithic
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/Rules.monolithic Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,196 @@
+########################################
+#
+# Rules and Targets for building monolithic policies
+#
+
+# install paths
+POLICYPATH = $(INSTALLDIR)/policy
+LOADPATH = $(POLICYPATH)/$(POLVER)
+FCPATH = $(CONTEXTPATH)/files/file_contexts
+HOMEDIRPATH = $(CONTEXTPATH)/files/homedir_template
+
+# for monolithic policy use all base and module to create policy
+ENABLEMOD := $(BASE_MODS) $(MOD_MODS)
+
+ALL_MODULES := $(filter $(ENABLEMOD),$(DETECTED_MODS))
+
+ALL_INTERFACES := $(ALL_MODULES:.te=.if)
+ALL_TE_FILES := $(ALL_MODULES)
+ALL_FC_FILES := $(ALL_MODULES:.te=.fc)
+
+PRE_TE_FILES := $(SECCLASS) $(ISIDS) $(AVS) $(M4SUPPORT) $(POLDIR)/mls $(POLDIR)/mcs
+POST_TE_FILES := $(POLDIR)/systemuser $(POLDIR)/users $(POLDIR)/constraints
+
+POLICY_SECTIONS := tmp/pre_te_files.conf tmp/generated_definitions.conf tmp/all_interfaces.conf tmp/all_attrs_types.conf $(GLOBALBOOL) $(GLOBALTUN) tmp/only_te_rules.conf tmp/all_post.conf
+
+########################################
+#
+# default action: build policy locally
+#
+default: policy
+
+policy: $(POLVER)
+
+install: $(LOADPATH) $(FCPATH) $(APPFILES) $(USERPATH)/local.users
+
+load: tmp/load
+
+########################################
+#
+# Build a binary policy locally
+#
+$(POLVER): policy.conf
+ @echo "Compiling $(NAME) $(POLVER)"
+ifneq ($(PV),$(KV))
+ @echo
+ @echo "WARNING: Policy version mismatch! Is your OUTPUT_POLICY set correctly?"
+ @echo
+endif
+ $(QUIET) $(CHECKPOLICY) $^ -o $@
+
+########################################
+#
+# Install a binary policy
+#
+$(LOADPATH): policy.conf
+ @mkdir -p $(POLICYPATH)
+ @echo "Compiling and installing $(NAME) $(LOADPATH)"
+ifneq ($(PV),$(KV))
+ @echo
+ @echo "WARNING: Policy version mismatch! Is your OUTPUT_POLICY set correctly?"
+ @echo
+endif
+ $(QUIET) $(CHECKPOLICY) $^ -o $@
+
+########################################
+#
+# Load the binary policy
+#
+reload tmp/load: $(LOADPATH) $(FCPATH)
+ @echo "Loading $(NAME) $(LOADPATH)"
+ $(QUIET) $(LOADPOLICY) -q $(LOADPATH)
+ @touch tmp/load
+
+########################################
+#
+# Construct a monolithic policy.conf
+#
+policy.conf: $(POLICY_SECTIONS)
+ @echo "Creating $(NAME) policy.conf"
+# checkpolicy can use the #line directives provided by -s for error reporting:
+ $(QUIET) m4 -D self_contained_policy $(M4PARAM) -s $^ > tmp/$@.tmp
+ $(QUIET) sed -e /^portcon/d -e /^nodecon/d -e /^netifcon/d < tmp/$@.tmp > $@
+
+tmp/pre_te_files.conf: $(PRE_TE_FILES)
+ @test -d tmp || mkdir -p tmp
+ $(QUIET) cat $^ > $@
+
+tmp/generated_definitions.conf: $(ALL_LAYERS) $(ALL_TE_FILES)
+# per-userdomain templates:
+ @test -d tmp || mkdir -p tmp
+ $(QUIET) echo "define(\`per_userdomain_templates',\`" > $@
+ $(QUIET) for i in $(patsubst %.te,%,$(notdir $(ALL_MODULES))); do \
+ echo "ifdef(\`""$$i""_per_userdomain_template',\`""$$i""_per_userdomain_template("'$$*'")')" \
+ >> $@ ;\
+ done
+ $(QUIET) echo "')" >> $@
+# define foo.te
+ $(QUIET) for i in $(notdir $(ALL_MODULES)); do \
+ echo "define(\`$$i')" >> $@ ;\
+ done
+# $(QUIET) $(SETTUN) $(BOOLEANS) >> $@
+
+tmp/all_interfaces.conf: $(M4SUPPORT) $(ALL_INTERFACES)
+ifeq ($(ALL_INTERFACES),)
+ $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf")
+endif
+ @test -d tmp || mkdir -p tmp
+ $(QUIET) m4 $^ | sed -e s/dollarsstar/\$$\*/g > $@
+
+tmp/all_te_files.conf: $(ALL_TE_FILES)
+ifeq ($(ALL_TE_FILES),)
+ $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf")
+endif
+ @test -d tmp || mkdir -p tmp
+ $(QUIET) cat $^ > $@
+
+tmp/post_te_files.conf: $(POST_TE_FILES)
+ @test -d tmp || mkdir -p tmp
+ $(QUIET) cat $^ > $@
+
+# extract attributes and put them first. extract post te stuff
+# like genfscon and put last. portcon, nodecon, and netifcon
+# is delayed since they are generated by m4
+tmp/all_attrs_types.conf tmp/only_te_rules.conf tmp/all_post.conf: tmp/all_te_files.conf tmp/post_te_files.conf
+ $(QUIET) grep ^attribute tmp/all_te_files.conf > tmp/all_attrs_types.conf || true
+ $(QUIET) grep '^type ' tmp/all_te_files.conf >> tmp/all_attrs_types.conf
+ $(QUIET) cat tmp/post_te_files.conf > tmp/all_post.conf
+ $(QUIET) grep '^sid ' tmp/all_te_files.conf >> tmp/all_post.conf || true
+ $(QUIET) egrep '^fs_use_(xattr|task|trans)' tmp/all_te_files.conf >> tmp/all_post.conf || true
+ $(QUIET) grep ^genfscon tmp/all_te_files.conf >> tmp/all_post.conf || true
+ $(QUIET) sed -r -e /^attribute/d -e '/^type /d' -e /^genfscon/d \
+ -e '/^sid /d' -e '/^fs_use_(xattr|task|trans)/d' \
+ < tmp/all_te_files.conf > tmp/only_te_rules.conf
+
+########################################
+#
+# Remove the dontaudit rules from the policy.conf
+#
+enableaudit: policy.conf
+ @test -d tmp || mkdir -p tmp
+ @echo "Removing dontaudit rules from policy.conf"
+ $(QUIET) grep -v dontaudit policy.conf > tmp/policy.audit
+ $(QUIET) mv tmp/policy.audit policy.conf
+
+########################################
+#
+# Construct file_contexts
+#
+$(FC): $(M4SUPPORT) tmp/generated_definitions.conf $(ALL_FC_FILES)
+ifeq ($(ALL_FC_FILES),)
+ $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf")
+endif
+ @echo "Creating $(NAME) file_contexts."
+ @test -d tmp || mkdir -p tmp
+ $(QUIET) m4 $(M4PARAM) $(M4SUPPORT) tmp/generated_definitions.conf $(ALL_FC_FILES) > tmp/$@.tmp
+# $(QUIET) grep -e HOME -e ROLE tmp/$@.tmp > $(HOMEDIR_TEMPLATE)
+# $(QUIET) sed -i -e /HOME/d -e /ROLE/d tmp/$@.tmp
+# $(QUIET) $(FCSORT) tmp/$@.tmp $@
+ $(QUIET) touch $(HOMEDIR_TEMPLATE)
+ $(QUIET) touch $@
+
+########################################
+#
+# Install file_contexts
+#
+$(FCPATH): $(FC) $(LOADPATH) $(USERPATH)/system.users
+ @echo "Validating $(NAME) file_contexts."
+# $(QUIET) $(SETFILES) -q -c $(LOADPATH) $(FC)
+ @echo "Installing file_contexts."
+ @mkdir -p $(CONTEXTPATH)/files
+ $(QUIET) install -m 644 $(FC) $(FCPATH)
+ $(QUIET) install -m 644 $(HOMEDIR_TEMPLATE) $(HOMEDIRPATH)
+# $(QUIET) $(GENHOMEDIRCON) -d $(TOPDIR) -t $(NAME) $(USEPWD)
+
+########################################
+#
+# Run policy source checks
+#
+check: policy.conf $(FC)
+ $(SECHECK) -s --profile=development --policy=policy.conf --fcfile=$(FC) > $@.res
+
+longcheck: policy.conf $(FC)
+ $(SECHECK) -s --profile=all --policy=policy.conf --fcfile=$(FC) > $@.res
+
+########################################
+#
+# Clean the sources
+#
+clean:
+ rm -fR tmp
+ rm -f policy.conf
+ rm -f policy.$(PV)
+ rm -f $(FC)
+ rm -f *.res
+
+.PHONY: default policy install load reload enableaudit checklabels restorelabels relabel check longcheck clean
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/constraints
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/constraints Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,27 @@
+
+#
+# Define the constraints
+#
+# constrain class_set perm_set expression ;
+#
+# expression : ( expression )
+# | not expression
+# | expression and expression
+# | expression or expression
+# | u1 op u2
+# | r1 role_op r2
+# | t1 op t2
+# | u1 op names
+# | u2 op names
+# | r1 op names
+# | r2 op names
+# | t1 op names
+# | t2 op names
+#
+# op : == | !=
+# role_op : == | != | eq | dom | domby | incomp
+#
+# names : name | { name_list }
+# name_list : name | name_list name
+#
+
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/flask/Makefile Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,41 @@
+# flask needs to know where to export the libselinux headers.
+LIBSEL ?= ../../libselinux
+
+# flask needs to know where to export the kernel headers.
+LINUXDIR ?= ../../../linux-2.6
+
+AWK = awk
+
+CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \
+ else if [ -x /bin/bash ]; then echo /bin/bash; \
+ else echo sh; fi ; fi)
+
+FLASK_H_DEPEND = security_classes initial_sids
+AV_H_DEPEND = access_vectors
+
+FLASK_H_FILES = class_to_string.h flask.h initial_sid_to_string.h
+AV_H_FILES = av_inherit.h common_perm_to_string.h av_perm_to_string.h av_permissions.h
+ALL_H_FILES = $(FLASK_H_FILES) $(AV_H_FILES)
+
+all: $(ALL_H_FILES)
+
+$(FLASK_H_FILES): $(FLASK_H_DEPEND)
+ $(CONFIG_SHELL) mkflask.sh $(AWK) $(FLASK_H_DEPEND)
+
+$(AV_H_FILES): $(AV_H_DEPEND)
+ $(CONFIG_SHELL) mkaccess_vector.sh $(AWK) $(AV_H_DEPEND)
+
+tolib: all
+ install -m 644 flask.h av_permissions.h $(LIBSEL)/include/selinux
+ install -m 644 class_to_string.h av_inherit.h common_perm_to_string.h av_perm_to_string.h $(LIBSEL)/src
+
+tokern: all
+ install -m 644 $(ALL_H_FILES) $(LINUXDIR)/security/selinux/include
+
+install: all
+
+relabel:
+
+clean:
+ rm -f $(FLASK_H_FILES)
+ rm -f $(AV_H_FILES)
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/access_vectors
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/flask/access_vectors Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,166 @@
+#
+# Define common prefixes for access vectors
+#
+# common common_name { permission_name ... }
+
+#
+# Define a common prefix for file access vectors.
+#
+
+
+#
+# Define the access vectors.
+#
+# class class_name [ inherits common_name ] { permission_name ... }
+
+
+#
+# Define the access vector interpretation for file-related objects.
+#
+
+class xen
+{
+ scheduler
+ settime
+ tbufcontrol
+ readconsole
+ clearconsole
+ perfcontrol
+ mtrr_add
+ mtrr_del
+ mtrr_read
+ microcode
+ physinfo
+ quirk
+ writeconsole
+ readapic
+ writeapic
+ privprofile
+ nonprivprofile
+ kexec
+ firmware
+ sleep
+ frequency
+ getidle
+ debug
+ getcpuinfo
+ heap
+}
+
+class domain
+{
+ setvcpucontext
+ pause
+ unpause
+ resume
+ create
+ transition
+ max_vcpus
+ destroy
+ setvcpuaffinity
+ getvcpuaffinity
+ scheduler
+ getdomaininfo
+ getvcpuinfo
+ getvcpucontext
+ setdomainmaxmem
+ setdomainhandle
+ setdebugging
+ hypercall
+ settime
+ set_target
+ shutdown
+ setaddrsize
+ getaddrsize
+ trigger
+ getextvcpucontext
+ setextvcpucontext
+}
+
+class hvm
+{
+ sethvmc
+ gethvmc
+ setparam
+ getparam
+ pcilevel
+ irqlevel
+ pciroute
+ bind_irq
+ cacheattr
+}
+
+class event
+{
+ bind
+ send
+ status
+ notify
+ create
+ vector
+ reset
+}
+
+class grant
+{
+ map_read
+ map_write
+ unmap
+ transfer
+ setup
+ copy
+ query
+}
+
+class mmu
+{
+ map_read
+ map_write
+ pageinfo
+ pagelist
+ adjust
+ stat
+ translategp
+ updatemp
+ physmap
+ pinpage
+ mfnlist
+ memorymap
+}
+
+class shadow
+{
+ disable
+ enable
+ logdirty
+}
+
+class resource
+{
+ add
+ remove
+ use
+ add_irq
+ remove_irq
+ add_ioport
+ remove_ioport
+ add_iomem
+ remove_iomem
+ stat_device
+ add_device
+ remove_device
+}
+
+class security
+{
+ compute_av
+ compute_create
+ compute_member
+ check_context
+ load_policy
+ compute_relabel
+ compute_user
+ setenforce
+ setbool
+ setsecparam
+}
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/initial_sids
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/flask/initial_sids Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,17 @@
+# FLASK
+
+#
+# Define initial security identifiers
+#
+sid xen
+sid dom0
+sid domU
+sid domio
+sid domxen
+sid unlabeled
+sid security
+sid ioport
+sid iomem
+sid pirq
+sid device
+# FLASK
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/mkaccess_vector.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/flask/mkaccess_vector.sh Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,227 @@
+#!/bin/sh -
+#
+
+# FLASK
+
+set -e
+
+awk=$1
+shift
+
+# output files
+av_permissions="av_permissions.h"
+av_inherit="av_inherit.h"
+common_perm_to_string="common_perm_to_string.h"
+av_perm_to_string="av_perm_to_string.h"
+
+cat $* | $awk "
+BEGIN {
+ outfile = \"$av_permissions\"
+ inheritfile = \"$av_inherit\"
+ cpermfile = \"$common_perm_to_string\"
+ avpermfile = \"$av_perm_to_string\"
+ "'
+ nextstate = "COMMON_OR_AV";
+ printf("/* This file is automatically generated. Do not edit. */\n") > outfile;
+ printf("/* This file is automatically generated. Do not edit. */\n") > inheritfile;
+ printf("/* This file is automatically generated. Do not edit. */\n") > cpermfile;
+ printf("/* This file is automatically generated. Do not edit. */\n") > avpermfile;
+;
+ }
+/^[ \t]*#/ {
+ next;
+ }
+$1 == "common" {
+ if (nextstate != "COMMON_OR_AV")
+ {
+ printf("Parse error: Unexpected COMMON definition on line %d\n", NR);
+ next;
+ }
+
+ if ($2 in common_defined)
+ {
+ printf("Duplicate COMMON definition for %s on line %d.\n", $2, NR);
+ next;
+ }
+ common_defined[$2] = 1;
+
+ tclass = $2;
+ common_name = $2;
+ permission = 1;
+
+ printf("TB_(common_%s_perm_to_string)\n", $2) > cpermfile;
+
+ nextstate = "COMMON-OPENBRACKET";
+ next;
+ }
+$1 == "class" {
+ if (nextstate != "COMMON_OR_AV" &&
+ nextstate != "CLASS_OR_CLASS-OPENBRACKET")
+ {
+ printf("Parse error: Unexpected class definition on line %d\n", NR);
+ next;
+ }
+
+ tclass = $2;
+
+ if (tclass in av_defined)
+ {
+ printf("Duplicate access vector definition for %s on line %d\n", tclass, NR);
+ next;
+ }
+ av_defined[tclass] = 1;
+
+ inherits = "";
+ permission = 1;
+
+ nextstate = "INHERITS_OR_CLASS-OPENBRACKET";
+ next;
+ }
+$1 == "inherits" {
+ if (nextstate != "INHERITS_OR_CLASS-OPENBRACKET")
+ {
+ printf("Parse error: Unexpected INHERITS definition on line %d\n", NR);
+ next;
+ }
+
+ if (!($2 in common_defined))
+ {
+ printf("COMMON %s is not defined (line %d).\n", $2, NR);
+ next;
+ }
+
+ inherits = $2;
+ permission = common_base[$2];
+
+ for (combined in common_perms)
+ {
+ split(combined,separate, SUBSEP);
+ if (separate[1] == inherits)
+ {
+ inherited_perms[common_perms[combined]] = separate[2];
+ }
+ }
+
+ j = 1;
+ for (i in inherited_perms) {
+ ind[j] = i + 0;
+ j++;
+ }
+ n = asort(ind);
+ for (i = 1; i <= n; i++) {
+ perm = inherited_perms[ind[i]];
+ printf("#define %s__%s", toupper(tclass), toupper(perm)) > outfile;
+ spaces = 40 - (length(perm) + length(tclass));
+ if (spaces < 1)
+ spaces = 1;
+ for (j = 0; j < spaces; j++)
+ printf(" ") > outfile;
+ printf("0x%08xUL\n", ind[i]) > outfile;
+ }
+ printf("\n") > outfile;
+ for (i in ind) delete ind[i];
+ for (i in inherited_perms) delete inherited_perms[i];
+
+ printf(" S_(SECCLASS_%s, %s, 0x%08xUL)\n", toupper(tclass), inherits, permission) > inheritfile;
+
+ nextstate = "CLASS_OR_CLASS-OPENBRACKET";
+ next;
+ }
+$1 == "{" {
+ if (nextstate != "INHERITS_OR_CLASS-OPENBRACKET" &&
+ nextstate != "CLASS_OR_CLASS-OPENBRACKET" &&
+ nextstate != "COMMON-OPENBRACKET")
+ {
+ printf("Parse error: Unexpected { on line %d\n", NR);
+ next;
+ }
+
+ if (nextstate == "INHERITS_OR_CLASS-OPENBRACKET")
+ nextstate = "CLASS-CLOSEBRACKET";
+
+ if (nextstate == "CLASS_OR_CLASS-OPENBRACKET")
+ nextstate = "CLASS-CLOSEBRACKET";
+
+ if (nextstate == "COMMON-OPENBRACKET")
+ nextstate = "COMMON-CLOSEBRACKET";
+ }
+/[a-z][a-z_]*/ {
+ if (nextstate != "COMMON-CLOSEBRACKET" &&
+ nextstate != "CLASS-CLOSEBRACKET")
+ {
+ printf("Parse error: Unexpected symbol %s on line %d\n", $1, NR);
+ next;
+ }
+
+ if (nextstate == "COMMON-CLOSEBRACKET")
+ {
+ if ((common_name,$1) in common_perms)
+ {
+ printf("Duplicate permission %s for common %s on line %d.\n", $1, common_name, NR);
+ next;
+ }
+
+ common_perms[common_name,$1] = permission;
+
+ printf("#define COMMON_%s__%s", toupper(common_name), toupper($1)) > outfile;
+
+ printf(" S_(\"%s\")\n", $1) > cpermfile;
+ }
+ else
+ {
+ if ((tclass,$1) in av_perms)
+ {
+ printf("Duplicate permission %s for %s on line %d.\n", $1, tclass, NR);
+ next;
+ }
+
+ av_perms[tclass,$1] = permission;
+
+ if (inherits != "")
+ {
+ if ((inherits,$1) in common_perms)
+ {
+				printf("Permission %s in %s inheriting %s on line %d conflicts with common permission.\n", $1, tclass, inherits, NR);
+ next;
+ }
+ }
+
+ printf("#define %s__%s", toupper(tclass), toupper($1)) > outfile;
+
+ printf(" S_(SECCLASS_%s, %s__%s, \"%s\")\n", toupper(tclass), toupper(tclass), toupper($1), $1) > avpermfile;
+ }
+
+ spaces = 40 - (length($1) + length(tclass));
+ if (spaces < 1)
+ spaces = 1;
+
+ for (i = 0; i < spaces; i++)
+ printf(" ") > outfile;
+ printf("0x%08xUL\n", permission) > outfile;
+ permission = permission * 2;
+ }
+$1 == "}" {
+ if (nextstate != "CLASS-CLOSEBRACKET" &&
+ nextstate != "COMMON-CLOSEBRACKET")
+ {
+ printf("Parse error: Unexpected } on line %d\n", NR);
+ next;
+ }
+
+ if (nextstate == "COMMON-CLOSEBRACKET")
+ {
+ common_base[common_name] = permission;
+ printf("TE_(common_%s_perm_to_string)\n\n", common_name) > cpermfile;
+ }
+
+ printf("\n") > outfile;
+
+ nextstate = "COMMON_OR_AV";
+ }
+END {
+ if (nextstate != "COMMON_OR_AV" && nextstate != "CLASS_OR_CLASS-OPENBRACKET")
+ printf("Parse error: Unexpected end of file\n");
+
+ }'
+
+# FLASK
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/mkflask.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/flask/mkflask.sh Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,95 @@
+#!/bin/sh -
+#
+
+# FLASK
+
+set -e
+
+awk=$1
+shift 1
+
+# output file
+output_file="flask.h"
+debug_file="class_to_string.h"
+debug_file2="initial_sid_to_string.h"
+
+cat $* | $awk "
+BEGIN {
+ outfile = \"$output_file\"
+ debugfile = \"$debug_file\"
+ debugfile2 = \"$debug_file2\"
+ "'
+ nextstate = "CLASS";
+
+ printf("/* This file is automatically generated. Do not edit. */\n") > outfile;
+
+ printf("#ifndef _SELINUX_FLASK_H_\n") > outfile;
+ printf("#define _SELINUX_FLASK_H_\n") > outfile;
+ printf("\n/*\n * Security object class definitions\n */\n") > outfile;
+ printf("/* This file is automatically generated. Do not edit. */\n") > debugfile;
+ printf("/*\n * Security object class definitions\n */\n") > debugfile;
+ printf(" S_(\"null\")\n") > debugfile;
+ printf("/* This file is automatically generated. Do not edit. */\n") > debugfile2;
+ printf("static char *initial_sid_to_string[] =\n{\n") > debugfile2;
+ printf(" \"null\",\n") > debugfile2;
+ }
+/^[ \t]*#/ {
+ next;
+ }
+$1 == "class" {
+ if (nextstate != "CLASS")
+ {
+ printf("Parse error: Unexpected class definition on line %d\n", NR);
+ next;
+ }
+
+ if ($2 in class_found)
+ {
+ printf("Duplicate class definition for %s on line %d.\n", $2, NR);
+ next;
+ }
+ class_found[$2] = 1;
+
+ class_value++;
+
+ printf("#define SECCLASS_%s", toupper($2)) > outfile;
+ for (i = 0; i < 40 - length($2); i++)
+ printf(" ") > outfile;
+ printf("%d\n", class_value) > outfile;
+
+ printf(" S_(\"%s\")\n", $2) > debugfile;
+ }
+$1 == "sid" {
+ if (nextstate == "CLASS")
+ {
+ nextstate = "SID";
+ printf("\n/*\n * Security identifier indices for initial entities\n */\n") > outfile;
+ }
+
+ if ($2 in sid_found)
+ {
+ printf("Duplicate SID definition for %s on line %d.\n", $2, NR);
+ next;
+ }
+ sid_found[$2] = 1;
+ sid_value++;
+
+ printf("#define SECINITSID_%s", toupper($2)) > outfile;
+ for (i = 0; i < 37 - length($2); i++)
+ printf(" ") > outfile;
+ printf("%d\n", sid_value) > outfile;
+ printf(" \"%s\",\n", $2) > debugfile2;
+ }
+END {
+ if (nextstate != "SID")
+ printf("Parse error: Unexpected end of file\n");
+
+ printf("\n#define SECINITSID_NUM") > outfile;
+ for (i = 0; i < 34; i++)
+ printf(" ") > outfile;
+ printf("%d\n", sid_value) > outfile;
+ printf("\n#endif\n") > outfile;
+ printf("};\n\n") > debugfile2;
+ }'
+
+# FLASK
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/security_classes
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/flask/security_classes Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,20 @@
+# FLASK
+
+#
+# Define the security object classes
+#
+
+# Classes marked as userspace are classes
+# for userspace object managers
+
+class xen
+class domain
+class hvm
+class mmu
+class resource
+class shadow
+class event
+class grant
+class security
+
+# FLASK
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/global_booleans
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/global_booleans Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,5 @@
+#
+# This file is for the declaration of global booleans.
+# To change the default value at build time, the booleans.conf
+# file should be used.
+#
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/global_tunables
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/global_tunables Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,6 @@
+#
+# This file is for the declaration of global tunables.
+# To change the default value at build time, the booleans.conf
+# file should be used.
+#
+
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/mcs
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/mcs Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,324 @@
+ifdef(`enable_mcs',`
+#
+# Define sensitivities
+#
+# Each sensitivity has a name and zero or more aliases.
+#
+# MCS is single-sensitivity.
+#
+sensitivity s0;
+
+#
+# Define the ordering of the sensitivity levels (least to greatest)
+#
+dominance { s0 }
+
+
+#
+# Define the categories
+#
+# Each category has a name and zero or more aliases.
+#
+category c0;
+category c1;
+category c2;
+category c3;
+category c4;
+category c5;
+category c6;
+category c7;
+category c8;
+category c9;
+category c10;
+category c11;
+category c12;
+category c13;
+category c14;
+category c15;
+category c16;
+category c17;
+category c18;
+category c19;
+category c20;
+category c21;
+category c22;
+category c23;
+category c24;
+category c25;
+category c26;
+category c27;
+category c28;
+category c29;
+category c30;
+category c31;
+category c32;
+category c33;
+category c34;
+category c35;
+category c36;
+category c37;
+category c38;
+category c39;
+category c40;
+category c41;
+category c42;
+category c43;
+category c44;
+category c45;
+category c46;
+category c47;
+category c48;
+category c49;
+category c50;
+category c51;
+category c52;
+category c53;
+category c54;
+category c55;
+category c56;
+category c57;
+category c58;
+category c59;
+category c60;
+category c61;
+category c62;
+category c63;
+category c64;
+category c65;
+category c66;
+category c67;
+category c68;
+category c69;
+category c70;
+category c71;
+category c72;
+category c73;
+category c74;
+category c75;
+category c76;
+category c77;
+category c78;
+category c79;
+category c80;
+category c81;
+category c82;
+category c83;
+category c84;
+category c85;
+category c86;
+category c87;
+category c88;
+category c89;
+category c90;
+category c91;
+category c92;
+category c93;
+category c94;
+category c95;
+category c96;
+category c97;
+category c98;
+category c99;
+category c100;
+category c101;
+category c102;
+category c103;
+category c104;
+category c105;
+category c106;
+category c107;
+category c108;
+category c109;
+category c110;
+category c111;
+category c112;
+category c113;
+category c114;
+category c115;
+category c116;
+category c117;
+category c118;
+category c119;
+category c120;
+category c121;
+category c122;
+category c123;
+category c124;
+category c125;
+category c126;
+category c127;
+category c128;
+category c129;
+category c130;
+category c131;
+category c132;
+category c133;
+category c134;
+category c135;
+category c136;
+category c137;
+category c138;
+category c139;
+category c140;
+category c141;
+category c142;
+category c143;
+category c144;
+category c145;
+category c146;
+category c147;
+category c148;
+category c149;
+category c150;
+category c151;
+category c152;
+category c153;
+category c154;
+category c155;
+category c156;
+category c157;
+category c158;
+category c159;
+category c160;
+category c161;
+category c162;
+category c163;
+category c164;
+category c165;
+category c166;
+category c167;
+category c168;
+category c169;
+category c170;
+category c171;
+category c172;
+category c173;
+category c174;
+category c175;
+category c176;
+category c177;
+category c178;
+category c179;
+category c180;
+category c181;
+category c182;
+category c183;
+category c184;
+category c185;
+category c186;
+category c187;
+category c188;
+category c189;
+category c190;
+category c191;
+category c192;
+category c193;
+category c194;
+category c195;
+category c196;
+category c197;
+category c198;
+category c199;
+category c200;
+category c201;
+category c202;
+category c203;
+category c204;
+category c205;
+category c206;
+category c207;
+category c208;
+category c209;
+category c210;
+category c211;
+category c212;
+category c213;
+category c214;
+category c215;
+category c216;
+category c217;
+category c218;
+category c219;
+category c220;
+category c221;
+category c222;
+category c223;
+category c224;
+category c225;
+category c226;
+category c227;
+category c228;
+category c229;
+category c230;
+category c231;
+category c232;
+category c233;
+category c234;
+category c235;
+category c236;
+category c237;
+category c238;
+category c239;
+category c240;
+category c241;
+category c242;
+category c243;
+category c244;
+category c245;
+category c246;
+category c247;
+category c248;
+category c249;
+category c250;
+category c251;
+category c252;
+category c253;
+category c254;
+category c255;
+
+
+#
+# Each MCS level specifies a sensitivity and zero or more categories which may
+# be associated with that sensitivity.
+#
+level s0:c0.c255;
+
+#
+# Define the MCS policy
+#
+# mlsconstrain class_set perm_set expression ;
+#
+# mlsvalidatetrans class_set expression ;
+#
+# expression : ( expression )
+# | not expression
+# | expression and expression
+# | expression or expression
+# | u1 op u2
+# | r1 role_mls_op r2
+# | t1 op t2
+# | l1 role_mls_op l2
+# | l1 role_mls_op h2
+# | h1 role_mls_op l2
+# | h1 role_mls_op h2
+# | l1 role_mls_op h1
+# | l2 role_mls_op h2
+# | u1 op names
+# | u2 op names
+# | r1 op names
+# | r2 op names
+# | t1 op names
+# | t2 op names
+# | u3 op names (NOTE: this is only available for mlsvalidatetrans)
+# | r3 op names (NOTE: this is only available for mlsvalidatetrans)
+# | t3 op names (NOTE: this is only available for mlsvalidatetrans)
+#
+# op : == | !=
+# role_mls_op : == | != | eq | dom | domby | incomp
+#
+# names : name | { name_list }
+# name_list : name | name_list name
+#
+
+
+') dnl end enable_mcs
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/mls
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/mls Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,354 @@
+
+ifdef(`enable_mls',`
+#
+# Define sensitivities
+#
+# Each sensitivity has a name and zero or more aliases.
+#
+sensitivity s0;
+sensitivity s1;
+sensitivity s2;
+sensitivity s3;
+sensitivity s4;
+sensitivity s5;
+sensitivity s6;
+sensitivity s7;
+sensitivity s8;
+sensitivity s9;
+sensitivity s10;
+sensitivity s11;
+sensitivity s12;
+sensitivity s13;
+sensitivity s14;
+sensitivity s15;
+
+#
+# Define the ordering of the sensitivity levels (least to greatest)
+#
+dominance { s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 }
+
+
+#
+# Define the categories
+#
+# Each category has a name and zero or more aliases.
+#
+category c0;
+category c1;
+category c2;
+category c3;
+category c4;
+category c5;
+category c6;
+category c7;
+category c8;
+category c9;
+category c10;
+category c11;
+category c12;
+category c13;
+category c14;
+category c15;
+category c16;
+category c17;
+category c18;
+category c19;
+category c20;
+category c21;
+category c22;
+category c23;
+category c24;
+category c25;
+category c26;
+category c27;
+category c28;
+category c29;
+category c30;
+category c31;
+category c32;
+category c33;
+category c34;
+category c35;
+category c36;
+category c37;
+category c38;
+category c39;
+category c40;
+category c41;
+category c42;
+category c43;
+category c44;
+category c45;
+category c46;
+category c47;
+category c48;
+category c49;
+category c50;
+category c51;
+category c52;
+category c53;
+category c54;
+category c55;
+category c56;
+category c57;
+category c58;
+category c59;
+category c60;
+category c61;
+category c62;
+category c63;
+category c64;
+category c65;
+category c66;
+category c67;
+category c68;
+category c69;
+category c70;
+category c71;
+category c72;
+category c73;
+category c74;
+category c75;
+category c76;
+category c77;
+category c78;
+category c79;
+category c80;
+category c81;
+category c82;
+category c83;
+category c84;
+category c85;
+category c86;
+category c87;
+category c88;
+category c89;
+category c90;
+category c91;
+category c92;
+category c93;
+category c94;
+category c95;
+category c96;
+category c97;
+category c98;
+category c99;
+category c100;
+category c101;
+category c102;
+category c103;
+category c104;
+category c105;
+category c106;
+category c107;
+category c108;
+category c109;
+category c110;
+category c111;
+category c112;
+category c113;
+category c114;
+category c115;
+category c116;
+category c117;
+category c118;
+category c119;
+category c120;
+category c121;
+category c122;
+category c123;
+category c124;
+category c125;
+category c126;
+category c127;
+category c128;
+category c129;
+category c130;
+category c131;
+category c132;
+category c133;
+category c134;
+category c135;
+category c136;
+category c137;
+category c138;
+category c139;
+category c140;
+category c141;
+category c142;
+category c143;
+category c144;
+category c145;
+category c146;
+category c147;
+category c148;
+category c149;
+category c150;
+category c151;
+category c152;
+category c153;
+category c154;
+category c155;
+category c156;
+category c157;
+category c158;
+category c159;
+category c160;
+category c161;
+category c162;
+category c163;
+category c164;
+category c165;
+category c166;
+category c167;
+category c168;
+category c169;
+category c170;
+category c171;
+category c172;
+category c173;
+category c174;
+category c175;
+category c176;
+category c177;
+category c178;
+category c179;
+category c180;
+category c181;
+category c182;
+category c183;
+category c184;
+category c185;
+category c186;
+category c187;
+category c188;
+category c189;
+category c190;
+category c191;
+category c192;
+category c193;
+category c194;
+category c195;
+category c196;
+category c197;
+category c198;
+category c199;
+category c200;
+category c201;
+category c202;
+category c203;
+category c204;
+category c205;
+category c206;
+category c207;
+category c208;
+category c209;
+category c210;
+category c211;
+category c212;
+category c213;
+category c214;
+category c215;
+category c216;
+category c217;
+category c218;
+category c219;
+category c220;
+category c221;
+category c222;
+category c223;
+category c224;
+category c225;
+category c226;
+category c227;
+category c228;
+category c229;
+category c230;
+category c231;
+category c232;
+category c233;
+category c234;
+category c235;
+category c236;
+category c237;
+category c238;
+category c239;
+category c240;
+category c241;
+category c242;
+category c243;
+category c244;
+category c245;
+category c246;
+category c247;
+category c248;
+category c249;
+category c250;
+category c251;
+category c252;
+category c253;
+category c254;
+category c255;
+
+
+#
+# Each MLS level specifies a sensitivity and zero or more categories which may
+# be associated with that sensitivity.
+#
+level s0:c0.c255;
+level s1:c0.c255;
+level s2:c0.c255;
+level s3:c0.c255;
+level s4:c0.c255;
+level s5:c0.c255;
+level s6:c0.c255;
+level s7:c0.c255;
+level s8:c0.c255;
+level s9:c0.c255;
+level s10:c0.c255;
+level s11:c0.c255;
+level s12:c0.c255;
+level s13:c0.c255;
+level s14:c0.c255;
+level s15:c0.c255;
+
+
+#
+# Define the MLS policy
+#
+# mlsconstrain class_set perm_set expression ;
+#
+# mlsvalidatetrans class_set expression ;
+#
+# expression : ( expression )
+# | not expression
+# | expression and expression
+# | expression or expression
+# | u1 op u2
+# | r1 role_mls_op r2
+# | t1 op t2
+# | l1 role_mls_op l2
+# | l1 role_mls_op h2
+# | h1 role_mls_op l2
+# | h1 role_mls_op h2
+# | l1 role_mls_op h1
+# | l2 role_mls_op h2
+# | u1 op names
+# | u2 op names
+# | r1 op names
+# | r2 op names
+# | t1 op names
+# | t2 op names
+# | u3 op names (NOTE: this is only available for mlsvalidatetrans)
+# | r3 op names (NOTE: this is only available for mlsvalidatetrans)
+# | t3 op names (NOTE: this is only available for mlsvalidatetrans)
+#
+# op : == | !=
+# role_mls_op : == | != | eq | dom | domby | incomp
+#
+# names : name | { name_list }
+# name_list : name | name_list name
+#
+
+
+') dnl end enable_mls
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/modules.conf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/modules.conf Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,21 @@
+#
+# This file contains a listing of available modules.
+# To prevent a module from being used in policy
+# creation, set the module name to "off".
+#
+# For monolithic policies, modules set to "base" and "module"
+# will be built into the policy.
+#
+# For modular policies, modules set to "base" will be
+# included in the base module. "module" will be compiled
+# as individual loadable modules.
+#
+
+# Layer: xen
+# Module: xen
+# Required in base
+#
+# Policy for xen.
+#
+xen = base
+
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/modules/xen/xen.if
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/modules/xen/xen.if Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,1 @@
+#
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/modules/xen/xen.te
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/modules/xen/xen.te Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,135 @@
+attribute xen_type;
+attribute domain_type;
+attribute resource_type;
+attribute event_type;
+
+type xen_t, xen_type, domain_type;
+
+type dom0_t, domain_type;
+
+type domio_t, domain_type;
+
+type domxen_t, domain_type;
+
+type unlabeled_t, domain_type;
+
+type security_t, domain_type;
+
+type pirq_t, resource_type;
+type ioport_t, resource_type;
+type iomem_t, resource_type;
+type device_t, resource_type;
+
+################################################################################
+#
+# create_domain(priv_dom, domain, channel)
+#
+################################################################################
+define(`create_domain', `
+ type $2, domain_type;
+ allow $1 $2:domain {create max_vcpus setdomainmaxmem
+ setaddrsize getdomaininfo hypercall
+ setvcpucontext scheduler unpause
+ getvcpuinfo getaddrsize getvcpuaffinity};
+ allow $1 $2:shadow {enable};
+ allow $1 $2:mmu {map_read map_write memorymap adjust pinpage};
+ allow $2 $2:mmu {map_read map_write pinpage};
+ allow $2 domio_t:mmu {map_read};
+ allow $2 $2:grant {query setup};
+ allow $1 $2:grant {map_read unmap};
+ allow $1 $3:event {create};
+')
+
+################################################################################
+#
+# manage_domain(priv_dom, domain)
+#
+################################################################################
+define(`manage_domain', `
+ allow $1 $2:domain {pause destroy};
+')
+
+################################################################################
+#
+# create_channel(caller, peer, channel)
+#
+################################################################################
+define(`create_channel', `
+ type $3, event_type;
+ type_transition $1 $2:event $3;
+ allow $1 $3:event {create};
+ allow $3 $2:event {bind};
+')
+
+################################################################################
+#
+# Boot the hypervisor and dom0
+#
+################################################################################
+allow dom0_t xen_t:xen {kexec readapic writeapic mtrr_read mtrr_add mtrr_del
+scheduler physinfo heap quirk readconsole writeconsole settime microcode};
+
+allow dom0_t domio_t:mmu {map_read map_write};
+allow dom0_t iomem_t:mmu {map_read map_write};
+allow dom0_t pirq_t:event {vector};
+allow dom0_t xen_t:mmu {memorymap};
+
+allow dom0_t dom0_t:mmu {pinpage map_read map_write adjust};
+allow dom0_t dom0_t:grant {query setup};
+allow dom0_t dom0_t:domain {scheduler getdomaininfo getvcpuinfo getvcpuaffinity};
+
+allow xen_t dom0_t:domain {create};
+allow xen_t dom0_t:resource {add remove};
+allow xen_t ioport_t:resource {add_ioport remove_ioport};
+allow dom0_t ioport_t:resource {use};
+allow xen_t iomem_t:resource {add_iomem remove_iomem};
+allow dom0_t iomem_t:resource {use};
+allow xen_t pirq_t:resource {add_irq remove_irq};
+allow dom0_t pirq_t:resource {use};
+
+allow dom0_t security_t:security {compute_av compute_create compute_member
+check_context load_policy compute_relabel compute_user setenforce setbool
+setsecparam};
+
+create_channel(dom0_t, dom0_t, evchn0-0_t)
+allow dom0_t evchn0-0_t:event {send};
+
+################################################################################
+#
+# Create and manage a domU w/ dom0 IO
+#
+################################################################################
+create_domain(dom0_t, domU_t, evchnU-0_t)
+
+create_channel(domU_t, domU_t, evchnU-U_t)
+allow domU_t evchnU-U_t:event {send};
+
+create_channel(dom0_t, domU_t, evchn0-U_t)
+allow dom0_t evchn0-U_t:event {send};
+
+create_channel(domU_t, dom0_t, evchnU-0_t)
+allow domU_t evchnU-0_t:event {send};
+
+manage_domain(dom0_t, domU_t)
+
+################################################################################
+#
+#
+#
+################################################################################
+sid xen gen_context(system_u:system_r:xen_t,s0)
+sid dom0 gen_context(system_u:system_r:dom0_t,s0)
+sid domU gen_context(system_u:system_r:domU_t,s0)
+sid domxen gen_context(system_u:system_r:domxen_t,s0)
+sid domio gen_context(system_u:system_r:domio_t,s0)
+sid unlabeled gen_context(system_u:system_r:unlabeled_t,s0)
+sid security gen_context(system_u:system_r:security_t,s0)
+sid pirq gen_context(system_u:object_r:pirq_t,s0)
+sid iomem gen_context(system_u:object_r:iomem_t,s0)
+sid ioport gen_context(system_u:object_r:ioport_t,s0)
+sid device gen_context(system_u:object_r:device_t,s0)
+
+role system_r types { xen_type domain_type };
+role user_r types { xen_type domain_type };
+role sysadm_r types { xen_type domain_type };
+role staff_r types { xen_type domain_type };
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/support/loadable_module.spt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/support/loadable_module.spt Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,166 @@
+########################################
+#
+# Macros for switching between source policy
+# and loadable policy module support
+#
+
+##############################
+#
+# For adding the module statement
+#
+define(`policy_module',`
+ ifdef(`self_contained_policy',`',`
+ module $1 $2;
+
+ require {
+ role system_r;
+ all_kernel_class_perms
+ }
+ ')
+')
+
+##############################
+#
+# For use in interfaces, to optionally insert a require block
+#
+define(`gen_require',`
+ ifdef(`self_contained_policy',`',`
+ define(`in_gen_require_block')
+ require {
+ $1
+ }
+ undefine(`in_gen_require_block')
+ ')
+')
+
+##############################
+#
+# In the future interfaces should be in loadable modules
+#
+# template(name,rules)
+#
+define(`template',`
+ `define(`$1',`
+##### begin $1(dollarsstar)
+ $2
+##### end $1(dollarsstar)
+ '')
+')
+
+# helper function, since m4 wont expand macros
+# if a line is a comment (#):
+define(`policy_m4_comment',`dnl
+##### $2 depth: $1
+')dnl
+
+##############################
+#
+# In the future interfaces should be in loadable modules
+#
+# interface(name,rules)
+#
+define(`interface',`
+ `define(`$1',`
+
+ define(`policy_temp',incr(policy_call_depth))
+ pushdef(`policy_call_depth',policy_temp)
+ undefine(`policy_temp')
+
+ policy_m4_comment(policy_call_depth,begin `$1'(dollarsstar))
+
+ $2
+
+ define(`policy_temp',decr(policy_call_depth))
+ pushdef(`policy_call_depth',policy_temp)
+ undefine(`policy_temp')
+
+ policy_m4_comment(policy_call_depth,end `$1'(dollarsstar))
+
+ '')
+')
+
+define(`policy_call_depth',0)
+
+##############################
+#
+# Optional policy handling
+#
+define(`optional_policy',`
+ ifdef(`self_contained_policy',`
+ ifdef(`$1',`$2',`$3')
+ ',`
+ optional {
+ $2
+ ifelse(`$3',`',`',`
+ } else {
+ $3
+ ')
+ }
+ ')
+')
+
+##############################
+#
+# Determine if we should use the default
+# tunable value as specified by the policy
+# or if the override value should be used
+#
+define(`dflt_or_overr',`ifdef(`$1',$1,$2)')
+
+##############################
+#
+# Extract booleans out of an expression.
+# This needs to be reworked so expressions
+# with parentheses can work.
+
+define(`delcare_required_symbols',`
+ifelse(regexp($1, `\w'), -1, `', `dnl
+bool regexp($1, `\(\w+\)', `\1');
+delcare_required_symbols(regexp($1, `\w+\(.*\)', `\1'))dnl
+') dnl
+')
+
+##############################
+#
+# Tunable declaration
+#
+define(`gen_tunable',`
+ ifdef(`self_contained_policy',`
+ bool $1 dflt_or_overr(`$1'_conf,$2);
+ ',`
+ # loadable module tunable
+ # declaration will go here
+ # instead of bool when
+ # loadable modules support
+ # tunables
+ bool $1 dflt_or_overr(`$1'_conf,$2);
+ ')
+')
+
+##############################
+#
+# Tunable policy handling
+#
+define(`tunable_policy',`
+ ifdef(`self_contained_policy',`
+ if (`$1') {
+ $2
+ } else {
+ $3
+ }
+ ',`
+ # structure for tunables
+ # will go here instead of a
+ # conditional when loadable
+ # modules support tunables
+ gen_require(`
+ delcare_required_symbols(`$1')
+ ')
+
+ if (`$1') {
+ $2
+ } else {
+ $3
+ }
+ ')
+')
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/support/misc_macros.spt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/support/misc_macros.spt Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,32 @@
+
+########################################
+#
+# Helper macros
+#
+
+#
+# shiftn(num,list...)
+#
+# shift the list num times
+#
+define(`shiftn',`ifelse($1,0,`shift($*)',`shiftn(decr($1),shift(shift($*)))')')
+
+########################################
+#
+# gen_user(username, role_set, mls_defaultlevel, mls_range, [mcs_categories])
+#
+define(`gen_user',`user $1 roles { $2 }`'ifdef(`enable_mls', ` level $3 range $4')`'ifdef(`enable_mcs',` level s0 range s0`'ifelse(`$5',,,` - s0:$5')');')
+
+########################################
+#
+# gen_context(context,mls_sensitivity,[mcs_categories])
+#
+define(`gen_context',`$1`'ifdef(`enable_mls',`:$2')`'ifdef(`enable_mcs',`:s0`'ifelse(`$3',,,`:$3')')') dnl
+
+########################################
+#
+# gen_bool(name,default_value)
+#
+define(`gen_bool',`
+ bool $1 dflt_or_overr(`$1'_conf,$2);
+')
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/systemuser
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/systemuser Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,19 @@
+##################################
+#
+# System User configuration.
+#
+
+#
+# gen_user(username, role_set, mls_defaultlevel, mls_range, [mcs_categories])
+#
+
+#
+# system_u is the user identity for system processes and objects.
+# There should be no corresponding Unix user identity for system,
+# and a user process should never be assigned the system user
+# identity.
+#
+gen_user(system_u, system_r, s0, s0 - s9:c0.c127, c0.c127)
+
+# Normal users should not be added to this file,
+# but instead added to the users file.
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/users
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/users Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,39 @@
+
+##################################
+#
+# Core User configuration.
+#
+
+#
+# gen_user(username, role_set, mls_defaultlevel, mls_range, [mcs_categories])
+#
+
+#
+# user_u is a generic user identity for Linux users who have no
+# SELinux user identity defined. The modified daemons will use
+# this user identity in the security context if there is no matching
+# SELinux user identity for a Linux user. If you do not want to
+# permit any access to such users, then remove this entry.
+#
+ifdef(`targeted_policy',`
+gen_user(user_u, user_r sysadm_r system_r, s0, s0 - s9:c0.c127)
+',`
+gen_user(user_u, user_r, s0, s0 - s9:c0.c127)
+')
+
+#
+# The following users correspond to Unix identities.
+# These identities are typically assigned as the user attribute
+# when login starts the user shell. Users with access to the sysadm_r
+# role should use the staff_r role instead of the user_r role when
+# not in the sysadm_r.
+#
+ifdef(`targeted_policy',`
+ gen_user(root, user_r sysadm_r system_r, s0, s0 - s9:c0.c127, c0.c127)
+',`
+ ifdef(`direct_sysadm_daemon',`
+ gen_user(root, sysadm_r staff_r system_r, s0, s0 - s9:c0.c127, c0.c127)
+ ',`
+ gen_user(root, sysadm_r staff_r, s0, s0 - s9:c0.c127, c0.c127)
+ ')
+')
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/cirrus_vga.c
--- a/tools/ioemu/hw/cirrus_vga.c Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/hw/cirrus_vga.c Fri Sep 12 14:47:40 2008 +0900
@@ -2554,6 +2554,9 @@ static void set_vram_mapping(CirrusVGASt

fprintf(logfile,"mapping vram to %lx - %lx\n", begin, end);

+ if (!s->vram_mfns)
+ return;
+
xatp.domid = domid;
xatp.space = XENMAPSPACE_mfn;

diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/pass-through.c
--- a/tools/ioemu/hw/pass-through.c Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/hw/pass-through.c Fri Sep 12 14:47:40 2008 +0900
@@ -57,6 +57,10 @@ static uint32_t pt_irqpin_reg_init(struc
struct pt_reg_info_tbl *reg, uint32_t real_offset);
static uint32_t pt_bar_reg_init(struct pt_dev *ptdev,
struct pt_reg_info_tbl *reg, uint32_t real_offset);
+static uint32_t pt_linkctrl_reg_init(struct pt_dev *ptdev,
+ struct pt_reg_info_tbl *reg, uint32_t real_offset);
+static uint32_t pt_devctrl2_reg_init(struct pt_dev *ptdev,
+ struct pt_reg_info_tbl *reg, uint32_t real_offset);
static uint32_t pt_linkctrl2_reg_init(struct pt_dev *ptdev,
struct pt_reg_info_tbl *reg, uint32_t real_offset);
static uint32_t pt_msgctrl_reg_init(struct pt_dev *ptdev,
@@ -76,6 +80,8 @@ static uint8_t pt_msix_size_init(struct
static uint8_t pt_msix_size_init(struct pt_dev *ptdev,
struct pt_reg_grp_info_tbl *grp_reg, uint32_t base_offset);
static uint8_t pt_vendor_size_init(struct pt_dev *ptdev,
+ struct pt_reg_grp_info_tbl *grp_reg, uint32_t base_offset);
+static uint8_t pt_pcie_size_init(struct pt_dev *ptdev,
struct pt_reg_grp_info_tbl *grp_reg, uint32_t base_offset);
static int pt_byte_reg_read(struct pt_dev *ptdev,
struct pt_reg_tbl *cfg_entry,
@@ -438,7 +444,7 @@ static struct pt_reg_info_tbl pt_emu_reg
.init_val = 0x0000,
.ro_mask = 0x0000,
.emu_mask = 0xFFFF,
- .init = pt_common_reg_init,
+ .init = pt_linkctrl_reg_init,
.u.w.read = pt_word_reg_read,
.u.w.write = pt_linkctrl_reg_write,
},
@@ -449,7 +455,7 @@ static struct pt_reg_info_tbl pt_emu_reg
.init_val = 0x0000,
.ro_mask = 0x0000,
.emu_mask = 0xFFFF,
- .init = pt_common_reg_init,
+ .init = pt_devctrl2_reg_init,
.u.w.read = pt_word_reg_read,
.u.w.write = pt_devctrl2_reg_write,
},
@@ -666,8 +672,8 @@ static const struct pt_reg_grp_info_tbl
{
.grp_id = PCI_CAP_ID_EXP,
.grp_type = GRP_TYPE_EMU,
- .grp_size = 0x3C,
- .size_init = pt_reg_grp_size_init,
+ .grp_size = 0xFF,
+ .size_init = pt_pcie_size_init,
.emu_reg_tbl= pt_emu_reg_pcie_tbl,
},
/* MSI-X Capability Structure reg group */
@@ -1869,12 +1875,57 @@ static uint32_t pt_bar_reg_init(struct p
return reg_field;
}

+/* initialize Link Control register */
+static uint32_t pt_linkctrl_reg_init(struct pt_dev *ptdev,
+ struct pt_reg_info_tbl *reg, uint32_t real_offset)
+{
+ uint8_t cap_ver = 0;
+ uint8_t dev_type = 0;
+
+ cap_ver = (ptdev->dev.config[(real_offset - reg->offset) + PCI_EXP_FLAGS] &
+ (uint8_t)PCI_EXP_FLAGS_VERS);
+ dev_type = (ptdev->dev.config[(real_offset - reg->offset) + PCI_EXP_FLAGS] &
+ (uint8_t)PCI_EXP_FLAGS_TYPE) >> 4;
+
+ /* no need to initialize in case of Root Complex Integrated Endpoint
+ * with cap_ver 1.x
+ */
+ if ((dev_type == PCI_EXP_TYPE_ROOT_INT_EP) && (cap_ver == 1))
+ return PT_INVALID_REG;
+
+ return reg->init_val;
+}
+
+/* initialize Device Control 2 register */
+static uint32_t pt_devctrl2_reg_init(struct pt_dev *ptdev,
+ struct pt_reg_info_tbl *reg, uint32_t real_offset)
+{
+ uint8_t cap_ver = 0;
+
+ cap_ver = (ptdev->dev.config[(real_offset - reg->offset) + PCI_EXP_FLAGS] &
+ (uint8_t)PCI_EXP_FLAGS_VERS);
+
+ /* no need to initialize in case of cap_ver 1.x */
+ if (cap_ver == 1)
+ return PT_INVALID_REG;
+
+ return reg->init_val;
+}
+
/* initialize Link Control 2 register */
static uint32_t pt_linkctrl2_reg_init(struct pt_dev *ptdev,
struct pt_reg_info_tbl *reg, uint32_t real_offset)
{
int reg_field = 0;
-
+ uint8_t cap_ver = 0;
+
+ cap_ver = (ptdev->dev.config[(real_offset - reg->offset) + PCI_EXP_FLAGS] &
+ (uint8_t)PCI_EXP_FLAGS_VERS);
+
+ /* no need to initialize in case of cap_ver 1.x */
+ if (cap_ver == 1)
+ return PT_INVALID_REG;
+
/* set Supported Link Speed */
reg_field |=
(0x0F &
@@ -2034,6 +2085,91 @@ static uint8_t pt_vendor_size_init(struc
struct pt_reg_grp_info_tbl *grp_reg, uint32_t base_offset)
{
return ptdev->dev.config[base_offset + 0x02];
+}
+
+/* get PCI Express Capability Structure register group size */
+static uint8_t pt_pcie_size_init(struct pt_dev *ptdev,
+ struct pt_reg_grp_info_tbl *grp_reg, uint32_t base_offset)
+{
+ PCIDevice *d = &ptdev->dev;
+ uint16_t exp_flag = 0;
+ uint16_t type = 0;
+ uint16_t vers = 0;
+ uint8_t pcie_size = 0;
+
+ exp_flag = *((uint16_t*)(d->config + (base_offset + PCI_EXP_FLAGS)));
+ type = (exp_flag & PCI_EXP_FLAGS_TYPE) >> 4;
+ vers = (exp_flag & PCI_EXP_FLAGS_VERS);
+
+    /* calculate size depending on capability version and device/port type */
+ /* in case of PCI Express Base Specification Rev 1.x */
+ if (vers == 1)
+ {
+ /* The PCI Express Capabilities, Device Capabilities, and Device
+ * Status/Control registers are required for all PCI Express devices.
+ * The Link Capabilities and Link Status/Control are required for all
+ * Endpoints that are not Root Complex Integrated Endpoints. Endpoints
+ * are not required to implement registers other than those listed
+ * above and terminate the capability structure.
+ */
+ switch (type) {
+ case PCI_EXP_TYPE_ENDPOINT:
+ case PCI_EXP_TYPE_LEG_END:
+ pcie_size = 0x14;
+ break;
+ case PCI_EXP_TYPE_ROOT_INT_EP:
+ /* has no link */
+ pcie_size = 0x0C;
+ break;
+ /* only EndPoint passthrough is supported */
+ case PCI_EXP_TYPE_ROOT_PORT:
+ case PCI_EXP_TYPE_UPSTREAM:
+ case PCI_EXP_TYPE_DOWNSTREAM:
+ case PCI_EXP_TYPE_PCI_BRIDGE:
+ case PCI_EXP_TYPE_PCIE_BRIDGE:
+ case PCI_EXP_TYPE_ROOT_EC:
+ default:
+ /* exit I/O emulator */
+ PT_LOG("Internal error: Unsupported device/port type[%d]. "
+ "I/O emulator exit.\n", type);
+ exit(1);
+ }
+ }
+ /* in case of PCI Express Base Specification Rev 2.0 */
+ else if (vers == 2)
+ {
+ switch (type) {
+ case PCI_EXP_TYPE_ENDPOINT:
+ case PCI_EXP_TYPE_LEG_END:
+ case PCI_EXP_TYPE_ROOT_INT_EP:
+ /* For Functions that do not implement the registers,
+ * these spaces must be hardwired to 0b.
+ */
+ pcie_size = 0x3C;
+ break;
+ /* only EndPoint passthrough is supported */
+ case PCI_EXP_TYPE_ROOT_PORT:
+ case PCI_EXP_TYPE_UPSTREAM:
+ case PCI_EXP_TYPE_DOWNSTREAM:
+ case PCI_EXP_TYPE_PCI_BRIDGE:
+ case PCI_EXP_TYPE_PCIE_BRIDGE:
+ case PCI_EXP_TYPE_ROOT_EC:
+ default:
+ /* exit I/O emulator */
+ PT_LOG("Internal error: Unsupported device/port type[%d]. "
+ "I/O emulator exit.\n", type);
+ exit(1);
+ }
+ }
+ else
+ {
+ /* exit I/O emulator */
+ PT_LOG("Internal error: Unsupported capability version[%d]. "
+ "I/O emulator exit.\n", vers);
+ exit(1);
+ }
+
+ return pcie_size;
}

/* read byte size emulate register */
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/pass-through.h
--- a/tools/ioemu/hw/pass-through.h Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/hw/pass-through.h Fri Sep 12 14:47:40 2008 +0900
@@ -60,6 +60,21 @@
#ifndef PCI_MSI_FLAGS_MASK_BIT
/* interrupt masking & reporting supported */
#define PCI_MSI_FLAGS_MASK_BIT 0x0100
+#endif
+
+#ifndef PCI_EXP_TYPE_PCIE_BRIDGE
+/* PCI/PCI-X to PCIE Bridge */
+#define PCI_EXP_TYPE_PCIE_BRIDGE 0x8
+#endif
+
+#ifndef PCI_EXP_TYPE_ROOT_INT_EP
+/* Root Complex Integrated Endpoint */
+#define PCI_EXP_TYPE_ROOT_INT_EP 0x9
+#endif
+
+#ifndef PCI_EXP_TYPE_ROOT_EC
+/* Root Complex Event Collector */
+#define PCI_EXP_TYPE_ROOT_EC 0xa
#endif

#define PT_INVALID_REG 0xFFFFFFFF /* invalid register value */
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/pci.c
--- a/tools/ioemu/hw/pci.c Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/hw/pci.c Fri Sep 12 14:47:40 2008 +0900
@@ -45,7 +45,6 @@ static void pci_update_mappings(PCIDevic
static void pci_update_mappings(PCIDevice *d);

target_phys_addr_t pci_mem_base;
-static int pci_irq_index;
static PCIBus *first_bus;

PCIBus *pci_register_bus(pci_set_irq_fn set_irq, pci_map_irq_fn map_irq,
@@ -114,9 +113,6 @@ PCIDevice *pci_register_device(PCIBus *b
{
PCIDevice *pci_dev;

- if (pci_irq_index >= PCI_DEVICES_MAX)
- return NULL;
-
if (devfn < 0) {
for(devfn = bus->devfn_min ; devfn < 256; devfn += 8) {
if ( !bus->devices[devfn] &&
@@ -140,7 +136,6 @@ PCIDevice *pci_register_device(PCIBus *b
config_write = pci_default_write_config;
pci_dev->config_read = config_read;
pci_dev->config_write = config_write;
- pci_dev->irq_index = pci_irq_index++;
bus->devices[devfn] = pci_dev;
return pci_dev;
}
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/pt-msi.c
--- a/tools/ioemu/hw/pt-msi.c Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/hw/pt-msi.c Fri Sep 12 14:47:40 2008 +0900
@@ -313,7 +313,7 @@ int pt_msix_init(struct pt_dev *dev, int

table_off = pci_read_long(pd, pos + PCI_MSIX_TABLE);
bar_index = dev->msix->bar_index = table_off & PCI_MSIX_BIR;
- table_off &= table_off & ~PCI_MSIX_BIR;
+ table_off = dev->msix->table_off = table_off & ~PCI_MSIX_BIR;
dev->msix->table_base = dev->pci_dev->base_addr[bar_index];
PT_LOG("get MSI-X table bar base %llx\n",
(unsigned long long)dev->msix->table_base);
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/vga.c
--- a/tools/ioemu/hw/vga.c Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/hw/vga.c Fri Sep 12 14:47:40 2008 +0900
@@ -2080,7 +2080,13 @@ void xen_vga_vram_map(uint64_t vram_addr

if (copy)
memcpy(vram, xen_vga_state->vram_ptr, VGA_RAM_SIZE);
- qemu_free(xen_vga_state->vram_ptr);
+ if (xen_vga_state->vram_mfns) {
+ /* In case this function is called more than once */
+ free(xen_vga_state->vram_mfns);
+ munmap(xen_vga_state->vram_ptr, VGA_RAM_SIZE);
+ } else {
+ qemu_free(xen_vga_state->vram_ptr);
+ }
xen_vga_state->vram_ptr = vram;
xen_vga_state->vram_mfns = pfn_list;
#ifdef CONFIG_STUBDOM
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/xen_machine_fv.c
--- a/tools/ioemu/hw/xen_machine_fv.c Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/hw/xen_machine_fv.c Fri Sep 12 14:47:40 2008 +0900
@@ -139,8 +139,10 @@ uint8_t *qemu_map_cache(target_phys_addr
!test_bit(address_offset>>XC_PAGE_SHIFT, entry->valid_mapping))
qemu_remap_bucket(entry, address_index);

- if (!test_bit(address_offset>>XC_PAGE_SHIFT, entry->valid_mapping))
+ if (!test_bit(address_offset>>XC_PAGE_SHIFT, entry->valid_mapping)) {
+ last_address_index = ~0UL;
return NULL;
+ }

last_address_index = address_index;
last_address_vaddr = entry->vaddr_base;
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/vl.h
--- a/tools/ioemu/vl.h Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/vl.h Fri Sep 12 14:47:40 2008 +0900
@@ -812,8 +812,6 @@ struct PCIDevice {
/* do not access the following fields */
PCIConfigReadFunc *config_read;
PCIConfigWriteFunc *config_write;
- /* ??? This is a PC-specific hack, and should be removed. */
- int irq_index;

/* Current IRQ levels. Used internally by the generic PCI code. */
int irq_state[4];
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/ia64/xc_ia64_linux_save.c
--- a/tools/libxc/ia64/xc_ia64_linux_save.c Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/libxc/ia64/xc_ia64_linux_save.c Fri Sep 12 14:47:40 2008 +0900
@@ -53,12 +53,12 @@ static inline void set_bit(int nr, volat
}

static int
-suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+suspend_and_state(int (*suspend)(void), int xc_handle, int io_fd,
int dom, xc_dominfo_t *info)
{
int i = 0;

- if (!(*suspend)(dom)) {
+ if (!(*suspend)()) {
ERROR("Suspend request failed");
return -1;
}
@@ -406,7 +406,7 @@ out:

int
xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags, int (*suspend)(int),
+ uint32_t max_factor, uint32_t flags, int (*suspend)(void),
int hvm, void *(*init_qemu_maps)(int, unsigned),
void (*qemu_flip_buffer)(int, int))
{
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/libxc/xc_domain_save.c Fri Sep 12 14:47:40 2008 +0900
@@ -338,72 +338,23 @@ static int analysis_phase(int xc_handle,
}


-static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+static int suspend_and_state(int (*suspend)(void), int xc_handle, int io_fd,
int dom, xc_dominfo_t *info)
{
- int i = 0;
-
- if ( !(*suspend)(dom) )
+ if ( !(*suspend)() )
{
ERROR("Suspend request failed");
return -1;
}

- retry:
-
- if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
- {
- ERROR("Could not get domain info");
+ if ( (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) ||
+ !info->shutdown || (info->shutdown_reason != SHUTDOWN_suspend) )
+ {
+ ERROR("Domain not in suspended state");
return -1;
}

- if ( info->dying )
- {
- ERROR("domain is dying");
- return -1;
- }
-
- if ( info->crashed )
- {
- ERROR("domain has crashed");
- return -1;
- }
-
- if ( info->shutdown )
- {
- switch ( info->shutdown_reason )
- {
- case SHUTDOWN_poweroff:
- case SHUTDOWN_reboot:
- ERROR("domain has shut down");
- return -1;
- case SHUTDOWN_suspend:
- return 0;
- case SHUTDOWN_crash:
- ERROR("domain has crashed");
- return -1;
- }
- }
-
- if ( info->paused )
- {
- /* Try unpausing domain, wait, and retest. */
- xc_domain_unpause( xc_handle, dom );
- ERROR("Domain was paused. Wait and re-test.");
- usleep(10000); /* 10ms */
- goto retry;
- }
-
- if ( ++i < 100 )
- {
- ERROR("Retry suspend domain");
- usleep(10000); /* 10ms */
- goto retry;
- }
-
- ERROR("Unable to suspend domain.");
-
- return -1;
+ return 0;
}

/*
@@ -796,7 +747,7 @@ static xen_pfn_t *map_and_save_p2m_table


int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags, int (*suspend)(int),
+ uint32_t max_factor, uint32_t flags, int (*suspend)(void),
int hvm, void *(*init_qemu_maps)(int, unsigned),
void (*qemu_flip_buffer)(int, int))
{
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/xc_evtchn.c
--- a/tools/libxc/xc_evtchn.c Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/libxc/xc_evtchn.c Fri Sep 12 14:47:40 2008 +0900
@@ -59,17 +59,8 @@ int xc_evtchn_reset(int xc_handle,
return do_evtchn_op(xc_handle, EVTCHNOP_reset, &arg, sizeof(arg), 0);
}

-int xc_evtchn_status(int xc_handle,
- uint32_t dom,
- uint32_t port)
+int xc_evtchn_status(int xc_handle, xc_evtchn_status_t *status)
{
- int rc;
- struct evtchn_status arg = { .dom = (domid_t)dom,
- .port = (evtchn_port_t)port };
-
- rc = do_evtchn_op(xc_handle, EVTCHNOP_status, &arg, sizeof(arg), 1);
- if ( rc == 0 )
- rc = arg.status;
-
- return rc;
+ return do_evtchn_op(xc_handle, EVTCHNOP_status, status,
+ sizeof(*status), 1);
}
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/xc_private.c
--- a/tools/libxc/xc_private.c Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/libxc/xc_private.c Fri Sep 12 14:47:40 2008 +0900
@@ -307,6 +307,13 @@ int xc_memory_op(int xc_handle,
goto out1;
}
break;
+ case XENMEM_remove_from_physmap:
+ if ( lock_pages(arg, sizeof(struct xen_remove_from_physmap)) )
+ {
+ PERROR("Could not lock");
+ goto out1;
+ }
+ break;
case XENMEM_current_reservation:
case XENMEM_maximum_reservation:
case XENMEM_maximum_gpfn:
@@ -339,6 +346,9 @@ int xc_memory_op(int xc_handle,
break;
case XENMEM_add_to_physmap:
unlock_pages(arg, sizeof(struct xen_add_to_physmap));
+ break;
+ case XENMEM_remove_from_physmap:
+ unlock_pages(arg, sizeof(struct xen_remove_from_physmap));
break;
case XENMEM_current_reservation:
case XENMEM_maximum_reservation:
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/libxc/xenctrl.h Fri Sep 12 14:47:40 2008 +0900
@@ -502,9 +502,9 @@ xc_evtchn_alloc_unbound(int xc_handle,

int xc_evtchn_reset(int xc_handle,
uint32_t dom);
-int xc_evtchn_status(int xc_handle,
- uint32_t dom,
- uint32_t port);
+
+typedef struct evtchn_status xc_evtchn_status_t;
+int xc_evtchn_status(int xc_handle, xc_evtchn_status_t *status);

/*
* Return a handle to the event channel driver, or -1 on failure, in which case
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/libxc/xenguest.h Fri Sep 12 14:47:40 2008 +0900
@@ -25,7 +25,7 @@
*/
int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
- int (*suspend)(int domid), int hvm,
+ int (*suspend)(void), int hvm,
void *(*init_qemu_maps)(int, unsigned), /* HVM only */
void (*qemu_flip_buffer)(int, int)); /* HVM only */

diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/Makefile
--- a/tools/python/Makefile Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/Makefile Fri Sep 12 14:47:40 2008 +0900
@@ -1,13 +1,5 @@ XEN_ROOT = ../..
XEN_ROOT = ../..
include $(XEN_ROOT)/tools/Rules.mk
-
-XEN_SECURITY_MODULE = dummy
-ifeq ($(FLASK_ENABLE),y)
-XEN_SECURITY_MODULE = flask
-endif
-ifeq ($(ACM_SECURITY),y)
-XEN_SECURITY_MODULE = acm
-endif

.PHONY: all
all: build
@@ -23,8 +15,8 @@ NLSDIR = /usr/share/locale
NLSDIR = /usr/share/locale

.PHONY: build buildpy
-buildpy: xsm.py
- CC="$(CC)" CFLAGS="$(CFLAGS)" XEN_SECURITY_MODULE="$(XEN_SECURITY_MODULE)" python setup.py build
+buildpy:
+ CC="$(CC)" CFLAGS="$(CFLAGS)" python setup.py build

build: buildpy refresh-pot refresh-po $(CATALOGS)

@@ -61,18 +53,6 @@ refresh-po: $(POTFILE)
%.mo: %.po
$(MSGFMT) -c -o $@ $<

-xsm.py:
- @(set -e; \
- echo "XEN_SECURITY_MODULE = \""$(XEN_SECURITY_MODULE)"\""; \
- echo "from xsm_core import *"; \
- echo ""; \
- echo "import xen.util.xsm."$(XEN_SECURITY_MODULE)"."$(XEN_SECURITY_MODULE)" as xsm_module"; \
- echo ""; \
- echo "xsm_init(xsm_module)"; \
- echo "from xen.util.xsm."$(XEN_SECURITY_MODULE)"."$(XEN_SECURITY_MODULE)" import *"; \
- echo "del xsm_module"; \
- echo "") >xen/util/xsm/$@
-
.PHONY: install
ifndef XEN_PYTHON_NATIVE_INSTALL
install: LIBPATH=$(shell PYTHONPATH=xen/util python -c "import auxbin; print auxbin.libpath()")
@@ -104,4 +84,4 @@ test:

.PHONY: clean
clean:
- rm -rf build *.pyc *.pyo *.o *.a *~ $(CATALOGS) xen/util/xsm/xsm.py xen/util/auxbin.pyc
+ rm -rf build *.pyc *.pyo *.o *.a *~ $(CATALOGS) xen/util/auxbin.pyc
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/util/xsconstants.py
--- a/tools/python/xen/util/xsconstants.py Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/util/xsconstants.py Fri Sep 12 14:47:40 2008 +0900
@@ -20,8 +20,10 @@ XS_INST_BOOT = (1 << 0)
XS_INST_BOOT = (1 << 0)
XS_INST_LOAD = (1 << 1)

-XS_POLICY_NONE = 0
XS_POLICY_ACM = (1 << 0)
+XS_POLICY_FLASK = (1 << 1)
+XS_POLICY_DUMMY = (1 << 2)
+XS_POLICY_USE = 0

# Some internal variables used by the Xen-API
ACM_LABEL_VM = (1 << 0)
@@ -107,6 +109,6 @@ ACM_POLICY_ID = 'ACM'

INVALID_POLICY_PREFIX = 'INV_'

-INVALID_SSIDREF = 0xFFFFFFFF
+INVALID_SSIDREF = 0xFFFFFFFFL

XS_INACCESSIBLE_LABEL = '__INACCESSIBLE__'
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/util/xsm/flask/flask.py
--- a/tools/python/xen/util/xsm/flask/flask.py Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/util/xsm/flask/flask.py Fri Sep 12 14:47:40 2008 +0900
@@ -1,5 +1,6 @@ import sys
import sys
from xen.lowlevel import flask
+from xen.util import xsconstants
from xen.xend import sxp

#Functions exported through XML-RPC
@@ -12,7 +13,7 @@ def err(msg):
raise XSMError(msg)

def on():
- return 0 #xsconstants.XS_POLICY_FLASK
+ return xsconstants.XS_POLICY_FLASK

def ssidref2label(ssidref):
try:
@@ -37,8 +38,9 @@ def set_security_label(policy, label):
return label

def ssidref2security_label(ssidref):
- return ssidref2label(ssidref)
+ label = ssidref2label(ssidref)
+ return label

def get_security_label(self, xspol=None):
- label = self.info.get('security_label', '')
+ label = self.info['security_label']
return label
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/util/xsm/xsm.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/python/xen/util/xsm/xsm.py Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,20 @@
+import sys
+import string
+from xen.xend import XendOptions
+from xen.util import xsconstants
+from xsm_core import xsm_init
+
+xoptions = XendOptions.instance()
+xsm_module_name = xoptions.get_xsm_module_name()
+
+xsconstants.XS_POLICY_USE = eval("xsconstants.XS_POLICY_" +
+ string.upper(xsm_module_name))
+
+xsm_module_path = "xen.util.xsm." + xsm_module_name + "." + xsm_module_name
+xsm_module = __import__(xsm_module_path, globals(), locals(), ['*'])
+
+xsm_init(xsm_module)
+
+for op in dir(xsm_module):
+ if not hasattr(sys.modules[__name__], op):
+ setattr(sys.modules[__name__], op, getattr(xsm_module, op, None))
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xend/XendConfig.py Fri Sep 12 14:47:40 2008 +0900
@@ -729,7 +729,7 @@ class XendConfig(dict):
self.parse_cpuid(cfg, 'cpuid_check')

import xen.util.xsm.xsm as security
- if security.on() == xsconstants.XS_POLICY_ACM:
+ if security.on() == xsconstants.XS_POLICY_USE:
from xen.util.acmpolicy import ACM_LABEL_UNLABELED
if not 'security' in cfg and sxp.child_value(sxp_cfg, 'security'):
cfg['security'] = sxp.child_value(sxp_cfg, 'security')
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xend/XendDomainInfo.py Fri Sep 12 14:47:40 2008 +0900
@@ -2069,7 +2069,7 @@ class XendDomainInfo:
balloon.free(2*1024) # 2MB should be plenty

ssidref = 0
- if security.on() == xsconstants.XS_POLICY_ACM:
+ if security.on() == xsconstants.XS_POLICY_USE:
ssidref = security.calc_dom_ssidref_from_info(self.info)
if security.has_authorization(ssidref) == False:
raise VmError("VM is not authorized to run.")
@@ -2855,10 +2855,6 @@ class XendDomainInfo:
info["maxmem_kb"] = XendNode.instance() \
.physinfo_dict()['total_memory'] * 1024

- #ssidref field not used any longer
- if 'ssidref' in info:
- info.pop('ssidref')
-
# make sure state is reset for info
# TODO: we should eventually get rid of old_dom_states

diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/XendOptions.py
--- a/tools/python/xen/xend/XendOptions.py Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xend/XendOptions.py Fri Sep 12 14:47:40 2008 +0900
@@ -131,6 +131,9 @@ class XendOptions:

"""Default script to configure a backend network interface"""
vif_script = osdep.vif_script
+
+ """Default Xen Security Module"""
+ xsm_module_default = 'dummy'

"""Default rotation count of qemu-dm log file."""
qemu_dm_logrotate_count = 10
@@ -427,6 +430,11 @@ class XendOptionsFile(XendOptions):
return self.get_config_value('xen-api-server',
self.xen_api_server_default)

+ def get_xsm_module_name(self):
+ """Get the Xen Security Module name.
+ """
+ return self.get_config_string('xsm_module_name', self.xsm_module_default)
+
if os.uname()[0] == 'SunOS':
class XendOptionsSMF(XendOptions):

diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/server/blkif.py
--- a/tools/python/xen/xend/server/blkif.py Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xend/server/blkif.py Fri Sep 12 14:47:40 2008 +0900
@@ -78,7 +78,7 @@ class BlkifController(DevController):
if uuid:
back['uuid'] = uuid

- if security.on() == xsconstants.XS_POLICY_ACM:
+ if security.on() == xsconstants.XS_POLICY_USE:
self.do_access_control(config, uname)

(device_path, devid) = blkif.blkdev_name_to_number(dev)
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/server/netif.py
--- a/tools/python/xen/xend/server/netif.py Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xend/server/netif.py Fri Sep 12 14:47:40 2008 +0900
@@ -156,7 +156,7 @@ class NetifController(DevController):
front = { 'handle' : "%i" % devid,
'mac' : mac }

- if security.on() == xsconstants.XS_POLICY_ACM:
+ if security.on() == xsconstants.XS_POLICY_USE:
self.do_access_control(config)

return (devid, back, front)
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/server/pciif.py
--- a/tools/python/xen/xend/server/pciif.py Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xend/server/pciif.py Fri Sep 12 14:47:40 2008 +0900
@@ -286,7 +286,7 @@ class PciController(DevController):
)%(dev.name))

if dev.has_non_page_aligned_bar and arch.type != "ia64":
- raise VmError("pci: %: non-page-aligned MMIO BAR found." % dev.name)
+ raise VmError("pci: %s: non-page-aligned MMIO BAR found." % dev.name)

self.CheckSiblingDevices(fe_domid, dev)

diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xm/create.py Fri Sep 12 14:47:40 2008 +0900
@@ -566,11 +566,11 @@ gopts.var('hap', val='HAP',
use="""Hap status (0=hap is disabled;
1=hap is enabled.""")

-gopts.var('cpuid', val="IN[,SIN]:eax=EAX,ebx=EBX,exc=ECX,edx=EDX",
+gopts.var('cpuid', val="IN[,SIN]:eax=EAX,ebx=EBX,ecx=ECX,edx=EDX",
fn=append_value, default=[],
use="""Cpuid description.""")

-gopts.var('cpuid_check', val="IN[,SIN]:eax=EAX,ebx=EBX,exc=ECX,edx=EDX",
+gopts.var('cpuid_check', val="IN[,SIN]:eax=EAX,ebx=EBX,ecx=ECX,edx=EDX",
fn=append_value, default=[],
use="""Cpuid check description.""")

@@ -971,7 +971,7 @@ def preprocess_cpuid(vals, attr_name):
"of the register %s for input %s\n"
% (res['reg'], input) )
cpuid[input][res['reg']] = res['val'] # new register
- setattr(vals, attr_name, cpuid)
+ setattr(vals, attr_name, cpuid)

def preprocess_pci(vals):
if not vals.pci: return
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xm/main.py
--- a/tools/python/xen/xm/main.py Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xm/main.py Fri Sep 12 14:47:40 2008 +0900
@@ -1812,7 +1812,7 @@ def domain_name_to_domid(domain_name):
else:
dom = server.xend.domain(domain_name)
domid = int(sxp.child_value(dom, 'domid', '-1'))
- return domid
+ return int(domid)

def xm_vncviewer(args):
autopass = False;
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/xcutils/lsevtchn.c
--- a/tools/xcutils/lsevtchn.c Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/xcutils/lsevtchn.c Fri Sep 12 14:47:40 2008 +0900
@@ -8,49 +8,55 @@
#include <xenctrl.h>
#include <xenguest.h>

-int
-main(int argc, char **argv)
+int main(int argc, char **argv)
{
- int xc_fd;
- int domid = 0, port = 0, status;
- const char *msg;
+ int xc_fd, domid, port, rc;
+ xc_evtchn_status_t status;

- if ( argc > 1 )
- domid = strtol(argv[1], NULL, 10);
+ domid = (argc > 1) ? strtol(argv[1], NULL, 10) : 0;

xc_fd = xc_interface_open();
if ( xc_fd < 0 )
errx(1, "failed to open control interface");

- while ( (status = xc_evtchn_status(xc_fd, domid, port)) >= 0 )
+ for ( port = 0; ; port++ )
{
- switch ( status )
+ status.dom = domid;
+ status.port = port;
+ rc = xc_evtchn_status(xc_fd, &status);
+ if ( rc < 0 )
+ break;
+
+ if ( status.status == EVTCHNSTAT_closed )
+ continue;
+
+ printf("%4d: VCPU %u: ", port, status.vcpu);
+
+ switch ( status.status )
{
- case EVTCHNSTAT_closed:
- msg = "Channel is not in use.";
- break;
case EVTCHNSTAT_unbound:
- msg = "Channel is waiting interdom connection.";
+ printf("Interdomain (Waiting connection) - Remote Domain %u",
+ status.u.unbound.dom);
break;
case EVTCHNSTAT_interdomain:
- msg = "Channel is connected to remote domain.";
+ printf("Interdomain (Connected) - Remote Domain %u, Port %u",
+ status.u.interdomain.dom, status.u.interdomain.port);
break;
case EVTCHNSTAT_pirq:
- msg = "Channel is bound to a phys IRQ line.";
+ printf("Physical IRQ %u", status.u.pirq);
break;
case EVTCHNSTAT_virq:
- msg = "Channel is bound to a virtual IRQ line.";
+ printf("Virtual IRQ %u", status.u.virq);
break;
case EVTCHNSTAT_ipi:
- msg = "Channel is bound to a virtual IPI line.";
+ printf("IPI");
break;
default:
- msg = "Unknown.";
+ printf("Unknown");
break;
+ }

- }
- printf("%03d: %d: %s\n", port, status, msg);
- port++;
+ printf("\n");
}

xc_interface_close(xc_fd);
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/xcutils/xc_save.c
--- a/tools/xcutils/xc_save.c Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/xcutils/xc_save.c Fri Sep 12 14:47:40 2008 +0900
@@ -32,7 +32,7 @@ static struct suspendinfo {
* Issue a suspend request through stdout, and receive the acknowledgement
* from stdin. This is handled by XendCheckpoint in the Python layer.
*/
-static int compat_suspend(int domid)
+static int compat_suspend(void)
{
char ans[30];

@@ -43,16 +43,35 @@ static int compat_suspend(int domid)
!strncmp(ans, "done\n", 5));
}

-static int suspend_evtchn_release(int xc, int domid)
+static int suspend_evtchn_release(void)
{
if (si.suspend_evtchn >= 0) {
- xc_evtchn_unbind(si.xce, si.suspend_evtchn);
- si.suspend_evtchn = -1;
+ xc_evtchn_unbind(si.xce, si.suspend_evtchn);
+ si.suspend_evtchn = -1;
}
if (si.xce >= 0) {
- xc_evtchn_close(si.xce);
- si.xce = -1;
- }
+ xc_evtchn_close(si.xce);
+ si.xce = -1;
+ }
+
+ return 0;
+}
+
+static int await_suspend(void)
+{
+ int rc;
+
+ do {
+ rc = xc_evtchn_pending(si.xce);
+ if (rc < 0) {
+ warnx("error polling suspend notification channel: %d", rc);
+ return -1;
+ }
+ } while (rc != si.suspend_evtchn);
+
+ /* harmless for one-off suspend */
+ if (xc_evtchn_unmask(si.xce, si.suspend_evtchn) < 0)
+ warnx("failed to unmask suspend notification channel: %d", rc);

return 0;
}
@@ -71,16 +90,16 @@ static int suspend_evtchn_init(int xc, i

xs = xs_daemon_open();
if (!xs) {
- errx(1, "failed to get xenstore handle");
- return -1;
+ warnx("failed to get xenstore handle");
+ return -1;
}
sprintf(path, "/local/domain/%d/device/suspend/event-channel", domid);
portstr = xs_read(xs, XBT_NULL, path, &plen);
xs_daemon_close(xs);

if (!portstr || !plen) {
- warnx("could not read suspend event channel");
- return -1;
+ warnx("could not read suspend event channel");
+ return -1;
}

port = atoi(portstr);
@@ -88,27 +107,29 @@ static int suspend_evtchn_init(int xc, i

si.xce = xc_evtchn_open();
if (si.xce < 0) {
- errx(1, "failed to open event channel handle");
- goto cleanup;
+ warnx("failed to open event channel handle");
+ goto cleanup;
}

si.suspend_evtchn = xc_evtchn_bind_interdomain(si.xce, domid, port);
if (si.suspend_evtchn < 0) {
- errx(1, "failed to bind suspend event channel: %d",
- si.suspend_evtchn);
- goto cleanup;
+ warnx("failed to bind suspend event channel: %d", si.suspend_evtchn);
+ goto cleanup;
}

rc = xc_domain_subscribe_for_suspend(xc, domid, port);
if (rc < 0) {
- errx(1, "failed to subscribe to domain: %d", rc);
- goto cleanup;
- }
+ warnx("failed to subscribe to domain: %d", rc);
+ goto cleanup;
+ }
+
+ /* event channel is pending immediately after binding */
+ await_suspend();

return 0;

cleanup:
- suspend_evtchn_release(xc, domid);
+ suspend_evtchn_release();

return -1;
}
@@ -116,29 +137,20 @@ static int suspend_evtchn_init(int xc, i
/**
* Issue a suspend request to a dedicated event channel in the guest, and
* receive the acknowledgement from the subscribe event channel. */
-static int evtchn_suspend(int domid)
-{
- int xcefd;
+static int evtchn_suspend(void)
+{
int rc;

rc = xc_evtchn_notify(si.xce, si.suspend_evtchn);
if (rc < 0) {
- errx(1, "failed to notify suspend request channel: %d", rc);
- return 0;
- }
-
- xcefd = xc_evtchn_fd(si.xce);
- do {
- rc = xc_evtchn_pending(si.xce);
- if (rc < 0) {
- errx(1, "error polling suspend notification channel: %d", rc);
- return 0;
- }
- } while (rc != si.suspend_evtchn);
-
- /* harmless for one-off suspend */
- if (xc_evtchn_unmask(si.xce, si.suspend_evtchn) < 0)
- errx(1, "failed to unmask suspend notification channel: %d", rc);
+ warnx("failed to notify suspend request channel: %d", rc);
+ return 0;
+ }
+
+ if (await_suspend() < 0) {
+ warnx("suspend failed");
+ return 0;
+ }

/* notify xend that it can do device migration */
printf("suspended\n");
@@ -147,12 +159,12 @@ static int evtchn_suspend(int domid)
return 1;
}

-static int suspend(int domid)
+static int suspend(void)
{
if (si.suspend_evtchn >= 0)
- return evtchn_suspend(domid);
-
- return compat_suspend(domid);
+ return evtchn_suspend();
+
+ return compat_suspend();
}

/* For HVM guests, there are two sources of dirty pages: the Xen shadow
@@ -195,11 +207,9 @@ static void qemu_flip_buffer(int domid,

/* Tell qemu that we want it to start writing log-dirty bits to the
* other buffer */
- if (!xs_write(xs, XBT_NULL, qemu_next_active_path, &digit, 1)) {
+ if (!xs_write(xs, XBT_NULL, qemu_next_active_path, &digit, 1))
errx(1, "can't write next-active to store path (%s)\n",
- qemu_next_active_path);
- exit(1);
- }
+ qemu_next_active_path);

/* Wait a while for qemu to signal that it has switched to the new
* active buffer */
@@ -208,10 +218,8 @@ static void qemu_flip_buffer(int domid,
tv.tv_usec = 0;
FD_ZERO(&fdset);
FD_SET(xs_fileno(xs), &fdset);
- if ((select(xs_fileno(xs) + 1, &fdset, NULL, NULL, &tv)) != 1) {
+ if ((select(xs_fileno(xs) + 1, &fdset, NULL, NULL, &tv)) != 1)
errx(1, "timed out waiting for qemu to switch buffers\n");
- exit(1);
- }
watch = xs_read_watch(xs, &len);
free(watch);

@@ -221,7 +229,7 @@ static void qemu_flip_buffer(int domid,
goto read_again;
}

-static void * init_qemu_maps(int domid, unsigned int bitmap_size)
+static void *init_qemu_maps(int domid, unsigned int bitmap_size)
{
key_t key;
char key_ascii[17] = {0,};
@@ -293,7 +301,7 @@ main(int argc, char **argv)
int ret;

if (argc != 6)
- errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]);
+ errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]);

xc_fd = xc_interface_open();
if (xc_fd < 0)
@@ -305,13 +313,14 @@ main(int argc, char **argv)
max_f = atoi(argv[4]);
flags = atoi(argv[5]);

- suspend_evtchn_init(xc_fd, domid);
+ if (suspend_evtchn_init(xc_fd, domid) < 0)
+ warnx("suspend event channel initialization failed, using slow path");

ret = xc_domain_save(xc_fd, io_fd, domid, maxit, max_f, flags,
&suspend, !!(flags & XCFLAGS_HVM),
&init_qemu_maps, &qemu_flip_buffer);

- suspend_evtchn_release(xc_fd, domid);
+ suspend_evtchn_release();

xc_interface_close(xc_fd);

diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/xenstore/xs.c
--- a/tools/xenstore/xs.c Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/xenstore/xs.c Fri Sep 12 14:47:40 2008 +0900
@@ -795,8 +795,11 @@ char *xs_get_domain_path(struct xs_handl

bool xs_is_domain_introduced(struct xs_handle *h, unsigned int domid)
{
- return strcmp("F",
- single_with_domid(h, XS_IS_DOMAIN_INTRODUCED, domid));
+ char *domain = single_with_domid(h, XS_IS_DOMAIN_INTRODUCED, domid);
+ int rc = strcmp("F", domain);
+
+ free(domain);
+ return rc;
}

/* Only useful for DEBUG versions */
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/xentrace/formats
--- a/tools/xentrace/formats Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/xentrace/formats Fri Sep 12 14:47:40 2008 +0900
@@ -4,56 +4,69 @@ 0x0001f002 CPU%(cpu)d %(tsc)d (+%(relt
0x0001f002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) wrap_buffer 0x%(1)08x
0x0001f003 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) cpu_change 0x%(1)08x

-0x0002f001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_add_domain [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
-0x0002f002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_rem_domain [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
-0x0002f003 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) domain_sleep [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
-0x0002f004 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) domain_wake [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
-0x0002f005 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) do_yield [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
-0x0002f006 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) do_block [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
-0x0002f007 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) domain_shutdown [ domid = 0x%(1)08x, edomid = 0x%(2)08x, reason = 0x%(3)08x ]
-0x0002f008 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_ctl
-0x0002f009 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_adjdom [ domid = 0x%(1)08x ]
-0x0002f00a CPU%(cpu)d %(tsc)d (+%(reltsc)8d) __enter_scheduler [. prev<domid:edomid> = 0x%(1)08x : 0x%(2)08x, next<domid:edomid> = 0x%(3)08x : 0x%(4)08x ]
-0x0002f00B CPU%(cpu)d %(tsc)d (+%(reltsc)8d) s_timer_fn
-0x0002f00c CPU%(cpu)d %(tsc)d (+%(reltsc)8d) t_timer_fn
-0x0002f00d CPU%(cpu)d %(tsc)d (+%(reltsc)8d) dom_timer_fn
-0x0002f00e CPU%(cpu)d %(tsc)d (+%(reltsc)8d) switch_infprev [ old_domid = 0x%(1)08x, runtime = %(2)d ]
-0x0002f00f CPU%(cpu)d %(tsc)d (+%(reltsc)8d) switch_infnext [ new_domid = 0x%(1)08x, time = %(2)d, r_time = %(3)d ]
+0x00021011 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) running_to_runnable [ dom:vcpu = 0x%(1)08x ]
+0x00021021 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) running_to_blocked [ dom:vcpu = 0x%(1)08x ]
+0x00021031 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) running_to_offline [ dom:vcpu = 0x%(1)08x ]
+0x00021101 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) runnable_to_running [ dom:vcpu = 0x%(1)08x ]
+0x00021121 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) runnable_to_blocked [ dom:vcpu = 0x%(1)08x ]
+0x00021131 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) runnable_to_offline [ dom:vcpu = 0x%(1)08x ]
+0x00021201 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) blocked_to_running [ dom:vcpu = 0x%(1)08x ]
+0x00021211 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) blocked_to_runnable [ dom:vcpu = 0x%(1)08x ]
+0x00021231 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) blocked_to_offline [ dom:vcpu = 0x%(1)08x ]
+0x00021301 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) offline_to_running [ dom:vcpu = 0x%(1)08x ]
+0x00021311 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) offline_to_runnable [ dom:vcpu = 0x%(1)08x ]
+0x00021321 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) offline_to_blocked [ dom:vcpu = 0x%(1)08x ]

-0x00081001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMENTRY [ dom:vcpu = 0x%(1)08x ]
-0x00081002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMEXIT [ dom:vcpu = 0x%(1)08x, exitcode = 0x%(2)08x, rIP = 0x%(3)08x ]
-0x00081102 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMEXIT [ dom:vcpu = 0x%(1)08x, exitcode = 0x%(2)08x, rIP = 0x%(3)016x ]
-0x00082001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_XEN [ dom:vcpu = 0x%(1)08x, errorcode = 0x%(2)02x, virt = 0x%(3)08x ]
-0x00082101 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_XEN [ dom:vcpu = 0x%(1)08x, errorcode = 0x%(2)02x, virt = 0x%(3)016x ]
-0x00082002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_INJECT [ dom:vcpu = 0x%(1)08x, errorcode = 0x%(2)02x, virt = 0x%(3)08x ]
-0x00082102 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_INJECT [ dom:vcpu = 0x%(1)08x, errorcode = 0x%(2)02x, virt = 0x%(3)016x ]
-0x00082003 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INJ_EXC [ dom:vcpu = 0x%(1)08x, vector = 0x%(2)02x, errorcode = 0x%(3)04x ]
-0x00082004 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INJ_VIRQ [ dom:vcpu = 0x%(1)08x, vector = 0x%(2)02x, fake = %(3)d ]
-0x00082005 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) REINJ_VIRQ [ dom:vcpu = 0x%(1)08x, vector = 0x%(2)02x ]
-0x00082006 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IO_READ [ dom:vcpu = 0x%(1)08x, port = 0x%(2)04x, size = %(3)d ]
-0x00082007 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IO_WRITE [ dom:vcpu = 0x%(1)08x, port = 0x%(2)04x, size = %(3)d ]
-0x00082008 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_READ [ dom:vcpu = 0x%(1)08x, CR# = %(2)d, value = 0x%(3)08x ]
-0x00082108 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_READ [ dom:vcpu = 0x%(1)08x, CR# = %(2)d, value = 0x%(3)016x ]
-0x00082009 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_WRITE [ dom:vcpu = 0x%(1)08x, CR# = %(2)d, value = 0x%(3)08x ]
-0x00082109 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_WRITE [ dom:vcpu = 0x%(1)08x, CR# = %(2)d, value = 0x%(3)016x ]
-0x0008200A CPU%(cpu)d %(tsc)d (+%(reltsc)8d) DR_READ [ dom:vcpu = 0x%(1)08x ]
-0x0008200B CPU%(cpu)d %(tsc)d (+%(reltsc)8d) DR_WRITE [ dom:vcpu = 0x%(1)08x ]
-0x0008200C CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MSR_READ [ dom:vcpu = 0x%(1)08x, MSR# = 0x%(2)08x, value = 0x%(3)016x ]
-0x0008200D CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MSR_WRITE [ dom:vcpu = 0x%(1)08x, MSR# = 0x%(2)08x, value = 0x%(3)016x ]
-0x0008200E CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CPUID [. dom:vcpu = 0x%(1)08x, func = 0x%(2)08x, eax = 0x%(3)08x, ebx = 0x%(4)08x, ecx=0x%(5)08x, edx = 0x%(6)08x ]
-0x0008200F CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INTR [ dom:vcpu = 0x%(1)08x, vector = 0x%(2)02x ]
-0x00082010 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) NMI [ dom:vcpu = 0x%(1)08x ]
-0x00082011 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) SMI [ dom:vcpu = 0x%(1)08x ]
-0x00082012 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMMCALL [ dom:vcpu = 0x%(1)08x, func = 0x%(2)08x ]
-0x00082013 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) HLT [ dom:vcpu = 0x%(1)08x, intpending = %(2)d ]
-0x00082014 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INVLPG [. dom:vcpu = 0x%(1)08x, is invlpga? = %(2)d, virt = 0x%(3)08x ]
-0x00082114 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INVLPG [. dom:vcpu = 0x%(1)08x, is invlpga? = %(2)d, virt = 0x%(3)016x ]
-0x00082015 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MCE [ dom:vcpu = 0x%(1)08x ]
-0x00082016 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IO_ASSIST [ dom:vcpu = 0x%(1)08x, data = 0x%(2)04x ]
-0x00082017 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MMIO_ASSIST [ dom:vcpu = 0x%(1)08x, data = 0x%(2)04x ]
-0x00082018 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CLTS [ dom:vcpu = 0x%(1)08x ]
-0x00082019 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) LMSW [ dom:vcpu = 0x%(1)08x, value = 0x%(2)08x ]
-0x00082119 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) LMSW [ dom:vcpu = 0x%(1)08x, value = 0x%(2)016x ]
+0x00028001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_add_domain [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
+0x00028002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_rem_domain [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
+0x00028003 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) domain_sleep [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
+0x00028004 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) domain_wake [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
+0x00028005 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) do_yield [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
+0x00028006 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) do_block [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
+0x00028007 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) domain_shutdown [ domid = 0x%(1)08x, edomid = 0x%(2)08x, reason = 0x%(3)08x ]
+0x00028008 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_ctl
+0x00028009 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_adjdom [ domid = 0x%(1)08x ]
+0x0002800a CPU%(cpu)d %(tsc)d (+%(reltsc)8d) __enter_scheduler [. prev<domid:edomid> = 0x%(1)08x : 0x%(2)08x, next<domid:edomid> = 0x%(3)08x : 0x%(4)08x ]
+0x0002800b CPU%(cpu)d %(tsc)d (+%(reltsc)8d) s_timer_fn
+0x0002800c CPU%(cpu)d %(tsc)d (+%(reltsc)8d) t_timer_fn
+0x0002800d CPU%(cpu)d %(tsc)d (+%(reltsc)8d) dom_timer_fn
+0x0002800e CPU%(cpu)d %(tsc)d (+%(reltsc)8d) switch_infprev [ old_domid = 0x%(1)08x, runtime = %(2)d ]
+0x0002800f CPU%(cpu)d %(tsc)d (+%(reltsc)8d) switch_infnext [ new_domid = 0x%(1)08x, time = %(2)d, r_time = %(3)d ]
+
+0x00081001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMENTRY
+0x00081002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMEXIT [ exitcode = 0x%(1)08x, rIP = 0x%(2)08x ]
+0x00081102 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMEXIT [ exitcode = 0x%(1)08x, rIP = 0x%(2)016x ]
+0x00082001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_XEN [ errorcode = 0x%(2)02x, virt = 0x%(1)08x ]
+0x00082101 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_XEN [ errorcode = 0x%(2)02x, virt = 0x%(1)016x ]
+0x00082002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_INJECT [ errorcode = 0x%(1)02x, virt = 0x%(2)08x ]
+0x00082102 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_INJECT [ errorcode = 0x%(1)02x, virt = 0x%(2)016x ]
+0x00082003 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INJ_EXC [ vector = 0x%(1)02x, errorcode = 0x%(2)04x ]
+0x00082004 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INJ_VIRQ [ vector = 0x%(1)02x, fake = %(2)d ]
+0x00082005 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) REINJ_VIRQ [ vector = 0x%(1)02x ]
+0x00082006 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IO_READ [ port = 0x%(1)04x, size = %(2)d ]
+0x00082007 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IO_WRITE [ port = 0x%(1)04x, size = %(2)d ]
+0x00082008 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_READ [ CR# = %(1)d, value = 0x%(2)08x ]
+0x00082108 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_READ [ CR# = %(1)d, value = 0x%(2)016x ]
+0x00082009 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_WRITE [ CR# = %(1)d, value = 0x%(2)08x ]
+0x00082109 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_WRITE [ CR# = %(1)d, value = 0x%(2)016x ]
+0x0008200A CPU%(cpu)d %(tsc)d (+%(reltsc)8d) DR_READ
+0x0008200B CPU%(cpu)d %(tsc)d (+%(reltsc)8d) DR_WRITE
+0x0008200C CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MSR_READ [ MSR# = 0x%(1)08x, value = 0x%(2)016x ]
+0x0008200D CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MSR_WRITE [ MSR# = 0x%(1)08x, value = 0x%(2)016x ]
+0x0008200E CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CPUID [. func = 0x%(1)08x, eax = 0x%(2)08x, ebx = 0x%(3)08x, ecx=0x%(4)08x, edx = 0x%(5)08x ]
+0x0008200F CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INTR [ vector = 0x%(1)02x ]
+0x00082010 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) NMI
+0x00082011 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) SMI
+0x00082012 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMMCALL [ func = 0x%(1)08x ]
+0x00082013 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) HLT [ intpending = %(1)d ]
+0x00082014 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INVLPG [ is invlpga? = %(1)d, virt = 0x%(2)08x ]
+0x00082114 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INVLPG [ is invlpga? = %(1)d, virt = 0x%(2)016x ]
+0x00082015 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MCE
+0x00082016 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IO_ASSIST [ data = 0x%(1)04x ]
+0x00082017 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MMIO_ASSIST [ data = 0x%(1)04x ]
+0x00082018 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CLTS
+0x00082019 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) LMSW [ value = 0x%(1)08x ]
+0x00082119 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) LMSW [ value = 0x%(1)016x ]

0x0010f001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) page_grant_map [ domid = %(1)d ]
0x0010f002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) page_grant_unmap [ domid = %(1)d ]
@@ -65,3 +78,41 @@ 0x0020f103 CPU%(cpu)d %(tsc)d (+%(relt
0x0020f103 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) trap [ rip = 0x%(1)016x, trapnr:error = 0x%(2)08x ]
0x0020f004 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) page_fault [ eip = 0x%(1)08x, addr = 0x%(2)08x, error = 0x%(3)08x ]
0x0020f104 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) page_fault [ rip = 0x%(1)16x, addr = 0x%(3)16x, error = 0x%(5)08x ]
+
+0x0020f006 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) emulate_privop [ eip = 0x%(1)08x ]
+0x0020f106 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) emulate_privop [ rip = 0x%(1)16x ]
+0x0020f007 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) emulate_4G [ eip = 0x%(1)08x ]
+0x0020f107 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) emulate_4G [ rip = 0x%(1)16x ]
+0x0020f00c CPU%(cpu)d %(tsc)d (+%(reltsc)8d) ptwr_emulation_pae [ addr = 0x%(2)08x, eip = 0x%(1)08x, npte = 0x%(1)16x ]
+0x0020f10c CPU%(cpu)d %(tsc)d (+%(reltsc)8d) ptwr_emulation_pae [ addr = 0x%(2)16x, rip = 0x%(1)16x, npte = 0x%(1)16x ]
+
+0x0040f001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_not_shadow [ gl1e = 0x%(1)16x, va = 0x%(2)08x, flags = 0x%(3)08x ]
+0x0040f101 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_not_shadow [ gl1e = 0x%(1)16x, va = 0x%(2)16x, flags = 0x%(3)08x ]
+0x0040f002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_fast_propagate [ va = 0x%(1)08x ]
+0x0040f102 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_fast_propagate [ va = 0x%(1)16x ]
+0x0040f003 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_fast_mmio [ va = 0x%(1)08x ]
+0x0040f103 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_fast_mmio [ va = 0x%(1)16x ]
+0x0040f004 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_false_fast_path [ va = 0x%(1)08x ]
+0x0040f104 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_false_fast_path [ va = 0x%(1)16x ]
+0x0040f005 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_mmio [ va = 0x%(1)08x ]
+0x0040f105 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_mmio [ va = 0x%(1)16x ]
+0x0040f006 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_fixup [ gl1e = 0x%(1)08x, va = 0x%(2)08x, flags = 0x%(3)08x ]
+0x0040f106 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_fixup [ gl1e = 0x%(1)16x, va = 0x%(2)16x, flags = 0x%(3)08x ]
+0x0040f007 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_domf_dying [ va = 0x%(1)08x ]
+0x0040f107 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_domf_dying [ va = 0x%(1)16x ]
+0x0040f008 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate [. gl1e = 0x%(1)08x, write_val = 0x%(2)08x, va = 0x%(3)08x, flags = 0x%(4)08x, emulation_count = 0x%(5)08x]
+0x0040f108 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate [. gl1e = 0x%(1)16x, write_val = 0x%(2)16x, va = 0x%(3)16x, flags = 0x%(4)08x, emulation_count = 0x%(5)08x]
+0x0040f009 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_unshadow_user [ va = 0x%(1)08x, gfn = 0x%(2)08x ]
+0x0040f109 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_unshadow_user [ va = 0x%(1)16x, gfn = 0x%(2)16x ]
+0x0040f00a CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_unshadow_evtinj [ va = 0x%(1)08x, gfn = 0x%(2)08x ]
+0x0040f10a CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_unshadow_evtinj [ va = 0x%(1)16x, gfn = 0x%(2)16x ]
+0x0040f00b CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_unshadow_unhandled [ va = 0x%(1)08x, gfn = 0x%(2)08x ]
+0x0040f10b CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_unshadow_unhandled [ va = 0x%(1)16x, gfn = 0x%(2)16x ]
+0x0040f00c CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_wrmap_bf [ gfn = 0x%(1)08x ]
+0x0040f10c CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_wrmap_bf [ gfn = 0x%(1)16x ]
+0x0040f00d CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_prealloc_unpin [ gfn = 0x%(1)08x ]
+0x0040f10d CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_prealloc_unpin [ gfn = 0x%(1)16x ]
+0x0040f00e CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_resync_full [ gfn = 0x%(1)08x ]
+0x0040f10e CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_resync_full [ gfn = 0x%(1)16x ]
+0x0040f00f CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_resync_only [ gfn = 0x%(1)08x ]
+0x0040f10f CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_resync_only [ gfn = 0x%(1)16x ]
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/xentrace/xentrace.c
--- a/tools/xentrace/xentrace.c Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/xentrace/xentrace.c Fri Sep 12 14:47:40 2008 +0900
@@ -56,6 +56,7 @@ typedef struct settings_st {
unsigned long tbuf_size;
unsigned long disk_rsvd;
unsigned long timeout;
+ unsigned long memory_buffer;
uint8_t discard:1,
disable_tracing:1;
} settings_t;
@@ -67,10 +68,243 @@ static int xc_handle = -1;
static int xc_handle = -1;
static int event_fd = -1;
static int virq_port = -1;
+static int outfd = 1;

static void close_handler(int signal)
{
interrupted = 1;
+}
+
+static struct {
+ char * buf;
+ unsigned long prod, cons, size;
+ unsigned long pending_size, pending_prod;
+} membuf = { 0 };
+
+#define MEMBUF_INDEX_RESET_THRESHOLD (1<<29)
+
+/* FIXME -- make a power of 2 so we can mask instead. */
+#define MEMBUF_POINTER(_i) (membuf.buf + ((_i) % membuf.size))
+#define MEMBUF_CONS_INCREMENT(_n) \
+ do { \
+ membuf.cons += (_n); \
+ } while(0)
+#define MEMBUF_PROD_SET(_x) \
+ do { \
+ if ( (_x) < membuf.prod ) { \
+ fprintf(stderr, "%s: INTERNAL_ERROR: prod %lu, trying to set to %lu!\n", \
+ __func__, membuf.prod, (unsigned long)(_x)); \
+ exit(1); \
+ } \
+ membuf.prod = (_x); \
+ if ( (_x) > MEMBUF_INDEX_RESET_THRESHOLD ) \
+ { \
+ membuf.prod %= membuf.size; \
+ membuf.cons %= membuf.size; \
+ if( membuf.prod < membuf.cons ) \
+ membuf.prod += membuf.size; \
+ } \
+ } while(0)
+
+struct cpu_change_record {
+ uint32_t header;
+ struct {
+ int cpu;
+ unsigned window_size;
+ } data;
+};
+
+#define CPU_CHANGE_HEADER \
+ (TRC_TRACE_CPU_CHANGE \
+ | (((sizeof(struct cpu_change_record)/sizeof(uint32_t)) - 1) \
+ << TRACE_EXTRA_SHIFT) )
+
+void membuf_alloc(unsigned long size)
+{
+ membuf.buf = malloc(size);
+
+ if(!membuf.buf)
+ {
+ fprintf(stderr, "%s: Couldn't malloc %lu bytes!\n",
+ __func__, size);
+ exit(1);
+ }
+
+ membuf.prod = membuf.cons = 0;
+ membuf.size = size;
+}
+
+/*
+ * Reserve a new window in the buffer. Move the 'consumer' forward size
+ * bytes, re-adjusting the cpu window sizes as necessary, and insert a
+ * cpu_change record.
+ */
+void membuf_reserve_window(unsigned cpu, unsigned long window_size)
+{
+ struct cpu_change_record *rec;
+ long need_to_consume, free, freed;
+
+ if ( membuf.pending_size > 0 )
+ {
+ fprintf(stderr, "%s: INTERNAL_ERROR: pending_size %lu\n",
+ __func__, membuf.pending_size);
+ exit(1);
+ }
+
+ need_to_consume = window_size + sizeof(*rec);
+
+ if ( window_size > membuf.size )
+ {
+ fprintf(stderr, "%s: reserve size %lu larger than buffer size %lu!\n",
+ __func__, window_size, membuf.size);
+ exit(1);
+ }
+
+ /* Subtract free space already in buffer. */
+ free = membuf.size - (membuf.prod - membuf.cons);
+ if( need_to_consume < free)
+ goto start_window;
+
+ need_to_consume -= free;
+
+ /*
+ * "Free" up full windows until we have enough for this window.
+ * It's a bit wasteful to throw away partial buffers, but the only
+ * other option is to scan through the buffer headers. Since the
+ * common case is that it's going to be thrown away next anyway, I
+ * think minimizing the overall impact is more important.
+ */
+ do {
+ rec = (struct cpu_change_record *)MEMBUF_POINTER(membuf.cons);
+ if( rec->header != CPU_CHANGE_HEADER )
+ {
+ fprintf(stderr, "%s: INTERNAL ERROR: no cpu_change record at consumer!\n",
+ __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ freed = sizeof(*rec) + rec->data.window_size;
+
+ if ( need_to_consume > 0 )
+ {
+ MEMBUF_CONS_INCREMENT(freed);
+ need_to_consume -= freed;
+ }
+ } while( need_to_consume > 0 );
+
+start_window:
+ /*
+ * Start writing "pending" data. Update prod once all this data is
+ * written.
+ */
+ membuf.pending_prod = membuf.prod;
+ membuf.pending_size = window_size;
+
+ rec = (struct cpu_change_record *)MEMBUF_POINTER(membuf.pending_prod);
+
+ rec->header = CPU_CHANGE_HEADER;
+ rec->data.cpu = cpu;
+ rec->data.window_size = window_size;
+
+ membuf.pending_prod += sizeof(*rec);
+}
+
+void membuf_write(void *start, unsigned long size) {
+ char * p;
+ unsigned long wsize;
+
+ if( (membuf.size - (membuf.prod - membuf.cons)) < size )
+ {
+ fprintf(stderr, "%s: INTERNAL ERROR: need %lu bytes, only have %lu!\n",
+ __func__, size, membuf.prod - membuf.cons);
+ exit(1);
+ }
+
+ if( size > membuf.pending_size )
+ {
+ fprintf(stderr, "%s: INTERNAL ERROR: size %lu, pending %lu!\n",
+ __func__, size, membuf.pending_size);
+ exit(1);
+ }
+
+ wsize = size;
+ p = MEMBUF_POINTER(membuf.pending_prod);
+
+ /* If the buffer overlaps the "wrap", do an extra write */
+ if ( p + size > membuf.buf + membuf.size )
+ {
+ int usize = ( membuf.buf + membuf.size ) - p;
+
+ memcpy(p, start, usize);
+
+ start += usize;
+ wsize -= usize;
+ p = membuf.buf;
+ }
+
+ memcpy(p, start, wsize);
+
+ membuf.pending_prod += size;
+ membuf.pending_size -= size;
+
+ if ( membuf.pending_size == 0 )
+ {
+ MEMBUF_PROD_SET(membuf.pending_prod);
+ }
+}
+
+void membuf_dump(void) {
+ /* Dump circular memory buffer */
+ int cons, prod, wsize, written;
+ char * wstart;
+
+ fprintf(stderr, "Dumping memory buffer.\n");
+
+ cons = membuf.cons % membuf.size;
+ prod = membuf.prod % membuf.size;
+
+ if(prod > cons)
+ {
+ /* Write in one go */
+ wstart = membuf.buf + cons;
+ wsize = prod - cons;
+
+ written = write(outfd, wstart, wsize);
+ if ( written != wsize )
+ goto fail;
+ }
+ else
+ {
+ /* Write in two pieces: cons->end, beginning->prod. */
+ wstart = membuf.buf + cons;
+ wsize = membuf.size - cons;
+
+ written = write(outfd, wstart, wsize);
+ if ( written != wsize )
+ {
+ fprintf(stderr, "Write failed! (size %d, returned %d)\n",
+ wsize, written);
+ goto fail;
+ }
+
+ wstart = membuf.buf;
+ wsize = prod;
+
+ written = write(outfd, wstart, wsize);
+ if ( written != wsize )
+ {
+ fprintf(stderr, "Write failed! (size %d, returned %d)\n",
+ wsize, written);
+ goto fail;
+ }
+ }
+
+ membuf.cons = membuf.prod = 0;
+
+ return;
+fail:
+ exit(1);
+ return;
}

/**
@@ -85,20 +319,20 @@ static void close_handler(int signal)
* of the buffer write.
*/
static void write_buffer(unsigned int cpu, unsigned char *start, int size,
- int total_size, int outfd)
+ int total_size)
{
struct statvfs stat;
size_t written = 0;

- if ( opts.disk_rsvd != 0 )
+ if ( opts.memory_buffer == 0 && opts.disk_rsvd != 0 )
{
unsigned long long freespace;

/* Check that filesystem has enough space. */
if ( fstatvfs (outfd, &stat) )
{
- fprintf(stderr, "Statfs failed!\n");
- goto fail;
+ fprintf(stderr, "Statfs failed!\n");
+ goto fail;
}

freespace = stat.f_frsize * (unsigned long long)stat.f_bfree;
@@ -112,8 +346,8 @@ static void write_buffer(unsigned int cp

if ( freespace <= opts.disk_rsvd )
{
- fprintf(stderr, "Disk space limit reached (free space: %lluMB, limit: %luMB).\n", freespace, opts.disk_rsvd);
- exit (EXIT_FAILURE);
+ fprintf(stderr, "Disk space limit reached (free space: %lluMB, limit: %luMB).\n", freespace, opts.disk_rsvd);
+ exit (EXIT_FAILURE);
}
}

@@ -122,40 +356,46 @@ static void write_buffer(unsigned int cp
* first write. */
if ( total_size != 0 )
{
- struct {
- uint32_t header;
- struct {
- unsigned cpu;
- unsigned byte_count;
- } extra;
- } rec;
-
- rec.header = TRC_TRACE_CPU_CHANGE
- | ((sizeof(rec.extra)/sizeof(uint32_t)) << TRACE_EXTRA_SHIFT);
- rec.extra.cpu = cpu;
- rec.extra.byte_count = total_size;
-
- written = write(outfd, &rec, sizeof(rec));
-
- if ( written != sizeof(rec) )
- {
- fprintf(stderr, "Cannot write cpu change (write returned %zd)\n",
- written);
+ if ( opts.memory_buffer )
+ {
+ membuf_reserve_window(cpu, total_size);
+ }
+ else
+ {
+ struct cpu_change_record rec;
+
+ rec.header = CPU_CHANGE_HEADER;
+ rec.data.cpu = cpu;
+ rec.data.window_size = total_size;
+
+ written = write(outfd, &rec, sizeof(rec));
+ if ( written != sizeof(rec) )
+ {
+ fprintf(stderr, "Cannot write cpu change (write returned %zd)\n",
+ written);
+ goto fail;
+ }
+ }
+ }
+
+ if ( opts.memory_buffer )
+ {
+ membuf_write(start, size);
+ }
+ else
+ {
+ written = write(outfd, start, size);
+ if ( written != size )
+ {
+ fprintf(stderr, "Write failed! (size %d, returned %zd)\n",
+ size, written);
goto fail;
}
}

- written = write(outfd, start, size);
- if ( written != size )
- {
- fprintf(stderr, "Write failed! (size %d, returned %zd)\n",
- size, written);
- goto fail;
- }
-
return;

- fail:
+fail:
PERROR("Failed to write trace data");
exit(EXIT_FAILURE);
}
@@ -394,7 +634,7 @@ static void wait_for_event_or_timeout(un
* monitor_tbufs - monitor the contents of tbufs and output to a file
* @logfile: the FILE * representing the file to log to
*/
-static int monitor_tbufs(int outfd)
+static int monitor_tbufs(void)
{
int i;

@@ -429,9 +669,9 @@ static int monitor_tbufs(int outfd)
meta[i]->cons = meta[i]->prod;

/* now, scan buffers for events */
- while ( !interrupted )
- {
- for ( i = 0; (i < num) && !interrupted; i++ )
+ while ( 1 )
+ {
+ for ( i = 0; i < num; i++ )
{
unsigned long start_offset, end_offset, window_size, cons, prod;

@@ -463,8 +703,7 @@ static int monitor_tbufs(int outfd)
/* If window does not wrap, write in one big chunk */
write_buffer(i, data[i]+start_offset,
window_size,
- window_size,
- outfd);
+ window_size);
}
else
{
@@ -474,23 +713,28 @@ static int monitor_tbufs(int outfd)
*/
write_buffer(i, data[i] + start_offset,
data_size - start_offset,
- window_size,
- outfd);
+ window_size);
write_buffer(i, data[i],
end_offset,
- 0,
- outfd);
+ 0);
}

xen_mb(); /* read buffer, then update cons. */
meta[i]->cons = prod;
- }
+
+ }
+
+ if ( interrupted )
+ break;

wait_for_event_or_timeout(opts.poll_sleep);
}

- if(opts.disable_tracing)
+ if ( opts.disable_tracing )
disable_tbufs();
+
+ if ( opts.memory_buffer )
+ membuf_dump();

/* cleanup */
free(meta);
@@ -538,6 +782,8 @@ static void usage(void)
" -T --time-interval=s Run xentrace for s seconds and quit.\n" \
" -?, --help Show this message\n" \
" -V, --version Print program version\n" \
+" -M, --memory-buffer=b Copy trace records to a circular memory buffer.\n" \
+" Dump to file on exit.\n" \
"\n" \
"This tool is used to capture trace buffer data from Xen. The\n" \
"data is output in a binary format, in the following order:\n" \
@@ -551,6 +797,53 @@ static void usage(void)
printf("\nReport bugs to %s\n", program_bug_address);

exit(EXIT_FAILURE);
+}
+
+/* convert the argument string pointed to by arg to a long int representation,
+ * including suffixes such as 'M' and 'k'. */
+#define MB (1024*1024)
+#define KB (1024)
+long sargtol(const char *restrict arg, int base)
+{
+ char *endp;
+ long val;
+
+ errno = 0;
+ val = strtol(arg, &endp, base);
+
+ if ( errno != 0 )
+ {
+ fprintf(stderr, "Invalid option argument: %s\n", arg);
+ fprintf(stderr, "Error: %s\n\n", strerror(errno));
+ usage();
+ }
+ else if (endp == arg)
+ {
+ goto invalid;
+ }
+
+ switch(*endp)
+ {
+ case '\0':
+ break;
+ case 'M':
+ val *= MB;
+ break;
+ case 'K':
+ case 'k':
+ val *= KB;
+ break;
+ default:
+ fprintf(stderr, "Unknown suffix %c\n", *endp);
+ exit(1);
+ }
+
+
+ return val;
+invalid:
+ return 0;
+ fprintf(stderr, "Invalid option argument: %s\n\n", arg);
+ usage();
}

/* convert the argument string pointed to by arg to a long int representation */
@@ -606,6 +899,7 @@ static void parse_args(int argc, char **
{ "trace-buf-size", required_argument, 0, 'S' },
{ "reserve-disk-space", required_argument, 0, 'r' },
{ "time-interval", required_argument, 0, 'T' },
+ { "memory-buffer", required_argument, 0, 'M' },
{ "discard-buffers", no_argument, 0, 'D' },
{ "dont-disable-tracing", no_argument, 0, 'x' },
{ "help", no_argument, 0, '?' },
@@ -613,7 +907,7 @@ static void parse_args(int argc, char **
{ 0, 0, 0, 0 }
};

- while ( (option = getopt_long(argc, argv, "c:e:s:S:t:?V",
+ while ( (option = getopt_long(argc, argv, "t:s:c:e:S:r:T:M:Dx?V",
long_options, NULL)) != -1)
{
switch ( option )
@@ -653,6 +947,10 @@ static void parse_args(int argc, char **

case 'T':
opts.timeout = argtol(optarg, 0);
+ break;
+
+ case 'M':
+ opts.memory_buffer = sargtol(optarg, 0);
break;

default:
@@ -674,7 +972,7 @@ static void parse_args(int argc, char **

int main(int argc, char **argv)
{
- int outfd = 1, ret;
+ int ret;
struct sigaction act;

opts.outfile = 0;
@@ -719,6 +1017,9 @@ int main(int argc, char **argv)
fprintf(stderr, "Cannot output to a TTY, specify a log file.\n");
exit(EXIT_FAILURE);
}
+
+ if ( opts.memory_buffer > 0 )
+ membuf_alloc(opts.memory_buffer);

/* ensure that if we get a signal, we'll do cleanup, then exit */
act.sa_handler = close_handler;
@@ -729,7 +1030,7 @@ int main(int argc, char **argv)
sigaction(SIGINT, &act, NULL);
sigaction(SIGALRM, &act, NULL);

- ret = monitor_tbufs(outfd);
+ ret = monitor_tbufs();

return ret;
}
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/acpi/Makefile
--- a/xen/arch/x86/acpi/Makefile Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/acpi/Makefile Fri Sep 12 14:47:40 2008 +0900
@@ -1,5 +1,5 @@ subdir-y += cpufreq
subdir-y += cpufreq

obj-y += boot.o
-obj-y += power.o suspend.o wakeup_prot.o cpu_idle.o
+obj-y += power.o suspend.o wakeup_prot.o cpu_idle.o cpuidle_menu.o
obj-y += pmstat.o
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/acpi/cpu_idle.c
--- a/xen/arch/x86/acpi/cpu_idle.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/acpi/cpu_idle.c Fri Sep 12 14:47:40 2008 +0900
@@ -39,6 +39,7 @@
#include <xen/smp.h>
#include <xen/guest_access.h>
#include <xen/keyhandler.h>
+#include <xen/cpuidle.h>
#include <asm/cache.h>
#include <asm/io.h>
#include <asm/hpet.h>
@@ -49,12 +50,9 @@
#define DEBUG_PM_CX

#define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define PM_TIMER_TICKS_TO_US(t) ((t * 1000) / (PM_TIMER_FREQUENCY / 1000))
#define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */
#define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */
-
-#define ACPI_PROCESSOR_MAX_POWER 8
-#define ACPI_PROCESSOR_MAX_C2_LATENCY 100
-#define ACPI_PROCESSOR_MAX_C3_LATENCY 1000

static void (*lapic_timer_off)(void);
static void (*lapic_timer_on)(void);
@@ -65,66 +63,6 @@ static void (*pm_idle_save) (void) __rea
static void (*pm_idle_save) (void) __read_mostly;
unsigned int max_cstate __read_mostly = 2;
integer_param("max_cstate", max_cstate);
-/*
- * bm_history -- bit-mask with a bit per jiffy of bus-master activity
- * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms
- * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms
- * 100 HZ: 0x0000000F: 4 jiffies = 40ms
- * reduce history for more aggressive entry into C3
- */
-unsigned int bm_history __read_mostly =
- (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1));
-integer_param("bm_history", bm_history);
-
-struct acpi_processor_cx;
-
-struct acpi_processor_cx_policy
-{
- u32 count;
- struct acpi_processor_cx *state;
- struct
- {
- u32 time;
- u32 ticks;
- u32 count;
- u32 bm;
- } threshold;
-};
-
-struct acpi_processor_cx
-{
- u8 valid;
- u8 type;
- u32 address;
- u8 space_id;
- u32 latency;
- u32 latency_ticks;
- u32 power;
- u32 usage;
- u64 time;
- struct acpi_processor_cx_policy promotion;
- struct acpi_processor_cx_policy demotion;
-};
-
-struct acpi_processor_flags
-{
- u8 bm_control:1;
- u8 bm_check:1;
- u8 has_cst:1;
- u8 power_setup_done:1;
- u8 bm_rld_set:1;
-};
-
-struct acpi_processor_power
-{
- struct acpi_processor_flags flags;
- struct acpi_processor_cx *state;
- s_time_t bm_check_timestamp;
- u32 default_state;
- u32 bm_activity;
- u32 count;
- struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
-};

static struct acpi_processor_power processor_powers[NR_CPUS];

@@ -133,26 +71,21 @@ static void print_acpi_power(uint32_t cp
uint32_t i;

printk("==cpu%d==\n", cpu);
- printk("active state:\t\tC%d\n", (power->state)?power->state->type:-1);
+ printk("active state:\t\tC%d\n",
+ (power->last_state) ? power->last_state->type : -1);
printk("max_cstate:\t\tC%d\n", max_cstate);
- printk("bus master activity:\t%08x\n", power->bm_activity);
printk("states:\n");

for ( i = 1; i < power->count; i++ )
{
- printk((power->states[i].type == power->state->type) ? " *" : " ");
+ if ( power->last_state &&
+ power->states[i].type == power->last_state->type )
+ printk(" *");
+ else
+ printk(" ");
printk("C%d:\t\t", i);
printk("type[C%d] ", power->states[i].type);
- if ( power->states[i].promotion.state )
- printk("promotion[C%d] ", power->states[i].promotion.state->type);
- else
- printk("promotion[--] ");
- if ( power->states[i].demotion.state )
- printk("demotion[C%d] ", power->states[i].demotion.state->type);
- else
- printk("demotion[--] ");
- printk("latency[%03d]\n ", power->states[i].latency);
- printk("\t\t\t");
+ printk("latency[%03d] ", power->states[i].latency);
printk("usage[%08d] ", power->states[i].usage);
printk("duration[%"PRId64"]\n", power->states[i].time);
}
@@ -180,48 +113,6 @@ static inline u32 ticks_elapsed(u32 t1,
return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
else
return ((0xFFFFFFFF - t1) + t2);
-}
-
-static void acpi_processor_power_activate(struct acpi_processor_power *power,
- struct acpi_processor_cx *new)
-{
- struct acpi_processor_cx *old;
-
- if ( !power || !new )
- return;
-
- old = power->state;
-
- if ( old )
- old->promotion.count = 0;
- new->demotion.count = 0;
-
- /* Cleanup from old state. */
- if ( old )
- {
- switch ( old->type )
- {
- case ACPI_STATE_C3:
- /* Disable bus master reload */
- if ( new->type != ACPI_STATE_C3 && power->flags.bm_check )
- acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
- break;
- }
- }
-
- /* Prepare to use new state. */
- switch ( new->type )
- {
- case ACPI_STATE_C3:
- /* Enable bus master reload */
- if ( old->type != ACPI_STATE_C3 && power->flags.bm_check )
- acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
- break;
- }
-
- power->state = new;
-
- return;
}

static void acpi_safe_halt(void)
@@ -263,13 +154,50 @@ static void acpi_idle_do_entry(struct ac
}
}

-static atomic_t c3_cpu_count;
+static inline void acpi_idle_update_bm_rld(struct acpi_processor_power *power,
+ struct acpi_processor_cx *target)
+{
+ if ( !power->flags.bm_check )
+ return;
+
+ if ( power->flags.bm_rld_set && target->type != ACPI_STATE_C3 )
+ {
+ acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
+ power->flags.bm_rld_set = 0;
+ }
+
+ if ( !power->flags.bm_rld_set && target->type == ACPI_STATE_C3 )
+ {
+ acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
+ power->flags.bm_rld_set = 1;
+ }
+}
+
+static int acpi_idle_bm_check(void)
+{
+ u32 bm_status = 0;
+
+ acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
+ if ( bm_status )
+ acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
+ /*
+ * TBD: PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
+ * the true state of bus mastering activity; forcing us to
+ * manually check the BMIDEA bit of each IDE channel.
+ */
+ return bm_status;
+}
+
+static struct {
+ spinlock_t lock;
+ unsigned int count;
+} c3_cpu_status = { .lock = SPIN_LOCK_UNLOCKED };

static void acpi_processor_idle(void)
{
struct acpi_processor_power *power = NULL;
struct acpi_processor_cx *cx = NULL;
- struct acpi_processor_cx *next_state = NULL;
+ int next_state;
int sleep_ticks = 0;
u32 t1, t2 = 0;

@@ -287,7 +215,16 @@ static void acpi_processor_idle(void)
return;
}

- cx = power->state;
+ next_state = cpuidle_current_governor->select(power);
+ if ( next_state > 0 )
+ {
+ cx = &power->states[next_state];
+ if ( power->flags.bm_check && acpi_idle_bm_check()
+ && cx->type == ACPI_STATE_C3 )
+ cx = power->safe_state;
+ if ( cx->type > max_cstate )
+ cx = &power->states[max_cstate];
+ }
if ( !cx )
{
if ( pm_idle_save )
@@ -303,69 +240,14 @@ static void acpi_processor_idle(void)
return;
}

- /*
- * Check BM Activity
- * -----------------
- * Check for bus mastering activity (if required), record, and check
- * for demotion.
- */
- if ( power->flags.bm_check )
- {
- u32 bm_status = 0;
- unsigned long diff = (NOW() - power->bm_check_timestamp) >> 23;
-
- if ( diff > 31 )
- diff = 31;
-
- power->bm_activity <<= diff;
-
- acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
- if ( bm_status )
- {
- power->bm_activity |= 0x1;
- acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
- }
- /*
- * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
- * the true state of bus mastering activity; forcing us to
- * manually check the BMIDEA bit of each IDE channel.
- */
- /*else if ( errata.piix4.bmisx )
- {
- if ( (inb_p(errata.piix4.bmisx + 0x02) & 0x01)
- || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01) )
- pr->power.bm_activity |= 0x1;
- }*/
-
- power->bm_check_timestamp = NOW();
-
- /*
- * If bus mastering is or was active this jiffy, demote
- * to avoid a faulty transition. Note that the processor
- * won't enter a low-power state during this call (to this
- * function) but should upon the next.
- *
- * TBD: A better policy might be to fallback to the demotion
- * state (use it for this quantum only) istead of
- * demoting -- and rely on duration as our sole demotion
- * qualification. This may, however, introduce DMA
- * issues (e.g. floppy DMA transfer overrun/underrun).
- */
- if ( (power->bm_activity & 0x1) && cx->demotion.threshold.bm )
- {
- local_irq_enable();
- next_state = cx->demotion.state;
- goto end;
- }
- }
+ power->last_state = cx;

/*
* Sleep:
* ------
* Invoke the current Cx state to put the processor to sleep.
*/
- if ( cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3 )
- smp_mb__after_clear_bit();
+ acpi_idle_update_bm_rld(power, cx);

switch ( cx->type )
{
@@ -399,8 +281,7 @@ static void acpi_processor_idle(void)
/* Re-enable interrupts */
local_irq_enable();
/* Compute time (ticks) that we were actually asleep */
- sleep_ticks =
- ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
+ sleep_ticks = ticks_elapsed(t1, t2);
break;

case ACPI_STATE_C3:
@@ -416,8 +297,8 @@ static void acpi_processor_idle(void)
*/
if ( power->flags.bm_check && power->flags.bm_control )
{
- atomic_inc(&c3_cpu_count);
- if ( atomic_read(&c3_cpu_count) == num_online_cpus() )
+ spin_lock(&c3_cpu_status.lock);
+ if ( ++c3_cpu_status.count == num_online_cpus() )
{
/*
* All CPUs are trying to go to C3
@@ -425,6 +306,7 @@ static void acpi_processor_idle(void)
*/
acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
}
+ spin_unlock(&c3_cpu_status.lock);
}
else if ( !power->flags.bm_check )
{
@@ -455,8 +337,10 @@ static void acpi_processor_idle(void)
if ( power->flags.bm_check && power->flags.bm_control )
{
/* Enable bus master arbitration */
- atomic_dec(&c3_cpu_count);
- acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
+ spin_lock(&c3_cpu_status.lock);
+ if ( c3_cpu_status.count-- == num_online_cpus() )
+ acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
+ spin_unlock(&c3_cpu_status.lock);
}

/* Re-enable interrupts */
@@ -465,8 +349,6 @@ static void acpi_processor_idle(void)
lapic_timer_on();
/* Compute time (ticks) that we were actually asleep */
sleep_ticks = ticks_elapsed(t1, t2);
- /* Do not account our idle-switching overhead: */
- sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;

break;

@@ -476,163 +358,14 @@ static void acpi_processor_idle(void)
}

cx->usage++;
- if ( (cx->type != ACPI_STATE_C1) && (sleep_ticks > 0) )
+ if ( sleep_ticks > 0 )
+ {
+ power->last_residency = PM_TIMER_TICKS_TO_US(sleep_ticks);
cx->time += sleep_ticks;
-
- next_state = power->state;
-
- /*
- * Promotion?
- * ----------
- * Track the number of longs (time asleep is greater than threshold)
- * and promote when the count threshold is reached. Note that bus
- * mastering activity may prevent promotions.
- * Do not promote above max_cstate.
- */
- if ( cx->promotion.state &&
- ((cx->promotion.state - power->states) <= max_cstate) )
- {
- if ( sleep_ticks > cx->promotion.threshold.ticks )
- {
- cx->promotion.count++;
- cx->demotion.count = 0;
- if ( cx->promotion.count >= cx->promotion.threshold.count )
- {
- if ( power->flags.bm_check )
- {
- if ( !(power->bm_activity & cx->promotion.threshold.bm) )
- {
- next_state = cx->promotion.state;
- goto end;
- }
- }
- else
- {
- next_state = cx->promotion.state;
- goto end;
- }
- }
- }
- }
-
- /*
- * Demotion?
- * ---------
- * Track the number of shorts (time asleep is less than time threshold)
- * and demote when the usage threshold is reached.
- */
- if ( cx->demotion.state )
- {
- if ( sleep_ticks < cx->demotion.threshold.ticks )
- {
- cx->demotion.count++;
- cx->promotion.count = 0;
- if ( cx->demotion.count >= cx->demotion.threshold.count )
- {
- next_state = cx->demotion.state;
- goto end;
- }
- }
- }
-
-end:
- /*
- * Demote if current state exceeds max_cstate
- */
- if ( (power->state - power->states) > max_cstate )
- {
- if ( cx->demotion.state )
- next_state = cx->demotion.state;
- }
-
- /*
- * New Cx State?
- * -------------
- * If we're going to start using a new Cx state we must clean up
- * from the previous and prepare to use the new.
- */
- if ( next_state != power->state )
- acpi_processor_power_activate(power, next_state);
-}
-
-static int acpi_processor_set_power_policy(struct acpi_processor_power *power)
-{
- unsigned int i;
- unsigned int state_is_set = 0;
- struct acpi_processor_cx *lower = NULL;
- struct acpi_processor_cx *higher = NULL;
- struct acpi_processor_cx *cx;
-
- if ( !power )
- return -EINVAL;
-
- /*
- * This function sets the default Cx state policy (OS idle handler).
- * Our scheme is to promote quickly to C2 but more conservatively
- * to C3. We're favoring C2 for its characteristics of low latency
- * (quick response), good power savings, and ability to allow bus
- * mastering activity. Note that the Cx state policy is completely
- * customizable and can be altered dynamically.
- */
-
- /* startup state */
- for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
- {
- cx = &power->states[i];
- if ( !cx->valid )
- continue;
-
- if ( !state_is_set )
- power->state = cx;
- state_is_set++;
- break;
- }
-
- if ( !state_is_set )
- return -ENODEV;
-
- /* demotion */
- for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
- {
- cx = &power->states[i];
- if ( !cx->valid )
- continue;
-
- if ( lower )
- {
- cx->demotion.state = lower;
- cx->demotion.threshold.ticks = cx->latency_ticks;
- cx->demotion.threshold.count = 1;
- if ( cx->type == ACPI_STATE_C3 )
- cx->demotion.threshold.bm = bm_history;
- }
-
- lower = cx;
- }
-
- /* promotion */
- for ( i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i-- )
- {
- cx = &power->states[i];
- if ( !cx->valid )
- continue;
-
- if ( higher )
- {
- cx->promotion.state = higher;
- cx->promotion.threshold.ticks = cx->latency_ticks;
- if ( cx->type >= ACPI_STATE_C2 )
- cx->promotion.threshold.count = 4;
- else
- cx->promotion.threshold.count = 10;
- if ( higher->type == ACPI_STATE_C3 )
- cx->promotion.threshold.bm = bm_history;
- }
-
- higher = cx;
- }
-
- return 0;
+ }
+
+ if ( cpuidle_current_governor->reflect )
+ cpuidle_current_governor->reflect(power);
}

static int init_cx_pminfo(struct acpi_processor_power *acpi_power)
@@ -821,6 +554,8 @@ static int check_cx(struct acpi_processo
return 0;
}

+static unsigned int latency_factor = 2;
+
static void set_cx(
struct acpi_processor_power *acpi_power,
xen_processor_cx_t *xen_cx)
@@ -842,6 +577,9 @@ static void set_cx(
cx->power = xen_cx->power;

cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
+ cx->target_residency = cx->latency * latency_factor;
+ if ( cx->type == ACPI_STATE_C1 || cx->type == ACPI_STATE_C2 )
+ acpi_power->safe_state = cx;
}

int get_cpu_id(u8 acpi_id)
@@ -936,6 +674,7 @@ long set_cx_pminfo(uint32_t cpu, struct

init_cx_pminfo(acpi_power);

+ acpi_power->cpu = cpu_id;
acpi_power->flags.bm_check = power->flags.bm_check;
acpi_power->flags.bm_control = power->flags.bm_control;
acpi_power->flags.has_cst = power->flags.has_cst;
@@ -950,10 +689,11 @@ long set_cx_pminfo(uint32_t cpu, struct
set_cx(acpi_power, &xen_cx);
}

+ if ( cpuidle_current_governor->enable &&
+ cpuidle_current_governor->enable(acpi_power) )
+ return -EFAULT;
+
/* FIXME: C-state dependency is not supported by far */
-
- /* initialize default policy */
- acpi_processor_set_power_policy(acpi_power);

print_acpi_power(cpu_id, acpi_power);

@@ -978,7 +718,7 @@ int pmstat_get_cx_stat(uint32_t cpuid, s
uint64_t usage;
int i;

- stat->last = (power->state) ? power->state->type : 0;
+ stat->last = (power->last_state) ? power->last_state->type : 0;
stat->nr = processor_powers[cpuid].count;
stat->idle_time = v->runstate.time[RUNSTATE_running];
if ( v->is_running )
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/acpi/cpufreq/cpufreq.c
--- a/xen/arch/x86/acpi/cpufreq/cpufreq.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c Fri Sep 12 14:47:40 2008 +0900
@@ -48,7 +48,7 @@ struct cpufreq_policy xen_px_policy[NR_C
struct cpufreq_policy xen_px_policy[NR_CPUS];

static cpumask_t *cpufreq_dom_pt;
-static cpumask_t cpufreq_dom_mask;
+static unsigned long *cpufreq_dom_mask;
static unsigned int cpufreq_dom_max;

enum {
@@ -562,7 +562,8 @@ void cpufreq_dom_exit(void)
void cpufreq_dom_exit(void)
{
cpufreq_dom_max = 0;
- cpus_clear(cpufreq_dom_mask);
+ if (cpufreq_dom_mask)
+ xfree(cpufreq_dom_mask);
if (cpufreq_dom_pt)
xfree(cpufreq_dom_pt);
}
@@ -572,22 +573,28 @@ int cpufreq_dom_init(void)
unsigned int i;

cpufreq_dom_max = 0;
- cpus_clear(cpufreq_dom_mask);

for_each_online_cpu(i) {
- cpu_set(processor_pminfo[i].perf.domain_info.domain, cpufreq_dom_mask);
if (cpufreq_dom_max < processor_pminfo[i].perf.domain_info.domain)
cpufreq_dom_max = processor_pminfo[i].perf.domain_info.domain;
}
cpufreq_dom_max++;
+
+ cpufreq_dom_mask = xmalloc_array(unsigned long,
+ BITS_TO_LONGS(cpufreq_dom_max));
+ if (!cpufreq_dom_mask)
+ return -ENOMEM;
+ bitmap_zero(cpufreq_dom_mask, cpufreq_dom_max);

cpufreq_dom_pt = xmalloc_array(cpumask_t, cpufreq_dom_max);
if (!cpufreq_dom_pt)
return -ENOMEM;
memset(cpufreq_dom_pt, 0, cpufreq_dom_max * sizeof(cpumask_t));

- for_each_online_cpu(i)
+ for_each_online_cpu(i) {
+ __set_bit(processor_pminfo[i].perf.domain_info.domain, cpufreq_dom_mask);
cpu_set(i, cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain]);
+ }

for_each_online_cpu(i)
processor_pminfo[i].perf.shared_cpu_map =
@@ -616,10 +623,11 @@ static int cpufreq_cpu_init(void)

int cpufreq_dom_dbs(unsigned int event)
{
- int cpu, dom, ret = 0;
-
- for (dom=0; dom<cpufreq_dom_max; dom++) {
- if (!cpu_isset(dom, cpufreq_dom_mask))
+ unsigned int cpu, dom;
+ int ret = 0;
+
+ for (dom = 0; dom < cpufreq_dom_max; dom++) {
+ if (!test_bit(dom, cpufreq_dom_mask))
continue;
cpu = first_cpu(cpufreq_dom_pt[dom]);
ret = cpufreq_governor_dbs(&xen_px_policy[cpu], event);
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/acpi/cpufreq/powernow.c
--- a/xen/arch/x86/acpi/cpufreq/powernow.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/powernow.c Fri Sep 12 14:47:40 2008 +0900
@@ -197,8 +197,8 @@ static int powernow_cpufreq_cpu_init(str

data->max_freq = perf->states[0].core_frequency * 1000;
/* table init */
- for (i=0; i<perf->state_count && i<max_hw_pstate; i++) {
- if (i>0 && perf->states[i].core_frequency >=
+ for (i = 0; i < perf->state_count && i <= max_hw_pstate; i++) {
+ if (i > 0 && perf->states[i].core_frequency >=
data->freq_table[valid_states-1].frequency / 1000)
continue;

diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/acpi/cpuidle_menu.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/acpi/cpuidle_menu.c Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,132 @@
+/*
+ * cpuidle_menu - menu governor for cpu idle, main idea comes from Linux.
+ * drivers/cpuidle/governors/menu.c
+ *
+ * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
+ * Copyright (C) 2007, 2008 Intel Corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <xen/lib.h>
+#include <xen/types.h>
+#include <xen/acpi.h>
+#include <xen/timer.h>
+#include <xen/cpuidle.h>
+
+#define BREAK_FUZZ 4 /* 4 us */
+#define USEC_PER_SEC 1000000
+
+struct menu_device
+{
+ int last_state_idx;
+ unsigned int expected_us;
+ unsigned int predicted_us;
+ unsigned int last_measured_us;
+ unsigned int elapsed_us;
+};
+
+static DEFINE_PER_CPU(struct menu_device, menu_devices);
+
+static s_time_t get_sleep_length_ns(void)
+{
+ return per_cpu(timer_deadline, smp_processor_id()) - NOW();
+}
+
+static int menu_select(struct acpi_processor_power *power)
+{
+ struct menu_device *data = &__get_cpu_var(menu_devices);
+ int i;
+
+ /* determine the expected residency time */
+ data->expected_us = (u32) get_sleep_length_ns() / 1000;
+
+ /* find the deepest idle state that satisfies our constraints */
+ for ( i = 1; i < power->count; i++ )
+ {
+ struct acpi_processor_cx *s = &power->states[i];
+
+ if ( s->target_residency > data->expected_us + s->latency )
+ break;
+ if ( s->target_residency > data->predicted_us )
+ break;
+ /* TBD: we need to check the QoS requirement in future */
+ }
+
+ data->last_state_idx = i - 1;
+ return i - 1;
+}
+
+static void menu_reflect(struct acpi_processor_power *power)
+{
+ struct menu_device *data = &__get_cpu_var(menu_devices);
+ struct acpi_processor_cx *target = &power->states[data->last_state_idx];
+ unsigned int last_residency;
+ unsigned int measured_us;
+
+ /*
+ * Ugh, this idle state doesn't support residency measurements, so we
+ * are basically lost in the dark. As a compromise, assume we slept
+ * for one full standard timer tick. However, be aware that this
+ * could potentially result in a suboptimal state transition.
+ */
+ if ( target->type == ACPI_STATE_C1 )
+ last_residency = USEC_PER_SEC / HZ;
+ else
+ last_residency = power->last_residency;
+
+ measured_us = last_residency + data->elapsed_us;
+
+ /* if wrapping, set to max uint (-1) */
+ measured_us = data->elapsed_us <= measured_us ? measured_us : -1;
+
+ /* Predict time remaining until next break event */
+ data->predicted_us = max(measured_us, data->last_measured_us);
+
+ /* Distinguish between expected & non-expected events */
+ if ( last_residency + BREAK_FUZZ
+ < data->expected_us + target->latency )
+ {
+ data->last_measured_us = measured_us;
+ data->elapsed_us = 0;
+ }
+ else
+ data->elapsed_us = measured_us;
+}
+
+static int menu_enable_device(struct acpi_processor_power *power)
+{
+ struct menu_device *data = &per_cpu(menu_devices, power->cpu);
+
+ memset(data, 0, sizeof(struct menu_device));
+
+ return 0;
+}
+
+static struct cpuidle_governor menu_governor =
+{
+ .name = "menu",
+ .rating = 20,
+ .enable = menu_enable_device,
+ .select = menu_select,
+ .reflect = menu_reflect,
+};
+
+struct cpuidle_governor *cpuidle_current_governor = &menu_governor;
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/domain.c Fri Sep 12 14:47:40 2008 +0900
@@ -31,6 +31,7 @@
#include <xen/compat.h>
#include <xen/acpi.h>
#include <xen/pci.h>
+#include <xen/paging.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
@@ -40,7 +41,6 @@
#include <asm/i387.h>
#include <asm/mpspec.h>
#include <asm/ldt.h>
-#include <asm/paging.h>
#include <asm/hypercall.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
@@ -302,7 +302,8 @@ int vcpu_initialise(struct vcpu *v)
else
{
/* PV guests by default have a 100Hz ticker. */
- v->periodic_period = MILLISECS(10);
+ if ( !is_idle_domain(d) )
+ v->periodic_period = MILLISECS(10);

/* PV guests get an emulated PIT too for video BIOSes to use. */
if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
@@ -1645,23 +1646,26 @@ static int relinquish_memory(

/*
* Forcibly invalidate top-most, still valid page tables at this point
- * to break circular 'linear page table' references. This is okay
- * because MMU structures are not shared across domains and this domain
- * is now dead. Thus top-most valid tables are not in use so a non-zero
- * count means circular reference.
+ * to break circular 'linear page table' references as well as clean up
+ * partially validated pages. This is okay because MMU structures are
+ * not shared across domains and this domain is now dead. Thus top-most
+ * valid tables are not in use so a non-zero count means circular
+ * reference or partially validated.
*/
y = page->u.inuse.type_info;
for ( ; ; )
{
x = y;
- if ( likely((x & (PGT_type_mask|PGT_validated)) !=
- (type|PGT_validated)) )
+ if ( likely((x & PGT_type_mask) != type) ||
+ likely(!(x & (PGT_validated|PGT_partial))) )
break;

- y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
+ y = cmpxchg(&page->u.inuse.type_info, x,
+ x & ~(PGT_validated|PGT_partial));
if ( likely(y == x) )
{
- free_page_type(page, type);
+ if ( free_page_type(page, x, 0) != 0 )
+ BUG();
break;
}
}
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/domain_build.c Fri Sep 12 14:47:40 2008 +0900
@@ -26,6 +26,7 @@
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/paging.h>
+#include <asm/p2m.h>
#include <asm/e820.h>

#include <public/version.h>
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/domctl.c Fri Sep 12 14:47:40 2008 +0900
@@ -20,7 +20,7 @@
#include <xen/trace.h>
#include <xen/console.h>
#include <xen/iocap.h>
-#include <asm/paging.h>
+#include <xen/paging.h>
#include <asm/irq.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
@@ -67,14 +67,6 @@ long arch_do_domctl(
ret = -ESRCH;
if ( unlikely((d = rcu_lock_domain_by_id(domctl->domain)) == NULL) )
break;
-
- ret = xsm_ioport_permission(d, fp,
- domctl->u.ioport_permission.allow_access);
- if ( ret )
- {
- rcu_unlock_domain(d);
- break;
- }

if ( np == 0 )
ret = 0;
@@ -550,6 +542,10 @@ long arch_do_domctl(
if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
break;

+ ret = xsm_sendtrigger(d);
+ if ( ret )
+ goto sendtrigger_out;
+
ret = -EINVAL;
if ( domctl->u.sendtrigger.vcpu >= MAX_VIRT_CPUS )
goto sendtrigger_out;
@@ -628,6 +624,10 @@ long arch_do_domctl(
bus = (domctl->u.assign_device.machine_bdf >> 16) & 0xff;
devfn = (domctl->u.assign_device.machine_bdf >> 8) & 0xff;

+ ret = xsm_test_assign_device(domctl->u.assign_device.machine_bdf);
+ if ( ret )
+ break;
+
if ( device_assigned(bus, devfn) )
{
gdprintk(XENLOG_ERR, "XEN_DOMCTL_test_assign_device: "
@@ -655,6 +655,11 @@ long arch_do_domctl(
"XEN_DOMCTL_assign_device: get_domain_by_id() failed\n");
break;
}
+
+ ret = xsm_assign_device(d, domctl->u.assign_device.machine_bdf);
+ if ( ret )
+ goto assign_device_out;
+
bus = (domctl->u.assign_device.machine_bdf >> 16) & 0xff;
devfn = (domctl->u.assign_device.machine_bdf >> 8) & 0xff;

@@ -680,6 +685,7 @@ long arch_do_domctl(
"assign device (%x:%x:%x) failed\n",
bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

+ assign_device_out:
put_domain(d);
}
break;
@@ -700,6 +706,11 @@ long arch_do_domctl(
"XEN_DOMCTL_deassign_device: get_domain_by_id() failed\n");
break;
}
+
+ ret = xsm_assign_device(d, domctl->u.assign_device.machine_bdf);
+ if ( ret )
+ goto deassign_device_out;
+
bus = (domctl->u.assign_device.machine_bdf >> 16) & 0xff;
devfn = (domctl->u.assign_device.machine_bdf >> 8) & 0xff;

@@ -720,6 +731,8 @@ long arch_do_domctl(
deassign_device(d, bus, devfn);
gdprintk(XENLOG_INFO, "XEN_DOMCTL_deassign_device: bdf = %x:%x:%x\n",
bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+ deassign_device_out:
put_domain(d);
}
break;
@@ -733,10 +746,17 @@ long arch_do_domctl(
if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
break;
bind = &(domctl->u.bind_pt_irq);
+
+ ret = xsm_bind_pt_irq(d, bind);
+ if ( ret )
+ goto bind_out;
+
if ( iommu_enabled )
ret = pt_irq_create_bind_vtd(d, bind);
if ( ret < 0 )
gdprintk(XENLOG_ERR, "pt_irq_create_bind failed!\n");
+
+ bind_out:
rcu_unlock_domain(d);
}
break;
@@ -877,11 +897,16 @@ long arch_do_domctl(
if ( d == NULL )
break;

+ ret = xsm_pin_mem_cacheattr(d);
+ if ( ret )
+ goto pin_out;
+
ret = hvm_set_mem_pinned_cacheattr(
d, domctl->u.pin_mem_cacheattr.start,
domctl->u.pin_mem_cacheattr.end,
domctl->u.pin_mem_cacheattr.type);

+ pin_out:
rcu_unlock_domain(d);
}
break;
@@ -899,6 +924,10 @@ long arch_do_domctl(
d = rcu_lock_domain_by_id(domctl->domain);
if ( d == NULL )
break;
+
+ ret = xsm_ext_vcpucontext(d, domctl->cmd);
+ if ( ret )
+ goto ext_vcpucontext_out;

ret = -ESRCH;
if ( (evc->vcpu >= MAX_VIRT_CPUS) ||
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hpet.c
--- a/xen/arch/x86/hpet.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/hpet.c Fri Sep 12 14:47:40 2008 +0900
@@ -100,6 +100,13 @@ static int reprogram_hpet_evt_channel(

ch->next_event = expire;

+ if ( expire == STIME_MAX )
+ {
+ /* We assume it will take a long time for the timer to wrap. */
+ hpet_write32(0, HPET_T0_CMP);
+ return 0;
+ }
+
delta = min_t(int64_t, delta, MAX_DELTA_NS);
delta = max_t(int64_t, delta, MIN_DELTA_NS);
delta = ns2ticks(delta, ch->shift, ch->mult);
@@ -206,9 +213,11 @@ void hpet_broadcast_enter(void)
{
struct hpet_event_channel *ch = &hpet_event;

+ spin_lock(&ch->lock);
+
+ disable_APIC_timer();
+
cpu_set(smp_processor_id(), ch->cpumask);
-
- spin_lock(&ch->lock);

/* reprogram if current cpu expire time is nearer */
if ( this_cpu(timer_deadline) < ch->next_event )
@@ -222,8 +231,23 @@ void hpet_broadcast_exit(void)
struct hpet_event_channel *ch = &hpet_event;
int cpu = smp_processor_id();

+ spin_lock_irq(&ch->lock);
+
if ( cpu_test_and_clear(cpu, ch->cpumask) )
- reprogram_timer(per_cpu(timer_deadline, cpu));
+ {
+ /* Cancel any outstanding LAPIC event and re-enable interrupts. */
+ reprogram_timer(0);
+ enable_APIC_timer();
+
+ /* Reprogram the deadline; trigger timer work now if it has passed. */
+ if ( !reprogram_timer(per_cpu(timer_deadline, cpu)) )
+ raise_softirq(TIMER_SOFTIRQ);
+
+ if ( cpus_empty(ch->cpumask) && ch->next_event != STIME_MAX )
+ reprogram_hpet_evt_channel(ch, STIME_MAX, 0, 0);
+ }
+
+ spin_unlock_irq(&ch->lock);
}

int hpet_broadcast_is_available(void)
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/hvm/hvm.c Fri Sep 12 14:47:40 2008 +0900
@@ -31,10 +31,11 @@
#include <xen/hypercall.h>
#include <xen/guest_access.h>
#include <xen/event.h>
+#include <xen/paging.h>
+#include <asm/shadow.h>
#include <asm/current.h>
#include <asm/e820.h>
#include <asm/io.h>
-#include <asm/paging.h>
#include <asm/regs.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
@@ -772,7 +773,7 @@ void hvm_hlt(unsigned long rflags)

do_sched_op_compat(SCHEDOP_block, 0);

- HVMTRACE_1D(HLT, curr, /* pending = */ vcpu_runnable(curr));
+ HVMTRACE_1D(HLT, /* pending = */ vcpu_runnable(curr));
}

void hvm_triple_fault(void)
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hvm/svm/intr.c
--- a/xen/arch/x86/hvm/svm/intr.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/hvm/svm/intr.c Fri Sep 12 14:47:40 2008 +0900
@@ -80,7 +80,7 @@ static void enable_intr_window(struct vc

ASSERT(intack.source != hvm_intsrc_none);

- HVMTRACE_2D(INJ_VIRQ, v, 0x0, /*fake=*/ 1);
+ HVMTRACE_2D(INJ_VIRQ, 0x0, /*fake=*/ 1);

/*
* Create a dummy virtual interrupt to intercept as soon as the
@@ -199,7 +199,7 @@ asmlinkage void svm_intr_assist(void)
}
else
{
- HVMTRACE_2D(INJ_VIRQ, v, intack.vector, /*fake=*/ 0);
+ HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0);
svm_inject_extint(v, intack.vector);
pt_intr_post(v, intack);
}
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/hvm/svm/svm.c Fri Sep 12 14:47:40 2008 +0900
@@ -759,11 +759,11 @@ static void svm_inject_exception(
if ( trapnr == TRAP_page_fault )
{
vmcb->cr2 = curr->arch.hvm_vcpu.guest_cr[2] = cr2;
- HVMTRACE_LONG_2D(PF_INJECT, curr, errcode, TRC_PAR_LONG(cr2));
+ HVMTRACE_LONG_2D(PF_INJECT, errcode, TRC_PAR_LONG(cr2));
}
else
{
- HVMTRACE_2D(INJ_EXC, curr, trapnr, errcode);
+ HVMTRACE_2D(INJ_EXC, trapnr, errcode);
}

if ( (trapnr == TRAP_debug) &&
@@ -919,7 +919,7 @@ static void svm_cpuid_intercept(
__clear_bit(X86_FEATURE_APIC & 31, edx);
}

- HVMTRACE_5D (CPUID, v, input, *eax, *ebx, *ecx, *edx);
+ HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx);
}

static void svm_vmexit_do_cpuid(struct cpu_user_regs *regs)
@@ -946,7 +946,7 @@ static void svm_vmexit_do_cpuid(struct c

static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs)
{
- HVMTRACE_0D(DR_WRITE, v);
+ HVMTRACE_0D(DR_WRITE);
__restore_debug_registers(v);
}

@@ -1018,7 +1018,7 @@ static int svm_msr_read_intercept(struct
regs->edx = msr_content >> 32;

done:
- HVMTRACE_3D (MSR_READ, v, ecx, regs->eax, regs->edx);
+ HVMTRACE_3D (MSR_READ, ecx, regs->eax, regs->edx);
HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
ecx, (unsigned long)regs->eax, (unsigned long)regs->edx);
return X86EMUL_OKAY;
@@ -1037,7 +1037,7 @@ static int svm_msr_write_intercept(struc

msr_content = (u32)regs->eax | ((u64)regs->edx << 32);

- HVMTRACE_3D (MSR_WRITE, v, ecx, regs->eax, regs->edx);
+ HVMTRACE_3D (MSR_WRITE, ecx, regs->eax, regs->edx);

switch ( ecx )
{
@@ -1168,7 +1168,7 @@ static void svm_invlpg_intercept(unsigne
static void svm_invlpg_intercept(unsigned long vaddr)
{
struct vcpu *curr = current;
- HVMTRACE_LONG_2D(INVLPG, curr, 0, TRC_PAR_LONG(vaddr));
+ HVMTRACE_LONG_2D(INVLPG, 0, TRC_PAR_LONG(vaddr));
paging_invlpg(curr, vaddr);
svm_asid_g_invlpg(curr, vaddr);
}
@@ -1191,7 +1191,7 @@ asmlinkage void svm_vmexit_handler(struc

exit_reason = vmcb->exitcode;

- HVMTRACE_ND(VMEXIT64, 1/*cycles*/, v, 3, exit_reason,
+ HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason,
(uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32),
0, 0, 0);

@@ -1216,17 +1216,17 @@ asmlinkage void svm_vmexit_handler(struc
{
case VMEXIT_INTR:
/* Asynchronous event, handled when we STGI'd after the VMEXIT. */
- HVMTRACE_0D(INTR, v);
+ HVMTRACE_0D(INTR);
break;

case VMEXIT_NMI:
/* Asynchronous event, handled when we STGI'd after the VMEXIT. */
- HVMTRACE_0D(NMI, v);
+ HVMTRACE_0D(NMI);
break;

case VMEXIT_SMI:
/* Asynchronous event, handled when we STGI'd after the VMEXIT. */
- HVMTRACE_0D(SMI, v);
+ HVMTRACE_0D(SMI);
break;

case VMEXIT_EXCEPTION_DB:
@@ -1261,10 +1261,12 @@ asmlinkage void svm_vmexit_handler(struc

if ( paging_fault(va, regs) )
{
- if (hvm_long_mode_enabled(v))
- HVMTRACE_LONG_2D(PF_XEN, v, regs->error_code, TRC_PAR_LONG(va));
+ if ( trace_will_trace_event(TRC_SHADOW) )
+ break;
+ if ( hvm_long_mode_enabled(v) )
+ HVMTRACE_LONG_2D(PF_XEN, regs->error_code, TRC_PAR_LONG(va));
else
- HVMTRACE_2D(PF_XEN, v, regs->error_code, va);
+ HVMTRACE_2D(PF_XEN, regs->error_code, va);
break;
}

@@ -1274,7 +1276,7 @@ asmlinkage void svm_vmexit_handler(struc

/* Asynchronous event, handled when we STGI'd after the VMEXIT. */
case VMEXIT_EXCEPTION_MC:
- HVMTRACE_0D(MCE, v);
+ HVMTRACE_0D(MCE);
break;

case VMEXIT_VINTR:
@@ -1331,7 +1333,7 @@ asmlinkage void svm_vmexit_handler(struc
case VMEXIT_VMMCALL:
if ( (inst_len = __get_instruction_length(v, INSTR_VMCALL)) == 0 )
break;
- HVMTRACE_1D(VMMCALL, v, regs->eax);
+ HVMTRACE_1D(VMMCALL, regs->eax);
rc = hvm_do_hypercall(regs);
if ( rc != HVM_HCALL_preempted )
{
@@ -1406,7 +1408,7 @@ asmlinkage void svm_vmexit_handler(struc

asmlinkage void svm_trace_vmentry(void)
{
- HVMTRACE_ND (VMENTRY, 1/*cycles*/, current, 0, 0, 0, 0, 0, 0, 0);
+ HVMTRACE_ND (VMENTRY, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
}

/*
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hvm/vmx/intr.c
--- a/xen/arch/x86/hvm/vmx/intr.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/intr.c Fri Sep 12 14:47:40 2008 +0900
@@ -198,7 +198,7 @@ asmlinkage void vmx_intr_assist(void)
}
else
{
- HVMTRACE_2D(INJ_VIRQ, v, intack.vector, /*fake=*/ 0);
+ HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0);
vmx_inject_extint(v, intack.vector);
pt_intr_post(v, intack);
}
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/vmx.c Fri Sep 12 14:47:40 2008 +0900
@@ -1114,10 +1114,10 @@ static void __vmx_inject_exception(
__vmwrite(VM_ENTRY_INTR_INFO, intr_fields);

if ( trap == TRAP_page_fault )
- HVMTRACE_LONG_2D(PF_INJECT, v, error_code,
+ HVMTRACE_LONG_2D(PF_INJECT, error_code,
TRC_PAR_LONG(v->arch.hvm_vcpu.guest_cr[2]));
else
- HVMTRACE_2D(INJ_EXC, v, trap, error_code);
+ HVMTRACE_2D(INJ_EXC, trap, error_code);
}

void vmx_inject_hw_exception(struct vcpu *v, int trap, int error_code)
@@ -1345,7 +1345,7 @@ static void vmx_cpuid_intercept(
break;
}

- HVMTRACE_5D (CPUID, current, input, *eax, *ebx, *ecx, *edx);
+ HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx);
}

static void vmx_do_cpuid(struct cpu_user_regs *regs)
@@ -1370,7 +1370,7 @@ static void vmx_dr_access(unsigned long
{
struct vcpu *v = current;

- HVMTRACE_0D(DR_WRITE, v);
+ HVMTRACE_0D(DR_WRITE);

if ( !v->arch.hvm_vcpu.flag_dr_dirty )
__restore_debug_registers(v);
@@ -1383,7 +1383,7 @@ static void vmx_invlpg_intercept(unsigne
static void vmx_invlpg_intercept(unsigned long vaddr)
{
struct vcpu *curr = current;
- HVMTRACE_LONG_2D(INVLPG, curr, /*invlpga=*/ 0, TRC_PAR_LONG(vaddr));
+ HVMTRACE_LONG_2D(INVLPG, /*invlpga=*/ 0, TRC_PAR_LONG(vaddr));
if ( paging_invlpg(curr, vaddr) )
vpid_sync_vcpu_gva(curr, vaddr);
}
@@ -1434,7 +1434,7 @@ static int mov_to_cr(int gp, int cr, str
goto exit_and_crash;
}

- HVMTRACE_LONG_2D(CR_WRITE, v, cr, TRC_PAR_LONG(value));
+ HVMTRACE_LONG_2D(CR_WRITE, cr, TRC_PAR_LONG(value));

HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);

@@ -1505,7 +1505,7 @@ static void mov_from_cr(int cr, int gp,
break;
}

- HVMTRACE_LONG_2D(CR_READ, v, cr, TRC_PAR_LONG(value));
+ HVMTRACE_LONG_2D(CR_READ, cr, TRC_PAR_LONG(value));

HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
}
@@ -1531,13 +1531,13 @@ static int vmx_cr_access(unsigned long e
case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS;
vmx_update_guest_cr(v, 0);
- HVMTRACE_0D(CLTS, current);
+ HVMTRACE_0D(CLTS);
break;
case VMX_CONTROL_REG_ACCESS_TYPE_LMSW:
value = v->arch.hvm_vcpu.guest_cr[0];
/* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf);
- HVMTRACE_LONG_1D(LMSW, current, value);
+ HVMTRACE_LONG_1D(LMSW, value);
return !hvm_set_cr0(value);
default:
BUG();
@@ -1692,7 +1692,7 @@ static int vmx_msr_read_intercept(struct
regs->edx = (uint32_t)(msr_content >> 32);

done:
- HVMTRACE_3D (MSR_READ, v, ecx, regs->eax, regs->edx);
+ HVMTRACE_3D (MSR_READ, ecx, regs->eax, regs->edx);
HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
ecx, (unsigned long)regs->eax,
(unsigned long)regs->edx);
@@ -1803,7 +1803,7 @@ static int vmx_msr_write_intercept(struc

msr_content = (u32)regs->eax | ((u64)regs->edx << 32);

- HVMTRACE_3D (MSR_WRITE, v, ecx, regs->eax, regs->edx);
+ HVMTRACE_3D (MSR_WRITE, ecx, regs->eax, regs->edx);

switch ( ecx )
{
@@ -1894,7 +1894,7 @@ static void vmx_do_extint(struct cpu_use
BUG_ON(!(vector & INTR_INFO_VALID_MASK));

vector &= INTR_INFO_VECTOR_MASK;
- HVMTRACE_1D(INTR, current, vector);
+ HVMTRACE_1D(INTR, vector);

switch ( vector )
{
@@ -2010,7 +2010,7 @@ static void vmx_failed_vmentry(unsigned
break;
case EXIT_REASON_MACHINE_CHECK:
printk("caused by machine check.\n");
- HVMTRACE_0D(MCE, curr);
+ HVMTRACE_0D(MCE);
do_machine_check(regs);
break;
default:
@@ -2037,7 +2037,7 @@ asmlinkage void vmx_vmexit_handler(struc

exit_reason = __vmread(VM_EXIT_REASON);

- HVMTRACE_ND(VMEXIT64, 1/*cycles*/, v, 3, exit_reason,
+ HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason,
(uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32),
0, 0, 0);

@@ -2101,7 +2101,8 @@ asmlinkage void vmx_vmexit_handler(struc
!(__vmread(IDT_VECTORING_INFO) & INTR_INFO_VALID_MASK) &&
(vector != TRAP_double_fault) )
__vmwrite(GUEST_INTERRUPTIBILITY_INFO,
- __vmread(GUEST_INTERRUPTIBILITY_INFO)|VMX_INTR_SHADOW_NMI);
+ __vmread(GUEST_INTERRUPTIBILITY_INFO)
+ | VMX_INTR_SHADOW_NMI);

perfc_incra(cause_vector, vector);

@@ -2128,12 +2129,14 @@ asmlinkage void vmx_vmexit_handler(struc

if ( paging_fault(exit_qualification, regs) )
{
+ if ( trace_will_trace_event(TRC_SHADOW) )
+ break;
if ( hvm_long_mode_enabled(v) )
- HVMTRACE_LONG_2D (PF_XEN, v, regs->error_code,
- TRC_PAR_LONG(exit_qualification) );
+ HVMTRACE_LONG_2D(PF_XEN, regs->error_code,
+ TRC_PAR_LONG(exit_qualification) );
else
- HVMTRACE_2D (PF_XEN, v,
- regs->error_code, exit_qualification );
+ HVMTRACE_2D(PF_XEN,
+ regs->error_code, exit_qualification );
break;
}

@@ -2144,11 +2147,11 @@ asmlinkage void vmx_vmexit_handler(struc
if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
(X86_EVENTTYPE_NMI << 8) )
goto exit_and_crash;
- HVMTRACE_0D(NMI, v);
+ HVMTRACE_0D(NMI);
do_nmi(regs); /* Real NMI, vector 2: normal processing. */
break;
case TRAP_machine_check:
- HVMTRACE_0D(MCE, v);
+ HVMTRACE_0D(MCE);
do_machine_check(regs);
break;
default:
@@ -2213,7 +2216,7 @@ asmlinkage void vmx_vmexit_handler(struc
case EXIT_REASON_VMCALL:
{
int rc;
- HVMTRACE_1D(VMMCALL, v, regs->eax);
+ HVMTRACE_1D(VMMCALL, regs->eax);
inst_len = __get_instruction_length(); /* Safe: VMCALL */
rc = hvm_do_hypercall(regs);
if ( rc != HVM_HCALL_preempted )
@@ -2300,7 +2303,7 @@ asmlinkage void vmx_vmexit_handler(struc

asmlinkage void vmx_trace_vmentry(void)
{
- HVMTRACE_ND (VMENTRY, 1/*cycles*/, current, 0, 0, 0, 0, 0, 0, 0);
+ HVMTRACE_ND (VMENTRY, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
}

/*
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/io_apic.c
--- a/xen/arch/x86/io_apic.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/io_apic.c Fri Sep 12 14:47:40 2008 +0900
@@ -45,23 +45,14 @@ int (*ioapic_renumber_irq)(int ioapic, i
int (*ioapic_renumber_irq)(int ioapic, int irq);
atomic_t irq_mis_count;

-int msi_enable = 0;
-boolean_param("msi", msi_enable);
-
int domain_irq_to_vector(struct domain *d, int irq)
{
- if ( !msi_enable )
- return irq_to_vector(irq);
- else
- return d->arch.pirq_vector[irq];
+ return d->arch.pirq_vector[irq];
}

int domain_vector_to_irq(struct domain *d, int vector)
{
- if ( !msi_enable )
- return vector_to_irq(vector);
- else
- return d->arch.vector_pirq[vector];
+ return d->arch.vector_pirq[vector];
}

/* Where if anywhere is the i8259 connect in external int mode */
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/irq.c
--- a/xen/arch/x86/irq.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/irq.c Fri Sep 12 14:47:40 2008 +0900
@@ -737,9 +737,12 @@ __initcall(setup_dump_irqs);

void fixup_irqs(cpumask_t map)
{
- unsigned int irq;
+ unsigned int irq, sp;
static int warned;
-
+ irq_guest_action_t *action;
+ struct pending_eoi *peoi;
+
+ /* Direct all future interrupts away from this CPU. */
for ( irq = 0; irq < NR_IRQS; irq++ )
{
cpumask_t mask;
@@ -758,8 +761,24 @@ void fixup_irqs(cpumask_t map)
printk("Cannot set affinity for irq %i\n", irq);
}

+ /* Service any interrupts that beat us in the re-direction race. */
local_irq_enable();
mdelay(1);
local_irq_disable();
+
+ /* Clean up cpu_eoi_map of every interrupt to exclude this CPU. */
+ for ( irq = 0; irq < NR_IRQS; irq++ )
+ {
+ if ( !(irq_desc[irq].status & IRQ_GUEST) )
+ continue;
+ action = (irq_guest_action_t *)irq_desc[irq].action;
+ cpu_clear(smp_processor_id(), action->cpu_eoi_map);
+ }
+
+ /* Flush the interrupt EOI stack. */
+ peoi = this_cpu(pending_eoi);
+ for ( sp = 0; sp < pending_eoi_sp(peoi); sp++ )
+ peoi[sp].ready = 1;
+ flush_ready_eoi(NULL);
}
#endif
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/mm.c Fri Sep 12 14:47:40 2008 +0900
@@ -507,11 +507,11 @@ static int alloc_segdesc_page(struct pag
goto fail;

unmap_domain_page(descs);
- return 1;
+ return 0;

fail:
unmap_domain_page(descs);
- return 0;
+ return -EINVAL;
}


@@ -565,20 +565,23 @@ static int get_page_from_pagenr(unsigned

static int get_page_and_type_from_pagenr(unsigned long page_nr,
unsigned long type,
- struct domain *d)
+ struct domain *d,
+ int preemptible)
{
struct page_info *page = mfn_to_page(page_nr);
+ int rc;

if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
- return 0;
-
- if ( unlikely(!get_page_type(page, type)) )
- {
+ return -EINVAL;
+
+ rc = (preemptible ?
+ get_page_type_preemptible(page, type) :
+ (get_page_type(page, type) ? 0 : -EINVAL));
+
+ if ( rc )
put_page(page);
- return 0;
- }
-
- return 1;
+
+ return rc;
}

/*
@@ -754,22 +757,22 @@ get_page_from_l2e(
if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
{
MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
- return 0;
- }
-
- rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
- if ( unlikely(!rc) )
- rc = get_l2_linear_pagetable(l2e, pfn, d);
+ return -EINVAL;
+ }
+
+ rc = get_page_and_type_from_pagenr(
+ l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
+ if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
+ rc = 0;

return rc;
}


-#if CONFIG_PAGING_LEVELS >= 3
define_get_linear_pagetable(l3);
static int
get_page_from_l3e(
- l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
+ l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible)
{
int rc;

@@ -779,22 +782,22 @@ get_page_from_l3e(
if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
{
MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
- return 0;
- }
-
- rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
- if ( unlikely(!rc) )
- rc = get_l3_linear_pagetable(l3e, pfn, d);
+ return -EINVAL;
+ }
+
+ rc = get_page_and_type_from_pagenr(
+ l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible);
+ if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
+ rc = 0;

return rc;
}
-#endif /* 3 level */

#if CONFIG_PAGING_LEVELS >= 4
define_get_linear_pagetable(l4);
static int
get_page_from_l4e(
- l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
+ l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible)
{
int rc;

@@ -804,12 +807,13 @@ get_page_from_l4e(
if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
{
MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
- return 0;
- }
-
- rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
- if ( unlikely(!rc) )
- rc = get_l4_linear_pagetable(l4e, pfn, d);
+ return -EINVAL;
+ }
+
+ rc = get_page_and_type_from_pagenr(
+ l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible);
+ if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
+ rc = 0;

return rc;
}
@@ -946,29 +950,35 @@ void put_page_from_l1e(l1_pgentry_t l1e,
* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
* Note also that this automatically deals correctly with linear p.t.'s.
*/
-static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
+static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
{
if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
(l2e_get_pfn(l2e) != pfn) )
+ {
put_page_and_type(l2e_get_page(l2e));
-}
-
-
-#if CONFIG_PAGING_LEVELS >= 3
-static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
+ return 0;
+ }
+ return 1;
+}
+
+
+static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+ int preemptible)
{
if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
(l3e_get_pfn(l3e) != pfn) )
- put_page_and_type(l3e_get_page(l3e));
-}
-#endif
+ return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
+ return 1;
+}

#if CONFIG_PAGING_LEVELS >= 4
-static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
+static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
+ int preemptible)
{
if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
(l4e_get_pfn(l4e) != pfn) )
- put_page_and_type(l4e_get_page(l4e));
+ return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
+ return 1;
}
#endif

@@ -977,7 +987,7 @@ static int alloc_l1_table(struct page_in
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l1_pgentry_t *pl1e;
- int i;
+ unsigned int i;

pl1e = map_domain_page(pfn);

@@ -991,7 +1001,7 @@ static int alloc_l1_table(struct page_in
}

unmap_domain_page(pl1e);
- return 1;
+ return 0;

fail:
MEM_LOG("Failure in alloc_l1_table: entry %d", i);
@@ -1000,7 +1010,7 @@ static int alloc_l1_table(struct page_in
put_page_from_l1e(pl1e[i], d);

unmap_domain_page(pl1e);
- return 0;
+ return -EINVAL;
}

static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
@@ -1128,47 +1138,53 @@ static void pae_flush_pgd(
# define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
#endif

-static int alloc_l2_table(struct page_info *page, unsigned long type)
+static int alloc_l2_table(struct page_info *page, unsigned long type,
+ int preemptible)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l2_pgentry_t *pl2e;
- int i;
+ unsigned int i;
+ int rc = 0;

pl2e = map_domain_page(pfn);

- for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
- {
- if ( !is_guest_l2_slot(d, type, i) )
+ for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
+ {
+ if ( preemptible && i && hypercall_preempt_check() )
+ {
+ page->nr_validated_ptes = i;
+ rc = -EAGAIN;
+ break;
+ }
+
+ if ( !is_guest_l2_slot(d, type, i) ||
+ (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
continue;

- if ( unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
- goto fail;
-
+ if ( rc < 0 )
+ {
+ MEM_LOG("Failure in alloc_l2_table: entry %d", i);
+ while ( i-- > 0 )
+ if ( is_guest_l2_slot(d, type, i) )
+ put_page_from_l2e(pl2e[i], pfn);
+ break;
+ }
+
adjust_guest_l2e(pl2e[i], d);
}

unmap_domain_page(pl2e);
- return 1;
-
- fail:
- MEM_LOG("Failure in alloc_l2_table: entry %d", i);
- while ( i-- > 0 )
- if ( is_guest_l2_slot(d, type, i) )
- put_page_from_l2e(pl2e[i], pfn);
-
- unmap_domain_page(pl2e);
- return 0;
-}
-
-
-#if CONFIG_PAGING_LEVELS >= 3
-static int alloc_l3_table(struct page_info *page)
+ return rc > 0 ? 0 : rc;
+}
+
+static int alloc_l3_table(struct page_info *page, int preemptible)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l3_pgentry_t *pl3e;
- int i;
+ unsigned int i;
+ int rc = 0;

#if CONFIG_PAGING_LEVELS == 3
/*
@@ -1181,7 +1197,7 @@ static int alloc_l3_table(struct page_in
d->vcpu[0] && d->vcpu[0]->is_initialised )
{
MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
- return 0;
+ return -EINVAL;
}
#endif

@@ -1197,64 +1213,96 @@ static int alloc_l3_table(struct page_in
if ( is_pv_32on64_domain(d) )
memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));

- for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+ for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ )
{
if ( is_pv_32bit_domain(d) && (i == 3) )
{
if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
- (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
- !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
- PGT_l2_page_table |
- PGT_pae_xen_l2,
- d) )
- goto fail;
- }
- else if ( !is_guest_l3_slot(i) )
+ (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
+ rc = -EINVAL;
+ else
+ rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
+ PGT_l2_page_table |
+ PGT_pae_xen_l2,
+ d, preemptible);
+ }
+ else if ( !is_guest_l3_slot(i) ||
+ (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 )
continue;
- else if ( unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
- goto fail;
+
+ if ( rc == -EAGAIN )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 1;
+ }
+ else if ( rc == -EINTR && i )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 0;
+ rc = -EAGAIN;
+ }
+ if ( rc < 0 )
+ break;

adjust_guest_l3e(pl3e[i], d);
}

- if ( !create_pae_xen_mappings(d, pl3e) )
- goto fail;
+ if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
+ rc = -EINVAL;
+ if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
+ {
+ MEM_LOG("Failure in alloc_l3_table: entry %d", i);
+ while ( i-- > 0 )
+ {
+ if ( !is_guest_l3_slot(i) )
+ continue;
+ unadjust_guest_l3e(pl3e[i], d);
+ put_page_from_l3e(pl3e[i], pfn, 0);
+ }
+ }

unmap_domain_page(pl3e);
- return 1;
-
- fail:
- MEM_LOG("Failure in alloc_l3_table: entry %d", i);
- while ( i-- > 0 )
- {
- if ( !is_guest_l3_slot(i) )
- continue;
- unadjust_guest_l3e(pl3e[i], d);
- put_page_from_l3e(pl3e[i], pfn);
- }
-
- unmap_domain_page(pl3e);
- return 0;
-}
-#else
-#define alloc_l3_table(page) (0)
-#endif
+ return rc > 0 ? 0 : rc;
+}

#if CONFIG_PAGING_LEVELS >= 4
-static int alloc_l4_table(struct page_info *page)
+static int alloc_l4_table(struct page_info *page, int preemptible)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l4_pgentry_t *pl4e = page_to_virt(page);
- int i;
-
- for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
- {
- if ( !is_guest_l4_slot(d, i) )
+ unsigned int i;
+ int rc = 0;
+
+ for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ )
+ {
+ if ( !is_guest_l4_slot(d, i) ||
+ (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 )
continue;

- if ( unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
- goto fail;
+ if ( rc == -EAGAIN )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 1;
+ }
+ else if ( rc == -EINTR )
+ {
+ if ( i )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 0;
+ rc = -EAGAIN;
+ }
+ }
+ else if ( rc < 0 )
+ {
+ MEM_LOG("Failure in alloc_l4_table: entry %d", i);
+ while ( i-- > 0 )
+ if ( is_guest_l4_slot(d, i) )
+ put_page_from_l4e(pl4e[i], pfn, 0);
+ }
+ if ( rc < 0 )
+ return rc;

adjust_guest_l4e(pl4e[i], d);
}
@@ -1269,18 +1317,10 @@ static int alloc_l4_table(struct page_in
l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
__PAGE_HYPERVISOR);

- return 1;
-
- fail:
- MEM_LOG("Failure in alloc_l4_table: entry %d", i);
- while ( i-- > 0 )
- if ( is_guest_l4_slot(d, i) )
- put_page_from_l4e(pl4e[i], pfn);
-
- return 0;
+ return rc > 0 ? 0 : rc;
}
#else
-#define alloc_l4_table(page) (0)
+#define alloc_l4_table(page, preemptible) (-EINVAL)
#endif


@@ -1289,7 +1329,7 @@ static void free_l1_table(struct page_in
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l1_pgentry_t *pl1e;
- int i;
+ unsigned int i;

pl1e = map_domain_page(pfn);

@@ -1301,74 +1341,114 @@ static void free_l1_table(struct page_in
}


-static void free_l2_table(struct page_info *page)
+static int free_l2_table(struct page_info *page, int preemptible)
{
#ifdef CONFIG_COMPAT
struct domain *d = page_get_owner(page);
#endif
unsigned long pfn = page_to_mfn(page);
l2_pgentry_t *pl2e;
- int i;
+ unsigned int i = page->nr_validated_ptes - 1;
+ int err = 0;

pl2e = map_domain_page(pfn);

- for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
- if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
- put_page_from_l2e(pl2e[i], pfn);
+ ASSERT(page->nr_validated_ptes);
+ do {
+ if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
+ put_page_from_l2e(pl2e[i], pfn) == 0 &&
+ preemptible && i && hypercall_preempt_check() )
+ {
+ page->nr_validated_ptes = i;
+ err = -EAGAIN;
+ }
+ } while ( !err && i-- );

unmap_domain_page(pl2e);

- page->u.inuse.type_info &= ~PGT_pae_xen_l2;
-}
-
-
-#if CONFIG_PAGING_LEVELS >= 3
-
-static void free_l3_table(struct page_info *page)
+ if ( !err )
+ page->u.inuse.type_info &= ~PGT_pae_xen_l2;
+
+ return err;
+}
+
+static int free_l3_table(struct page_info *page, int preemptible)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l3_pgentry_t *pl3e;
- int i;
+ unsigned int i = page->nr_validated_ptes - !page->partial_pte;
+ int rc = 0;

#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
if ( d->arch.relmem == RELMEM_l3 )
- return;
+ return 0;
#endif

pl3e = map_domain_page(pfn);

- for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+ do {
if ( is_guest_l3_slot(i) )
{
- put_page_from_l3e(pl3e[i], pfn);
+ rc = put_page_from_l3e(pl3e[i], pfn, preemptible);
+ if ( rc > 0 )
+ continue;
+ if ( rc )
+ break;
unadjust_guest_l3e(pl3e[i], d);
}
+ } while ( i-- );

unmap_domain_page(pl3e);
-}
-
-#endif
+
+ if ( rc == -EAGAIN )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 1;
+ }
+ else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
+ {
+ page->nr_validated_ptes = i + 1;
+ page->partial_pte = 0;
+ rc = -EAGAIN;
+ }
+ return rc > 0 ? 0 : rc;
+}

#if CONFIG_PAGING_LEVELS >= 4
-
-static void free_l4_table(struct page_info *page)
+static int free_l4_table(struct page_info *page, int preemptible)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l4_pgentry_t *pl4e = page_to_virt(page);
- int i;
+ unsigned int i = page->nr_validated_ptes - !page->partial_pte;
+ int rc = 0;

#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
if ( d->arch.relmem == RELMEM_l4 )
- return;
+ return 0;
#endif

- for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
+ do {
if ( is_guest_l4_slot(d, i) )
- put_page_from_l4e(pl4e[i], pfn);
-}
-
+ rc = put_page_from_l4e(pl4e[i], pfn, preemptible);
+ } while ( rc >= 0 && i-- );
+
+ if ( rc == -EAGAIN )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 1;
+ }
+ else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
+ {
+ page->nr_validated_ptes = i + 1;
+ page->partial_pte = 0;
+ rc = -EAGAIN;
+ }
+ return rc > 0 ? 0 : rc;
+}
+#else
+#define free_l4_table(page, preemptible) (-EINVAL)
#endif

static void page_lock(struct page_info *page)
@@ -1560,7 +1640,7 @@ static int mod_l2_entry(l2_pgentry_t *pl
return rc;
}

- if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
+ if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
return page_unlock(l2pg), 0;

adjust_guest_l2e(nl2e, d);
@@ -1582,25 +1662,24 @@ static int mod_l2_entry(l2_pgentry_t *pl
put_page_from_l2e(ol2e, pfn);
return rc;
}
-
-#if CONFIG_PAGING_LEVELS >= 3

/* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
static int mod_l3_entry(l3_pgentry_t *pl3e,
l3_pgentry_t nl3e,
unsigned long pfn,
- int preserve_ad)
+ int preserve_ad,
+ int preemptible)
{
l3_pgentry_t ol3e;
struct vcpu *curr = current;
struct domain *d = curr->domain;
struct page_info *l3pg = mfn_to_page(pfn);
- int rc = 1;
+ int rc = 0;

if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
{
MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
- return 0;
+ return -EINVAL;
}

/*
@@ -1608,12 +1687,12 @@ static int mod_l3_entry(l3_pgentry_t *pl
* would be a pain to ensure they remain continuously valid throughout.
*/
if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
- return 0;
+ return -EINVAL;

page_lock(l3pg);

if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
- return page_unlock(l3pg), 0;
+ return page_unlock(l3pg), -EFAULT;

if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
{
@@ -1622,7 +1701,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
page_unlock(l3pg);
MEM_LOG("Bad L3 flags %x",
l3e_get_flags(nl3e) & l3_disallow_mask(d));
- return 0;
+ return -EINVAL;
}

/* Fast path for identical mapping and presence. */
@@ -1631,28 +1710,30 @@ static int mod_l3_entry(l3_pgentry_t *pl
adjust_guest_l3e(nl3e, d);
rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
page_unlock(l3pg);
- return rc;
- }
-
- if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
- return page_unlock(l3pg), 0;
+ return rc ? 0 : -EFAULT;
+ }
+
+ rc = get_page_from_l3e(nl3e, pfn, d, preemptible);
+ if ( unlikely(rc < 0) )
+ return page_unlock(l3pg), rc;
+ rc = 0;

adjust_guest_l3e(nl3e, d);
if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
preserve_ad)) )
{
ol3e = nl3e;
- rc = 0;
+ rc = -EFAULT;
}
}
else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
preserve_ad)) )
{
page_unlock(l3pg);
- return 0;
- }
-
- if ( likely(rc) )
+ return -EFAULT;
+ }
+
+ if ( likely(rc == 0) )
{
if ( !create_pae_xen_mappings(d, pl3e) )
BUG();
@@ -1661,11 +1742,9 @@ static int mod_l3_entry(l3_pgentry_t *pl
}

page_unlock(l3pg);
- put_page_from_l3e(ol3e, pfn);
+ put_page_from_l3e(ol3e, pfn, 0);
return rc;
}
-
-#endif

#if CONFIG_PAGING_LEVELS >= 4

@@ -1673,24 +1752,25 @@ static int mod_l4_entry(l4_pgentry_t *pl
static int mod_l4_entry(l4_pgentry_t *pl4e,
l4_pgentry_t nl4e,
unsigned long pfn,
- int preserve_ad)
+ int preserve_ad,
+ int preemptible)
{
struct vcpu *curr = current;
struct domain *d = curr->domain;
l4_pgentry_t ol4e;
struct page_info *l4pg = mfn_to_page(pfn);
- int rc = 1;
+ int rc = 0;

if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
{
MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
- return 0;
+ return -EINVAL;
}

page_lock(l4pg);

if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
- return page_unlock(l4pg), 0;
+ return page_unlock(l4pg), -EFAULT;

if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
{
@@ -1699,7 +1779,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
page_unlock(l4pg);
MEM_LOG("Bad L4 flags %x",
l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
- return 0;
+ return -EINVAL;
}

/* Fast path for identical mapping and presence. */
@@ -1708,29 +1788,31 @@ static int mod_l4_entry(l4_pgentry_t *pl
adjust_guest_l4e(nl4e, d);
rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
page_unlock(l4pg);
- return rc;
- }
-
- if ( unlikely(!get_page_from_l4e(nl4e, pfn, d)) )
- return page_unlock(l4pg), 0;
+ return rc ? 0 : -EFAULT;
+ }
+
+ rc = get_page_from_l4e(nl4e, pfn, d, preemptible);
+ if ( unlikely(rc < 0) )
+ return page_unlock(l4pg), rc;
+ rc = 0;

adjust_guest_l4e(nl4e, d);
if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
preserve_ad)) )
{
ol4e = nl4e;
- rc = 0;
+ rc = -EFAULT;
}
}
else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
preserve_ad)) )
{
page_unlock(l4pg);
- return 0;
+ return -EFAULT;
}

page_unlock(l4pg);
- put_page_from_l4e(ol4e, pfn);
+ put_page_from_l4e(ol4e, pfn, 0);
return rc;
}

@@ -1788,9 +1870,11 @@ int get_page(struct page_info *page, str
}


-static int alloc_page_type(struct page_info *page, unsigned long type)
+static int alloc_page_type(struct page_info *page, unsigned long type,
+ int preemptible)
{
struct domain *owner = page_get_owner(page);
+ int rc;

/* A page table is dirtied when its type count becomes non-zero. */
if ( likely(owner != NULL) )
@@ -1799,30 +1883,65 @@ static int alloc_page_type(struct page_i
switch ( type & PGT_type_mask )
{
case PGT_l1_page_table:
- return alloc_l1_table(page);
+ alloc_l1_table(page);
+ rc = 0;
+ break;
case PGT_l2_page_table:
- return alloc_l2_table(page, type);
+ rc = alloc_l2_table(page, type, preemptible);
+ break;
case PGT_l3_page_table:
- return alloc_l3_table(page);
+ rc = alloc_l3_table(page, preemptible);
+ break;
case PGT_l4_page_table:
- return alloc_l4_table(page);
+ rc = alloc_l4_table(page, preemptible);
+ break;
case PGT_seg_desc_page:
- return alloc_segdesc_page(page);
+ rc = alloc_segdesc_page(page);
+ break;
default:
printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
type, page->u.inuse.type_info,
page->count_info);
+ rc = -EINVAL;
BUG();
}

- return 0;
-}
-
-
-void free_page_type(struct page_info *page, unsigned long type)
+ /* No need for atomic update of type_info here: noone else updates it. */
+ wmb();
+ if ( rc == -EAGAIN )
+ {
+ page->u.inuse.type_info |= PGT_partial;
+ }
+ else if ( rc == -EINTR )
+ {
+ ASSERT((page->u.inuse.type_info &
+ (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
+ page->u.inuse.type_info &= ~PGT_count_mask;
+ }
+ else if ( rc )
+ {
+ ASSERT(rc < 0);
+ MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
+ PRtype_info ": caf=%08x taf=%" PRtype_info,
+ page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
+ type, page->count_info, page->u.inuse.type_info);
+ page->u.inuse.type_info = 0;
+ }
+ else
+ {
+ page->u.inuse.type_info |= PGT_validated;
+ }
+
+ return rc;
+}
+
+
+int free_page_type(struct page_info *page, unsigned long type,
+ int preemptible)
{
struct domain *owner = page_get_owner(page);
unsigned long gmfn;
+ int rc;

if ( likely(owner != NULL) )
{
@@ -1842,7 +1961,7 @@ void free_page_type(struct page_info *pa
paging_mark_dirty(owner, page_to_mfn(page));

if ( shadow_mode_refcounts(owner) )
- return;
+ return 0;

gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
ASSERT(VALID_M2P(gmfn));
@@ -1850,42 +1969,80 @@ void free_page_type(struct page_info *pa
}
}

+ if ( !(type & PGT_partial) )
+ {
+ page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
+ page->partial_pte = 0;
+ }
switch ( type & PGT_type_mask )
{
case PGT_l1_page_table:
free_l1_table(page);
+ rc = 0;
break;
-
case PGT_l2_page_table:
- free_l2_table(page);
+ rc = free_l2_table(page, preemptible);
break;
-
-#if CONFIG_PAGING_LEVELS >= 3
case PGT_l3_page_table:
- free_l3_table(page);
+#if CONFIG_PAGING_LEVELS == 3
+ if ( !(type & PGT_partial) )
+ page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
+#endif
+ rc = free_l3_table(page, preemptible);
break;
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
case PGT_l4_page_table:
- free_l4_table(page);
+ rc = free_l4_table(page, preemptible);
break;
-#endif
-
default:
- printk("%s: type %lx pfn %lx\n",__FUNCTION__,
- type, page_to_mfn(page));
+ MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page));
+ rc = -EINVAL;
BUG();
}
-}
-
-
-void put_page_type(struct page_info *page)
+
+ /* No need for atomic update of type_info here: noone else updates it. */
+ if ( rc == 0 )
+ {
+ /*
+ * Record TLB information for flush later. We do not stamp page tables
+ * when running in shadow mode:
+ * 1. Pointless, since it's the shadow pt's which must be tracked.
+ * 2. Shadow mode reuses this field for shadowed page tables to
+ * store flags info -- we don't want to conflict with that.
+ */
+ if ( !(shadow_mode_enabled(page_get_owner(page)) &&
+ (page->count_info & PGC_page_table)) )
+ page->tlbflush_timestamp = tlbflush_current_time();
+ wmb();
+ page->u.inuse.type_info--;
+ }
+ else if ( rc == -EINTR )
+ {
+ ASSERT(!(page->u.inuse.type_info &
+ (PGT_count_mask|PGT_validated|PGT_partial)));
+ if ( !(shadow_mode_enabled(page_get_owner(page)) &&
+ (page->count_info & PGC_page_table)) )
+ page->tlbflush_timestamp = tlbflush_current_time();
+ wmb();
+ page->u.inuse.type_info |= PGT_validated;
+ }
+ else
+ {
+ BUG_ON(rc != -EAGAIN);
+ wmb();
+ page->u.inuse.type_info |= PGT_partial;
+ }
+
+ return rc;
+}
+
+
+static int __put_page_type(struct page_info *page,
+ int preemptible)
{
unsigned long nx, x, y = page->u.inuse.type_info;

- again:
- do {
+ for ( ; ; )
+ {
x = y;
nx = x - 1;

@@ -1894,21 +2051,19 @@ void put_page_type(struct page_info *pag
if ( unlikely((nx & PGT_count_mask) == 0) )
{
if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
- likely(nx & PGT_validated) )
+ likely(nx & (PGT_validated|PGT_partial)) )
{
/*
* Page-table pages must be unvalidated when count is zero. The
* 'free' is safe because the refcnt is non-zero and validated
* bit is clear => other ops will spin or fail.
*/
- if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
- x & ~PGT_validated)) != x) )
- goto again;
+ nx = x & ~(PGT_validated|PGT_partial);
+ if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
+ x, nx)) != x) )
+ continue;
/* We cleared the 'valid bit' so we do the clean up. */
- free_page_type(page, x);
- /* Carry on, but with the 'valid bit' now clear. */
- x &= ~PGT_validated;
- nx &= ~PGT_validated;
+ return free_page_type(page, x, preemptible);
}

/*
@@ -1922,25 +2077,33 @@ void put_page_type(struct page_info *pag
(page->count_info & PGC_page_table)) )
page->tlbflush_timestamp = tlbflush_current_time();
}
- }
- while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
-}
-
-
-int get_page_type(struct page_info *page, unsigned long type)
+
+ if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
+ break;
+
+ if ( preemptible && hypercall_preempt_check() )
+ return -EINTR;
+ }
+
+ return 0;
+}
+
+
+static int __get_page_type(struct page_info *page, unsigned long type,
+ int preemptible)
{
unsigned long nx, x, y = page->u.inuse.type_info;

ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));

- again:
- do {
+ for ( ; ; )
+ {
x = y;
nx = x + 1;
if ( unlikely((nx & PGT_count_mask) == 0) )
{
MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
- return 0;
+ return -EINVAL;
}
else if ( unlikely((x & PGT_count_mask) == 0) )
{
@@ -1993,28 +2156,43 @@ int get_page_type(struct page_info *page
/* Don't log failure if it could be a recursive-mapping attempt. */
if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
(type == PGT_l1_page_table) )
- return 0;
+ return -EINVAL;
if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
(type == PGT_l2_page_table) )
- return 0;
+ return -EINVAL;
if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
(type == PGT_l3_page_table) )
- return 0;
+ return -EINVAL;
MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
"for mfn %lx (pfn %lx)",
x, type, page_to_mfn(page),
get_gpfn_from_mfn(page_to_mfn(page)));
- return 0;
+ return -EINVAL;
}
else if ( unlikely(!(x & PGT_validated)) )
{
- /* Someone else is updating validation of this page. Wait... */
- while ( (y = page->u.inuse.type_info) == x )
- cpu_relax();
- goto again;
- }
- }
- while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
+ if ( !(x & PGT_partial) )
+ {
+ /* Someone else is updating validation of this page. Wait... */
+ while ( (y = page->u.inuse.type_info) == x )
+ {
+ if ( preemptible && hypercall_preempt_check() )
+ return -EINTR;
+ cpu_relax();
+ }
+ continue;
+ }
+ /* Type ref count was left at 1 when PGT_partial got set. */
+ ASSERT((x & PGT_count_mask) == 1);
+ nx = x & ~PGT_partial;
+ }
+
+ if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
+ break;
+
+ if ( preemptible && hypercall_preempt_check() )
+ return -EINTR;
+ }

if ( unlikely((x & PGT_type_mask) != type) )
{
@@ -2032,25 +2210,42 @@ int get_page_type(struct page_info *page

if ( unlikely(!(nx & PGT_validated)) )
{
- /* Try to validate page type; drop the new reference on failure. */
- if ( unlikely(!alloc_page_type(page, type)) )
- {
- MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
- PRtype_info ": caf=%08x taf=%" PRtype_info,
- page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
- type, page->count_info, page->u.inuse.type_info);
- /* Noone else can get a reference. We hold the only ref. */
- page->u.inuse.type_info = 0;
- return 0;
- }
-
- /* Noone else is updating simultaneously. */
- __set_bit(_PGT_validated, &page->u.inuse.type_info);
- }
-
- return 1;
-}
-
+ if ( !(x & PGT_partial) )
+ {
+ page->nr_validated_ptes = 0;
+ page->partial_pte = 0;
+ }
+ return alloc_page_type(page, type, preemptible);
+ }
+
+ return 0;
+}
+
+void put_page_type(struct page_info *page)
+{
+ int rc = __put_page_type(page, 0);
+ ASSERT(rc == 0);
+ (void)rc;
+}
+
+int get_page_type(struct page_info *page, unsigned long type)
+{
+ int rc = __get_page_type(page, type, 0);
+ if ( likely(rc == 0) )
+ return 1;
+ ASSERT(rc == -EINVAL);
+ return 0;
+}
+
+int put_page_type_preemptible(struct page_info *page)
+{
+ return __put_page_type(page, 1);
+}
+
+int get_page_type_preemptible(struct page_info *page, unsigned long type)
+{
+ return __get_page_type(page, type, 1);
+}

void cleanup_page_cacheattr(struct page_info *page)
{
@@ -2087,7 +2282,7 @@ int new_guest_cr3(unsigned long mfn)
l4e_from_pfn(
mfn,
(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
- pagetable_get_pfn(v->arch.guest_table), 0);
+ pagetable_get_pfn(v->arch.guest_table), 0, 0) == 0;
if ( unlikely(!okay) )
{
MEM_LOG("Error while installing new compat baseptr %lx", mfn);
@@ -2102,7 +2297,7 @@ int new_guest_cr3(unsigned long mfn)
#endif
okay = paging_mode_refcounts(d)
? get_page_from_pagenr(mfn, d)
- : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
+ : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0);
if ( unlikely(!okay) )
{
MEM_LOG("Error while installing new baseptr %lx", mfn);
@@ -2276,9 +2471,7 @@ int do_mmuext_op(
{
if ( hypercall_preempt_check() )
{
- rc = hypercall_create_continuation(
- __HYPERVISOR_mmuext_op, "hihi",
- uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+ rc = -EAGAIN;
break;
}

@@ -2325,10 +2518,14 @@ int do_mmuext_op(
if ( paging_mode_refcounts(FOREIGNDOM) )
break;

- okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
+ rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1);
+ okay = !rc;
if ( unlikely(!okay) )
{
- MEM_LOG("Error while pinning mfn %lx", mfn);
+ if ( rc == -EINTR )
+ rc = -EAGAIN;
+ else if ( rc != -EAGAIN )
+ MEM_LOG("Error while pinning mfn %lx", mfn);
break;
}

@@ -2373,8 +2570,11 @@ int do_mmuext_op(
{
put_page_and_type(page);
put_page(page);
- /* A page is dirtied when its pin status is cleared. */
- paging_mark_dirty(d, mfn);
+ if ( !rc )
+ {
+ /* A page is dirtied when its pin status is cleared. */
+ paging_mark_dirty(d, mfn);
+ }
}
else
{
@@ -2398,8 +2598,8 @@ int do_mmuext_op(
if ( paging_mode_refcounts(d) )
okay = get_page_from_pagenr(mfn, d);
else
- okay = get_page_and_type_from_pagenr(
- mfn, PGT_root_page_table, d);
+ okay = !get_page_and_type_from_pagenr(
+ mfn, PGT_root_page_table, d, 0);
if ( unlikely(!okay) )
{
MEM_LOG("Error while installing new mfn %lx", mfn);
@@ -2517,6 +2717,11 @@ int do_mmuext_op(
guest_handle_add_offset(uops, 1);
}

+ if ( rc == -EAGAIN )
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmuext_op, "hihi",
+ uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+
process_deferred_ops();

perfc_add(num_mmuext_ops, i);
@@ -2576,9 +2781,7 @@ int do_mmu_update(
{
if ( hypercall_preempt_check() )
{
- rc = hypercall_create_continuation(
- __HYPERVISOR_mmu_update, "hihi",
- ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+ rc = -EAGAIN;
break;
}

@@ -2601,7 +2804,7 @@ int do_mmu_update(
*/
case MMU_NORMAL_PT_UPDATE:
case MMU_PT_UPDATE_PRESERVE_AD:
- rc = xsm_mmu_normal_update(d, req.val);
+ rc = xsm_mmu_normal_update(d, FOREIGNDOM, req.val);
if ( rc )
break;

@@ -2653,27 +2856,29 @@ int do_mmu_update(
cmd == MMU_PT_UPDATE_PRESERVE_AD);
}
break;
-#if CONFIG_PAGING_LEVELS >= 3
case PGT_l3_page_table:
{
l3_pgentry_t l3e = l3e_from_intpte(req.val);
- okay = mod_l3_entry(va, l3e, mfn,
- cmd == MMU_PT_UPDATE_PRESERVE_AD);
+ rc = mod_l3_entry(va, l3e, mfn,
+ cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
+ okay = !rc;
}
break;
-#endif
#if CONFIG_PAGING_LEVELS >= 4
case PGT_l4_page_table:
{
l4_pgentry_t l4e = l4e_from_intpte(req.val);
- okay = mod_l4_entry(va, l4e, mfn,
- cmd == MMU_PT_UPDATE_PRESERVE_AD);
+ rc = mod_l4_entry(va, l4e, mfn,
+ cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
+ okay = !rc;
}
break;
#endif
}

put_page_type(page);
+ if ( rc == -EINTR )
+ rc = -EAGAIN;
}
break;

@@ -2741,6 +2946,11 @@ int do_mmu_update(

guest_handle_add_offset(ureqs, 1);
}
+
+ if ( rc == -EAGAIN )
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmu_update, "hihi",
+ ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);

process_deferred_ops();

@@ -3111,7 +3321,7 @@ int do_update_va_mapping(unsigned long v
if ( unlikely(!access_ok(va, 1) && !paging_mode_external(d)) )
return -EINVAL;

- rc = xsm_update_va_mapping(d, val);
+ rc = xsm_update_va_mapping(d, FOREIGNDOM, val);
if ( rc )
return rc;

@@ -3695,9 +3905,8 @@ static int ptwr_emulated_update(
nl1e = l1e_from_intpte(val);
if ( unlikely(!get_page_from_l1e(nl1e, d)) )
{
- if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) &&
- (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg &&
- (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
+ if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
+ !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
{
/*
* If this is an upper-half write to a PAE PTE then we assume that
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/mm/hap/hap.c
--- a/xen/arch/x86/mm/hap/hap.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/mm/hap/hap.c Fri Sep 12 14:47:40 2008 +0900
@@ -37,6 +37,7 @@
#include <asm/shared.h>
#include <asm/hap.h>
#include <asm/paging.h>
+#include <asm/p2m.h>
#include <asm/domain.h>
#include <xen/numa.h>

diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/mm/shadow/common.c Fri Sep 12 14:47:40 2008 +0900
@@ -39,6 +39,7 @@
#include <xen/numa.h>
#include "private.h"

+DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags);

/* Set up the shadow-specific parts of a domain struct at start of day.
* Called for every domain from arch_domain_create() */
@@ -630,6 +631,8 @@ void oos_fixup_add(struct vcpu *v, mfn_t

if ( mfn_x(oos_fixup[idx].smfn[next]) != INVALID_MFN )
{
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_EVICT);
+
/* Reuse this slot and remove current writable mapping. */
sh_remove_write_access_from_sl1p(v, gmfn,
oos_fixup[idx].smfn[next],
@@ -645,6 +648,8 @@ void oos_fixup_add(struct vcpu *v, mfn_t
oos_fixup[idx].smfn[next] = smfn;
oos_fixup[idx].off[next] = off;
oos_fixup[idx].next = (next + 1) % SHADOW_OOS_FIXUPS;
+
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_ADD);
return;
}
}
@@ -687,6 +692,16 @@ static int oos_remove_write_access(struc
}


+static inline void trace_resync(int event, mfn_t gmfn)
+{
+ if ( tb_init_done )
+ {
+ /* Convert gmfn to gfn */
+ unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
+ __trace_var(event, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn);
+ }
+}
+
/* Pull all the entries on an out-of-sync page back into sync. */
static void _sh_resync(struct vcpu *v, mfn_t gmfn,
struct oos_fixup *fixup, mfn_t snp)
@@ -700,8 +715,8 @@ static void _sh_resync(struct vcpu *v, m
& ~SHF_L1_ANY));
ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));

- SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, va=%lx\n",
- v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+ SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
+ v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));

/* Need to pull write access so the page *stays* in sync. */
if ( oos_remove_write_access(v, gmfn, fixup) )
@@ -719,6 +734,7 @@ static void _sh_resync(struct vcpu *v, m
/* Now we know all the entries are synced, and will stay that way */
pg->shadow_flags &= ~SHF_out_of_sync;
perfc_incr(shadow_resync);
+ trace_resync(TRC_SHADOW_RESYNC_FULL, gmfn);
}


@@ -930,6 +946,7 @@ void sh_resync_all(struct vcpu *v, int s
/* Update the shadows and leave the page OOS. */
if ( sh_skip_sync(v, oos[idx]) )
continue;
+ trace_resync(TRC_SHADOW_RESYNC_ONLY, oos[idx]);
_sh_resync_l1(other, oos[idx], oos_snapshot[idx]);
}
else
@@ -945,15 +962,16 @@ void sh_resync_all(struct vcpu *v, int s
}
}

-/* Allow a shadowed page to go out of sync */
+/* Allow a shadowed page to go out of sync. Unsyncs are traced in
+ * multi.c:sh_page_fault() */
int sh_unsync(struct vcpu *v, mfn_t gmfn)
{
struct page_info *pg;

ASSERT(shadow_locked_by_me(v->domain));

- SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx va %lx\n",
- v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+ SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
+ v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));

pg = mfn_to_page(gmfn);

@@ -970,6 +988,7 @@ int sh_unsync(struct vcpu *v, mfn_t gmfn
pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
oos_hash_add(v, gmfn);
perfc_incr(shadow_unsync);
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_UNSYNC);
return 1;
}

@@ -1005,6 +1024,7 @@ void shadow_promote(struct vcpu *v, mfn_

ASSERT(!test_bit(type, &page->shadow_flags));
set_bit(type, &page->shadow_flags);
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PROMOTE);
}

void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
@@ -1027,6 +1047,8 @@ void shadow_demote(struct vcpu *v, mfn_t
#endif
clear_bit(_PGC_page_table, &page->count_info);
}
+
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_DEMOTE);
}

/**************************************************************************/
@@ -1094,6 +1116,7 @@ sh_validate_guest_entry(struct vcpu *v,
ASSERT((page->shadow_flags
& (SHF_L4_64|SHF_L3_64|SHF_L2H_64|SHF_L2_64|SHF_L1_64)) == 0);
#endif
+ this_cpu(trace_shadow_path_flags) |= (result<<(TRCE_SFLAG_SET_CHANGED));

return result;
}
@@ -1295,6 +1318,18 @@ static void shadow_unhook_mappings(struc
}
}

+static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn)
+{
+ if ( tb_init_done )
+ {
+ /* Convert smfn to gfn */
+ unsigned long gfn;
+ ASSERT(mfn_valid(smfn));
+ gfn = mfn_to_gfn(d, _mfn(mfn_to_shadow_page(smfn)->backpointer));
+ __trace_var(TRC_SHADOW_PREALLOC_UNPIN, 0/*!tsc*/,
+ sizeof(gfn), (unsigned char*)&gfn);
+ }
+}

/* Make sure there are at least count order-sized pages
* available in the shadow page pool. */
@@ -1327,6 +1362,7 @@ static void _shadow_prealloc(
smfn = shadow_page_to_mfn(sp);

/* Unpin this top-level shadow */
+ trace_shadow_prealloc_unpin(d, smfn);
sh_unpin(v, smfn);

/* See if that freed up enough space */
@@ -1343,6 +1379,7 @@ static void _shadow_prealloc(
{
if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
{
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PREALLOC_UNHOOK);
shadow_unhook_mappings(v,
pagetable_get_mfn(v2->arch.shadow_table[i]));

@@ -2200,6 +2237,16 @@ void sh_destroy_shadow(struct vcpu *v, m
}
}

+static inline void trace_shadow_wrmap_bf(mfn_t gmfn)
+{
+ if ( tb_init_done )
+ {
+ /* Convert gmfn to gfn */
+ unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
+ __trace_var(TRC_SHADOW_WRMAP_BF, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn);
+ }
+}
+
/**************************************************************************/
/* Remove all writeable mappings of a guest frame from the shadow tables
* Returns non-zero if we need to flush TLBs.
@@ -2265,6 +2312,8 @@ int sh_remove_write_access(struct vcpu *
|| (pg->u.inuse.type_info & PGT_count_mask) == 0 )
return 0;

+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP);
+
perfc_incr(shadow_writeable);

/* If this isn't a "normal" writeable page, the domain is trying to
@@ -2285,11 +2334,14 @@ int sh_remove_write_access(struct vcpu *
* and that mapping is likely to be in the current pagetable,
* in the guest's linear map (on non-HIGHPTE linux and windows)*/

-#define GUESS(_a, _h) do { \
+#define GUESS(_a, _h) do { \
if ( v->arch.paging.mode->shadow.guess_wrmap(v, (_a), gmfn) ) \
- perfc_incr(shadow_writeable_h_ ## _h); \
- if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
- return 1; \
+ perfc_incr(shadow_writeable_h_ ## _h); \
+ if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
+ { \
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND); \
+ return 1; \
+ } \
} while (0)

if ( level == 0 && fault_addr )
@@ -2377,6 +2429,7 @@ int sh_remove_write_access(struct vcpu *
#endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */

/* Brute-force search of all the shadows, by walking the hash */
+ trace_shadow_wrmap_bf(gmfn);
if ( level == 0 )
perfc_incr(shadow_writeable_bf_1);
else
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/mm/shadow/multi.c Fri Sep 12 14:47:40 2008 +0900
@@ -225,6 +225,7 @@ static uint32_t set_ad_bits(void *guest_
static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
{
guest_intpte_t old, new;
+ int ret = 0;

old = *(guest_intpte_t *)walk_p;
new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
@@ -234,10 +235,16 @@ static uint32_t set_ad_bits(void *guest_
* into the guest table as well. If the guest table has changed
* under out feet then leave it alone. */
*(guest_intpte_t *)walk_p = new;
- if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old )
- return 1;
- }
- return 0;
+ if( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old )
+ ret = 1;
+
+ /* FIXME -- this code is longer than necessary */
+ if(set_dirty)
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_AD);
+ else
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_A);
+ }
+ return ret;
}

/* This validation is called with lock held, and after write permission
@@ -1432,6 +1439,7 @@ static int shadow_set_l1e(struct vcpu *v
{
/* About to install a new reference */
if ( shadow_mode_refcounts(d) ) {
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_GET_REF);
if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
{
/* Doesn't look like a pagetable. */
@@ -1461,6 +1469,7 @@ static int shadow_set_l1e(struct vcpu *v
{
shadow_vram_put_l1e(old_sl1e, sl1e, sl1mfn, d);
shadow_put_page_from_l1e(old_sl1e, d);
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_PUT_REF);
}
}
return flags;
@@ -2896,6 +2905,7 @@ static inline void check_for_early_unsha
{
perfc_incr(shadow_early_unshadow);
sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW);
}
v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
#endif
@@ -3012,6 +3022,132 @@ static void sh_prefetch(struct vcpu *v,

#endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */

+#if GUEST_PAGING_LEVELS == 4
+typedef u64 guest_va_t;
+typedef u64 guest_pa_t;
+#elif GUEST_PAGING_LEVELS == 3
+typedef u32 guest_va_t;
+typedef u64 guest_pa_t;
+#else
+typedef u32 guest_va_t;
+typedef u32 guest_pa_t;
+#endif
+
+static inline void trace_shadow_gen(u32 event, guest_va_t va)
+{
+ if ( tb_init_done )
+ {
+ event |= (GUEST_PAGING_LEVELS-2)<<8;
+ __trace_var(event, 0/*!tsc*/, sizeof(va), (unsigned char*)&va);
+ }
+}
+
+static inline void trace_shadow_fixup(guest_l1e_t gl1e,
+ guest_va_t va)
+{
+ if ( tb_init_done )
+ {
+ struct {
+ /* for PAE, guest_l1e may be 64 while guest_va may be 32;
+ so put it first for alignment sake. */
+ guest_l1e_t gl1e;
+ guest_va_t va;
+ u32 flags;
+ } __attribute__((packed)) d;
+ u32 event;
+
+ event = TRC_SHADOW_FIXUP | ((GUEST_PAGING_LEVELS-2)<<8);
+
+ d.gl1e = gl1e;
+ d.va = va;
+ d.flags = this_cpu(trace_shadow_path_flags);
+
+ __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
+ }
+}
+
+static inline void trace_not_shadow_fault(guest_l1e_t gl1e,
+ guest_va_t va)
+{
+ if ( tb_init_done )
+ {
+ struct {
+ /* for PAE, guest_l1e may be 64 while guest_va may be 32;
+ so put it first for alignment sake. */
+ guest_l1e_t gl1e;
+ guest_va_t va;
+ u32 flags;
+ } __attribute__((packed)) d;
+ u32 event;
+
+ event = TRC_SHADOW_NOT_SHADOW | ((GUEST_PAGING_LEVELS-2)<<8);
+
+ d.gl1e = gl1e;
+ d.va = va;
+ d.flags = this_cpu(trace_shadow_path_flags);
+
+ __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
+ }
+}
+
+static inline void trace_shadow_emulate_other(u32 event,
+ guest_va_t va,
+ gfn_t gfn)
+{
+ if ( tb_init_done )
+ {
+ struct {
+ /* for PAE, guest_l1e may be 64 while guest_va may be 32;
+ so put it first for alignment sake. */
+#if GUEST_PAGING_LEVELS == 2
+ u32 gfn;
+#else
+ u64 gfn;
+#endif
+ guest_va_t va;
+ } __attribute__((packed)) d;
+
+ event |= ((GUEST_PAGING_LEVELS-2)<<8);
+
+ d.gfn=gfn_x(gfn);
+ d.va = va;
+
+ __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
+ }
+}
+
+#if GUEST_PAGING_LEVELS == 3
+static DEFINE_PER_CPU(guest_va_t,trace_emulate_initial_va);
+static DEFINE_PER_CPU(int,trace_extra_emulation_count);
+#endif
+static DEFINE_PER_CPU(guest_pa_t,trace_emulate_write_val);
+
+static inline void trace_shadow_emulate(guest_l1e_t gl1e, unsigned long va)
+{
+ if ( tb_init_done )
+ {
+ struct {
+ /* for PAE, guest_l1e may be 64 while guest_va may be 32;
+ so put it first for alignment sake. */
+ guest_l1e_t gl1e, write_val;
+ guest_va_t va;
+ unsigned flags:29, emulation_count:3;
+ } __attribute__((packed)) d;
+ u32 event;
+
+ event = TRC_SHADOW_EMULATE | ((GUEST_PAGING_LEVELS-2)<<8);
+
+ d.gl1e = gl1e;
+ d.write_val.l1 = this_cpu(trace_emulate_write_val);
+ d.va = va;
+#if GUEST_PAGING_LEVELS == 3
+ d.emulation_count = this_cpu(trace_extra_emulation_count);
+#endif
+ d.flags = this_cpu(trace_shadow_path_flags);
+
+ __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
+ }
+}

/**************************************************************************/
/* Entry points into the shadow code */
@@ -3027,8 +3163,8 @@ static int sh_page_fault(struct vcpu *v,
{
struct domain *d = v->domain;
walk_t gw;
- gfn_t gfn;
- mfn_t gmfn, sl1mfn=_mfn(0);
+ gfn_t gfn = _gfn(0);
+ mfn_t gmfn, sl1mfn = _mfn(0);
shadow_l1e_t sl1e, *ptr_sl1e;
paddr_t gpa;
struct sh_emulate_ctxt emul_ctxt;
@@ -3043,7 +3179,7 @@ static int sh_page_fault(struct vcpu *v,

SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n",
v->domain->domain_id, v->vcpu_id, va, regs->error_code,
- regs->rip);
+ regs->eip);

perfc_incr(shadow_fault);

@@ -3132,6 +3268,7 @@ static int sh_page_fault(struct vcpu *v,
reset_early_unshadow(v);
perfc_incr(shadow_fault_fast_gnp);
SHADOW_PRINTK("fast path not-present\n");
+ trace_shadow_gen(TRC_SHADOW_FAST_PROPAGATE, va);
return 0;
}
else
@@ -3145,6 +3282,7 @@ static int sh_page_fault(struct vcpu *v,
perfc_incr(shadow_fault_fast_mmio);
SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
reset_early_unshadow(v);
+ trace_shadow_gen(TRC_SHADOW_FAST_MMIO, va);
return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
? EXCRET_fault_fixed : 0);
}
@@ -3155,6 +3293,7 @@ static int sh_page_fault(struct vcpu *v,
* Retry and let the hardware give us the right fault next time. */
perfc_incr(shadow_fault_fast_fail);
SHADOW_PRINTK("fast path false alarm!\n");
+ trace_shadow_gen(TRC_SHADOW_FALSE_FAST_PATH, va);
return EXCRET_fault_fixed;
}
}
@@ -3190,7 +3329,7 @@ static int sh_page_fault(struct vcpu *v,
perfc_incr(shadow_fault_bail_real_fault);
SHADOW_PRINTK("not a shadow fault\n");
reset_early_unshadow(v);
- return 0;
+ goto propagate;
}

/* It's possible that the guest has put pagetables in memory that it has
@@ -3200,7 +3339,7 @@ static int sh_page_fault(struct vcpu *v,
if ( unlikely(d->is_shutting_down) )
{
SHADOW_PRINTK("guest is shutting down\n");
- return 0;
+ goto propagate;
}

/* What kind of access are we dealing with? */
@@ -3218,7 +3357,7 @@ static int sh_page_fault(struct vcpu *v,
SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
gfn_x(gfn), mfn_x(gmfn));
reset_early_unshadow(v);
- return 0;
+ goto propagate;
}

#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
@@ -3229,6 +3368,8 @@ static int sh_page_fault(struct vcpu *v,

shadow_lock(d);

+ TRACE_CLEAR_PATH_FLAGS;
+
rc = gw_remove_write_accesses(v, va, &gw);

/* First bit set: Removed write access to a page. */
@@ -3281,6 +3422,7 @@ static int sh_page_fault(struct vcpu *v,
* Get out of the fault handler immediately. */
ASSERT(d->is_shutting_down);
shadow_unlock(d);
+ trace_shadow_gen(TRC_SHADOW_DOMF_DYING, va);
return 0;
}

@@ -3383,6 +3525,7 @@ static int sh_page_fault(struct vcpu *v,
d->arch.paging.log_dirty.fault_count++;
reset_early_unshadow(v);

+ trace_shadow_fixup(gw.l1e, va);
done:
sh_audit_gw(v, &gw);
SHADOW_PRINTK("fixed\n");
@@ -3405,6 +3548,8 @@ static int sh_page_fault(struct vcpu *v,
mfn_x(gmfn));
perfc_incr(shadow_fault_emulate_failed);
sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
+ trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_USER,
+ va, gfn);
goto done;
}

@@ -3421,6 +3566,8 @@ static int sh_page_fault(struct vcpu *v,
shadow_audit_tables(v);
shadow_unlock(d);

+ this_cpu(trace_emulate_write_val) = 0;
+
#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
early_emulation:
#endif
@@ -3446,6 +3593,8 @@ static int sh_page_fault(struct vcpu *v,
"injection: cr2=%#lx, mfn=%#lx\n",
va, mfn_x(gmfn));
sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
+ trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ,
+ va, gfn);
return EXCRET_fault_fixed;
}
}
@@ -3478,6 +3627,10 @@ static int sh_page_fault(struct vcpu *v,
* to support more operations in the emulator. More likely,
* though, this is a hint that this page should not be shadowed. */
shadow_remove_all_shadows(v, gmfn);
+
+ trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED,
+ va, gfn);
+ goto emulate_done;
}

#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
@@ -3504,7 +3657,8 @@ static int sh_page_fault(struct vcpu *v,

#if GUEST_PAGING_LEVELS == 3 /* PAE guest */
if ( r == X86EMUL_OKAY ) {
- int i;
+ int i, emulation_count=0;
+ this_cpu(trace_emulate_initial_va) = va;
/* Emulate up to four extra instructions in the hope of catching
* the "second half" of a 64-bit pagetable write. */
for ( i = 0 ; i < 4 ; i++ )
@@ -3513,10 +3667,12 @@ static int sh_page_fault(struct vcpu *v,
v->arch.paging.last_write_was_pt = 0;
r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
if ( r == X86EMUL_OKAY )
- {
+ {
+ emulation_count++;
if ( v->arch.paging.last_write_was_pt )
{
perfc_incr(shadow_em_ex_pt);
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN);
break; /* Don't emulate past the other half of the write */
}
else
@@ -3525,12 +3681,16 @@ static int sh_page_fault(struct vcpu *v,
else
{
perfc_incr(shadow_em_ex_fail);
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_LAST_FAILED);
break; /* Don't emulate again if we failed! */
}
}
+ this_cpu(trace_extra_emulation_count)=emulation_count;
}
#endif /* PAE guest */

+ trace_shadow_emulate(gw.l1e, va);
+ emulate_done:
SHADOW_PRINTK("emulated\n");
return EXCRET_fault_fixed;

@@ -3543,6 +3703,7 @@ static int sh_page_fault(struct vcpu *v,
shadow_audit_tables(v);
reset_early_unshadow(v);
shadow_unlock(d);
+ trace_shadow_gen(TRC_SHADOW_MMIO, va);
return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
? EXCRET_fault_fixed : 0);

@@ -3552,6 +3713,10 @@ static int sh_page_fault(struct vcpu *v,
shadow_audit_tables(v);
reset_early_unshadow(v);
shadow_unlock(d);
+
+propagate:
+ trace_not_shadow_fault(gw.l1e, va);
+
return 0;
}

@@ -3990,7 +4155,7 @@ sh_detach_old_tables(struct vcpu *v)
sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
v->arch.paging.shadow.guest_vtable = NULL;
}
-#endif
+#endif // !NDEBUG


////
@@ -4446,6 +4611,7 @@ static int sh_guess_wrmap(struct vcpu *v
sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
ASSERT( !(r & SHADOW_SET_ERROR) );
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND);
return 1;
}
#endif
@@ -4800,7 +4966,7 @@ static void emulate_unmap_dest(struct vc

static int
sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
- u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
+ u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
{
void *addr;

@@ -4814,6 +4980,22 @@ sh_x86_emulate_write(struct vcpu *v, uns

shadow_lock(v->domain);
memcpy(addr, src, bytes);
+
+ if ( tb_init_done )
+ {
+#if GUEST_PAGING_LEVELS == 3
+ if ( vaddr == this_cpu(trace_emulate_initial_va) )
+ memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
+ else if ( (vaddr & ~(0x7UL)) == this_cpu(trace_emulate_initial_va) )
+ {
+ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATE_FULL_PT);
+ memcpy(&this_cpu(trace_emulate_write_val),
+ (void *)(((unsigned long) addr) & ~(0x7UL)), GUEST_PTE_SIZE);
+ }
+#else
+ memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
+#endif
+ }

emulate_unmap_dest(v, addr, bytes, sh_ctxt);
shadow_audit_tables(v);
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/mm/shadow/private.h Fri Sep 12 14:47:40 2008 +0900
@@ -90,6 +90,43 @@ extern int shadow_audit_enable;
#define SHADOW_DEBUG_EMULATE 1
#define SHADOW_DEBUG_P2M 1
#define SHADOW_DEBUG_LOGDIRTY 0
+
+/******************************************************************************
+ * Tracing
+ */
+DECLARE_PER_CPU(uint32_t,trace_shadow_path_flags);
+
+#define TRACE_SHADOW_PATH_FLAG(_x) \
+ do { \
+ this_cpu(trace_shadow_path_flags) |= (1<<(_x)); \
+ } while(0)
+
+#define TRACE_CLEAR_PATH_FLAGS \
+ this_cpu(trace_shadow_path_flags) = 0
+
+enum {
+ TRCE_SFLAG_SET_AD,
+ TRCE_SFLAG_SET_A,
+ TRCE_SFLAG_SHADOW_L1_GET_REF,
+ TRCE_SFLAG_SHADOW_L1_PUT_REF,
+ TRCE_SFLAG_L2_PROPAGATE,
+ TRCE_SFLAG_SET_CHANGED,
+ TRCE_SFLAG_SET_FLUSH,
+ TRCE_SFLAG_SET_ERROR,
+ TRCE_SFLAG_DEMOTE,
+ TRCE_SFLAG_PROMOTE,
+ TRCE_SFLAG_WRMAP,
+ TRCE_SFLAG_WRMAP_GUESS_FOUND,
+ TRCE_SFLAG_WRMAP_BRUTE_FORCE,
+ TRCE_SFLAG_EARLY_UNSHADOW,
+ TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN,
+ TRCE_SFLAG_EMULATION_LAST_FAILED,
+ TRCE_SFLAG_EMULATE_FULL_PT,
+ TRCE_SFLAG_PREALLOC_UNHOOK,
+ TRCE_SFLAG_UNSYNC,
+ TRCE_SFLAG_OOS_FIXUP_ADD,
+ TRCE_SFLAG_OOS_FIXUP_EVICT,
+};

/******************************************************************************
* The shadow lock.
@@ -143,6 +180,12 @@ extern int shadow_audit_enable;
} while (0)


+/* Size (in bytes) of a guest PTE */
+#if GUEST_PAGING_LEVELS >= 3
+# define GUEST_PTE_SIZE 8
+#else
+# define GUEST_PTE_SIZE 4
+#endif

/******************************************************************************
* Auditing routines
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/physdev.c
--- a/xen/arch/x86/physdev.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/physdev.c Fri Sep 12 14:47:40 2008 +0900
@@ -58,9 +58,6 @@ static int get_free_pirq(struct domain *
return i;
}

-/*
- * Caller hold the irq_lock
- */
static int map_domain_pirq(struct domain *d, int pirq, int vector,
struct physdev_map_pirq *map)
{
@@ -136,13 +133,12 @@ done:
return ret;
}

-/*
- * The pirq should has been unbound before this call
- */
+/* The pirq should have been unbound before this call. */
static int unmap_domain_pirq(struct domain *d, int pirq)
{
- int ret = 0;
- int vector;
+ unsigned long flags;
+ irq_desc_t *desc;
+ int vector, ret = 0;

if ( d == NULL || pirq < 0 || pirq >= NR_PIRQS )
return -EINVAL;
@@ -159,33 +155,29 @@ static int unmap_domain_pirq(struct doma
gdprintk(XENLOG_G_ERR, "domain %X: pirq %x not mapped still\n",
d->domain_id, pirq);
ret = -EINVAL;
- }
- else
- {
- unsigned long flags;
- irq_desc_t *desc;
-
- desc = &irq_desc[vector];
- spin_lock_irqsave(&desc->lock, flags);
- if ( desc->msi_desc )
- pci_disable_msi(vector);
-
- if ( desc->handler == &pci_msi_type )
- {
- /* MSI is not shared, so should be released already */
- BUG_ON(desc->status & IRQ_GUEST);
- irq_desc[vector].handler = &no_irq_type;
- }
- spin_unlock_irqrestore(&desc->lock, flags);
-
- d->arch.pirq_vector[pirq] = d->arch.vector_pirq[vector] = 0;
- }
+ goto done;
+ }
+
+ desc = &irq_desc[vector];
+ spin_lock_irqsave(&desc->lock, flags);
+ if ( desc->msi_desc )
+ pci_disable_msi(vector);
+
+ if ( desc->handler == &pci_msi_type )
+ {
+ /* MSI is not shared, so should be released already */
+ BUG_ON(desc->status & IRQ_GUEST);
+ irq_desc[vector].handler = &no_irq_type;
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ d->arch.pirq_vector[pirq] = d->arch.vector_pirq[vector] = 0;

ret = irq_deny_access(d, pirq);
-
if ( ret )
gdprintk(XENLOG_G_ERR, "deny irq %x access failed\n", pirq);

+ done:
return ret;
}

@@ -194,10 +186,6 @@ static int physdev_map_pirq(struct physd
struct domain *d;
int vector, pirq, ret = 0;
unsigned long flags;
-
- /* if msi_enable is not enabled, map always succeeds */
- if ( !msi_enable )
- return 0;

if ( !IS_PRIV(current->domain) )
return -EPERM;
@@ -308,14 +296,8 @@ static int physdev_unmap_pirq(struct phy
unsigned long flags;
int ret;

- if ( !msi_enable )
- return 0;
-
if ( !IS_PRIV(current->domain) )
return -EPERM;
-
- if ( !unmap )
- return -EINVAL;

if ( unmap->domid == DOMID_SELF )
d = rcu_lock_domain(current->domain);
@@ -323,14 +305,12 @@ static int physdev_unmap_pirq(struct phy
d = rcu_lock_domain_by_id(unmap->domid);

if ( d == NULL )
- {
- rcu_unlock_domain(d);
return -ESRCH;
- }

spin_lock_irqsave(&d->arch.irq_lock, flags);
ret = unmap_domain_pirq(d, unmap->pirq);
spin_unlock_irqrestore(&d->arch.irq_lock, flags);
+
rcu_unlock_domain(d);

return ret;
@@ -452,20 +432,14 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H

irq = irq_op.irq;
ret = -EINVAL;
- if ( ((irq < 0) && (irq != AUTO_ASSIGN)) || (irq >= NR_IRQS) )
+ if ( (irq < 0) || (irq >= NR_IRQS) )
break;

irq_op.vector = assign_irq_vector(irq);

- ret = 0;
-
- if ( msi_enable )
- {
- spin_lock_irqsave(&dom0->arch.irq_lock, flags);
- if ( irq != AUTO_ASSIGN )
- ret = map_domain_pirq(dom0, irq_op.irq, irq_op.vector, NULL);
- spin_unlock_irqrestore(&dom0->arch.irq_lock, flags);
- }
+ spin_lock_irqsave(&dom0->arch.irq_lock, flags);
+ ret = map_domain_pirq(dom0, irq_op.irq, irq_op.vector, NULL);
+ spin_unlock_irqrestore(&dom0->arch.irq_lock, flags);

if ( copy_to_guest(arg, &irq_op, 1) != 0 )
ret = -EFAULT;
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/platform_hypercall.c Fri Sep 12 14:47:40 2008 +0900
@@ -192,6 +192,10 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
break;

case XENPF_firmware_info:
+ ret = xsm_firmware_info();
+ if ( ret )
+ break;
+
switch ( op->u.firmware_info.type )
{
case XEN_FW_DISK_INFO: {
@@ -280,10 +284,18 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
break;

case XENPF_enter_acpi_sleep:
+ ret = xsm_acpi_sleep();
+ if ( ret )
+ break;
+
ret = acpi_enter_sleep(&op->u.enter_acpi_sleep);
break;

case XENPF_change_freq:
+ ret = xsm_change_freq();
+ if ( ret )
+ break;
+
ret = -ENOSYS;
if ( cpufreq_controller != FREQCTL_dom0_kernel )
break;
@@ -305,6 +317,10 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
cpumask_t cpumap;
XEN_GUEST_HANDLE(uint8) cpumap_bitmap;
XEN_GUEST_HANDLE(uint64) idletimes;
+
+ ret = xsm_getidletime();
+ if ( ret )
+ break;

ret = -ENOSYS;
if ( cpufreq_controller != FREQCTL_dom0_kernel )
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/smpboot.c Fri Sep 12 14:47:40 2008 +0900
@@ -1225,15 +1225,6 @@ int __cpu_disable(void)
if (cpu == 0)
return -EBUSY;

- /*
- * Only S3 is using this path, and thus idle vcpus are running on all
- * APs when we are called. To support full cpu hotplug, other
- * notification mechanisms should be introduced (e.g., migrate vcpus
- * off this physical cpu before rendezvous point).
- */
- if (!is_idle_vcpu(current))
- return -EINVAL;
-
local_irq_disable();
clear_local_APIC();
/* Allow any queued timer interrupts to get serviced */
@@ -1249,6 +1240,9 @@ int __cpu_disable(void)
fixup_irqs(map);
/* It's now safe to remove this processor from the online map */
cpu_clear(cpu, cpu_online_map);
+
+ cpu_disable_scheduler();
+
return 0;
}

@@ -1275,28 +1269,6 @@ static int take_cpu_down(void *unused)
return __cpu_disable();
}

-/*
- * XXX: One important thing missed here is to migrate vcpus
- * from dead cpu to other online ones and then put whole
- * system into a stop state. It assures a safe environment
- * for a cpu hotplug/remove at normal running state.
- *
- * However for xen PM case, at this point:
- * -> All other domains should be notified with PM event,
- * and then in following states:
- * * Suspend state, or
- * * Paused state, which is a force step to all
- * domains if they do nothing to suspend
- * -> All vcpus of dom0 (except vcpu0) have already beem
- * hot removed
- * with the net effect that all other cpus only have idle vcpu
- * running. In this special case, we can avoid vcpu migration
- * then and system can be considered in a stop state.
- *
- * So current cpu hotplug is a special version for PM specific
- * usage, and need more effort later for full cpu hotplug.
- * (ktian1)
- */
int cpu_down(unsigned int cpu)
{
int err = 0;
@@ -1304,6 +1276,12 @@ int cpu_down(unsigned int cpu)
spin_lock(&cpu_add_remove_lock);
if (num_online_cpus() == 1) {
err = -EBUSY;
+ goto out;
+ }
+
+ /* Can not offline BSP */
+ if (cpu == 0) {
+ err = -EINVAL;
goto out;
}

diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/time.c
--- a/xen/arch/x86/time.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/time.c Fri Sep 12 14:47:40 2008 +0900
@@ -993,15 +993,16 @@ static void local_time_calibration(void)
* All CPUS snapshot their local TSC and extrapolation of system time.
*/
struct calibration_rendezvous {
+ cpumask_t cpu_calibration_map;
atomic_t nr_cpus;
s_time_t master_stime;
};

static void time_calibration_rendezvous(void *_r)
{
- unsigned int total_cpus = num_online_cpus();
struct cpu_calibration *c = &this_cpu(cpu_calibration);
struct calibration_rendezvous *r = _r;
+ unsigned int total_cpus = cpus_weight(r->cpu_calibration_map);

if ( smp_processor_id() == 0 )
{
@@ -1029,11 +1030,13 @@ static void time_calibration(void *unuse
static void time_calibration(void *unused)
{
struct calibration_rendezvous r = {
+ .cpu_calibration_map = cpu_online_map,
.nr_cpus = ATOMIC_INIT(0)
};

/* @wait=1 because we must wait for all cpus before freeing @r. */
- on_each_cpu(time_calibration_rendezvous, &r, 0, 1);
+ on_selected_cpus(r.cpu_calibration_map,
+ time_calibration_rendezvous, &r, 0, 1);
}

void init_percpu_time(void)
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/traps.c Fri Sep 12 14:47:40 2008 +0900
@@ -47,7 +47,7 @@
#include <xen/version.h>
#include <xen/kexec.h>
#include <xen/trace.h>
-#include <asm/paging.h>
+#include <xen/paging.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/atomic.h>
@@ -2116,6 +2116,36 @@ static int emulate_privileged_op(struct
if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
goto fail;
break;
+ case MSR_AMD64_NB_CFG:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+ boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
+ goto fail;
+ if ( !IS_PRIV(v->domain) )
+ break;
+ if ( (rdmsr_safe(MSR_AMD64_NB_CFG, l, h) != 0) ||
+ (eax != l) ||
+ ((edx ^ h) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
+ goto invalid;
+ if ( wrmsr_safe(MSR_AMD64_NB_CFG, eax, edx) != 0 )
+ goto fail;
+ break;
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+ boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
+ goto fail;
+ if ( !IS_PRIV(v->domain) )
+ break;
+ if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, l, h) != 0) ||
+ (((((u64)h << 32) | l) ^ res) &
+ ~((1 << FAM10H_MMIO_CONF_ENABLE_BIT) |
+ (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
+ FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
+ ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
+ FAM10H_MMIO_CONF_BASE_SHIFT))) )
+ goto invalid;
+ if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, eax, edx) != 0 )
+ goto fail;
+ break;
case MSR_IA32_PERF_CTL:
if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
goto fail;
@@ -2124,11 +2154,18 @@ static int emulate_privileged_op(struct
if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
goto fail;
break;
+ case MSR_IA32_THERM_CONTROL:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+ goto fail;
+ if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
+ goto fail;
+ break;
default:
if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
break;
if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
(eax != l) || (edx != h) )
+ invalid:
gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
"%08x:%08x to %08x:%08x.\n",
_p(regs->ecx), h, l, edx, eax);
@@ -2198,6 +2235,12 @@ static int emulate_privileged_op(struct
regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
+ break;
+ case MSR_IA32_THERM_CONTROL:
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+ goto fail;
+ if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
+ goto fail;
break;
default:
if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/domain.c
--- a/xen/common/domain.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/domain.c Fri Sep 12 14:47:40 2008 +0900
@@ -651,9 +651,11 @@ void vcpu_reset(struct vcpu *v)

set_bit(_VPF_down, &v->pause_flags);

+ clear_bit(v->vcpu_id, d->poll_mask);
+ v->poll_evtchn = 0;
+
v->fpu_initialised = 0;
v->fpu_dirtied = 0;
- v->is_polling = 0;
v->is_initialised = 0;
v->nmi_pending = 0;
v->mce_pending = 0;
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/domctl.c
--- a/xen/common/domctl.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/domctl.c Fri Sep 12 14:47:40 2008 +0900
@@ -655,9 +655,6 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
spin_lock(&d->page_alloc_lock);
if ( new_max >= d->tot_pages )
{
- ret = guest_physmap_max_mem_pages(d, new_max);
- if ( ret != 0 )
- break;
d->max_pages = new_max;
ret = 0;
}
@@ -729,16 +726,11 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
if ( d == NULL )
break;

- ret = xsm_irq_permission(d, pirq, op->u.irq_permission.allow_access);
- if ( ret )
- goto irq_permission_out;
-
if ( op->u.irq_permission.allow_access )
ret = irq_permit_access(d, pirq);
else
ret = irq_deny_access(d, pirq);

- irq_permission_out:
rcu_unlock_domain(d);
}
break;
@@ -757,17 +749,12 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
d = rcu_lock_domain_by_id(op->domain);
if ( d == NULL )
break;
-
- ret = xsm_iomem_permission(d, mfn, op->u.iomem_permission.allow_access);
- if ( ret )
- goto iomem_permission_out;

if ( op->u.iomem_permission.allow_access )
ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1);
else
ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1);

- iomem_permission_out:
rcu_unlock_domain(d);
}
break;
@@ -813,6 +800,12 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
{
put_domain(e);
goto set_target_out;
+ }
+
+ ret = xsm_set_target(d, e);
+ if ( ret ) {
+ put_domain(e);
+ goto set_target_out;
}

/* Hold reference on @e until we destroy @d. */
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/event_channel.c
--- a/xen/common/event_channel.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/event_channel.c Fri Sep 12 14:47:40 2008 +0900
@@ -545,6 +545,7 @@ static int evtchn_set_pending(struct vcp
static int evtchn_set_pending(struct vcpu *v, int port)
{
struct domain *d = v->domain;
+ int vcpuid;

/*
* The following bit operations must happen in strict order.
@@ -564,15 +565,19 @@ static int evtchn_set_pending(struct vcp
}

/* Check if some VCPU might be polling for this event. */
- if ( unlikely(d->is_polling) )
- {
- d->is_polling = 0;
- smp_mb(); /* check vcpu poll-flags /after/ clearing domain poll-flag */
- for_each_vcpu ( d, v )
+ if ( likely(bitmap_empty(d->poll_mask, MAX_VIRT_CPUS)) )
+ return 0;
+
+ /* Wake any interested (or potentially interested) pollers. */
+ for ( vcpuid = find_first_bit(d->poll_mask, MAX_VIRT_CPUS);
+ vcpuid < MAX_VIRT_CPUS;
+ vcpuid = find_next_bit(d->poll_mask, MAX_VIRT_CPUS, vcpuid+1) )
+ {
+ v = d->vcpu[vcpuid];
+ if ( ((v->poll_evtchn <= 0) || (v->poll_evtchn == port)) &&
+ test_and_clear_bit(vcpuid, d->poll_mask) )
{
- if ( !v->is_polling )
- continue;
- v->is_polling = 0;
+ v->poll_evtchn = 0;
vcpu_unblock(v);
}
}
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/rangeset.c
--- a/xen/common/rangeset.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/rangeset.c Fri Sep 12 14:47:40 2008 +0900
@@ -10,6 +10,7 @@
#include <xen/sched.h>
#include <xen/errno.h>
#include <xen/rangeset.h>
+#include <xsm/xsm.h>

/* An inclusive range [s,e] and pointer to next range in ascending order. */
struct range {
@@ -95,6 +96,10 @@ int rangeset_add_range(
{
struct range *x, *y;
int rc = 0;
+
+ rc = xsm_add_range(r->domain, r->name, s, e);
+ if ( rc )
+ return rc;

ASSERT(s <= e);

@@ -164,6 +169,10 @@ int rangeset_remove_range(
struct range *x, *y, *t;
int rc = 0;

+ rc = xsm_remove_range(r->domain, r->name, s, e);
+ if ( rc )
+ return rc;
+
ASSERT(s <= e);

spin_lock(&r->lock);
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/sched_credit.c
--- a/xen/common/sched_credit.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/sched_credit.c Fri Sep 12 14:47:40 2008 +0900
@@ -1107,6 +1107,10 @@ csched_load_balance(int cpu, struct csch

BUG_ON( cpu != snext->vcpu->processor );

+ /* If this CPU is going offline we shouldn't steal work. */
+ if ( unlikely(!cpu_online(cpu)) )
+ goto out;
+
if ( snext->pri == CSCHED_PRI_IDLE )
CSCHED_STAT_CRANK(load_balance_idle);
else if ( snext->pri == CSCHED_PRI_TS_OVER )
@@ -1149,6 +1153,7 @@ csched_load_balance(int cpu, struct csch
return speer;
}

+ out:
/* Failed to find more important work elsewhere... */
__runq_remove(snext);
return snext;
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/schedule.c
--- a/xen/common/schedule.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/schedule.c Fri Sep 12 14:47:40 2008 +0900
@@ -63,11 +63,31 @@ static struct scheduler ops;
(( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ ) \
: (typeof(ops.fn(__VA_ARGS__)))0 )

+static inline void trace_runstate_change(struct vcpu *v, int new_state)
+{
+ struct { uint32_t vcpu:16, domain:16; } d;
+ uint32_t event;
+
+ if ( likely(!tb_init_done) )
+ return;
+
+ d.vcpu = v->vcpu_id;
+ d.domain = v->domain->domain_id;
+
+ event = TRC_SCHED_RUNSTATE_CHANGE;
+ event |= ( v->runstate.state & 0x3 ) << 8;
+ event |= ( new_state & 0x3 ) << 4;
+
+ __trace_var(event, 1/*tsc*/, sizeof(d), (unsigned char *)&d);
+}
+
static inline void vcpu_runstate_change(
struct vcpu *v, int new_state, s_time_t new_entry_time)
{
ASSERT(v->runstate.state != new_state);
ASSERT(spin_is_locked(&per_cpu(schedule_data,v->processor).schedule_lock));
+
+ trace_runstate_change(v, new_state);

v->runstate.time[v->runstate.state] +=
new_entry_time - v->runstate.state_entry_time;
@@ -198,6 +218,27 @@ void vcpu_wake(struct vcpu *v)
TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}

+void vcpu_unblock(struct vcpu *v)
+{
+ if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
+ return;
+
+ /* Polling period ends when a VCPU is unblocked. */
+ if ( unlikely(v->poll_evtchn != 0) )
+ {
+ v->poll_evtchn = 0;
+ /*
+ * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
+ * this VCPU (and it then going back to sleep on poll_mask).
+ * Test-and-clear is idiomatic and ensures clear_bit not reordered.
+ */
+ if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
+ clear_bit(_VPF_blocked, &v->pause_flags);
+ }
+
+ vcpu_wake(v);
+}
+
static void vcpu_migrate(struct vcpu *v)
{
unsigned long flags;
@@ -247,6 +288,48 @@ void vcpu_force_reschedule(struct vcpu *
}
}

+/*
+ * This function is used by cpu_hotplug code from stop_machine context.
+ * Hence we can avoid needing to take the schedule_lock.
+ */
+void cpu_disable_scheduler(void)
+{
+ struct domain *d;
+ struct vcpu *v;
+ unsigned int cpu = smp_processor_id();
+
+ for_each_domain ( d )
+ {
+ for_each_vcpu ( d, v )
+ {
+ if ( is_idle_vcpu(v) )
+ continue;
+
+ if ( (cpus_weight(v->cpu_affinity) == 1) &&
+ cpu_isset(cpu, v->cpu_affinity) )
+ {
+ printk("Breaking vcpu affinity for domain %d vcpu %d\n",
+ v->domain->domain_id, v->vcpu_id);
+ cpus_setall(v->cpu_affinity);
+ }
+
+ /*
+ * Migrate single-shot timers to CPU0. A new cpu will automatically
+ * be chosen when the timer is next re-set.
+ */
+ if ( v->singleshot_timer.cpu == cpu )
+ migrate_timer(&v->singleshot_timer, 0);
+
+ if ( v->processor == cpu )
+ {
+ set_bit(_VPF_migrating, &v->pause_flags);
+ vcpu_sleep_nosync(v);
+ vcpu_migrate(v);
+ }
+ }
+ }
+}
+
static int __vcpu_set_affinity(
struct vcpu *v, cpumask_t *affinity,
bool_t old_lock_status, bool_t new_lock_status)
@@ -337,7 +420,7 @@ static long do_poll(struct sched_poll *s
struct vcpu *v = current;
struct domain *d = v->domain;
evtchn_port_t port;
- long rc = 0;
+ long rc;
unsigned int i;

/* Fairly arbitrary limit. */
@@ -348,11 +431,24 @@ static long do_poll(struct sched_poll *s
return -EFAULT;

set_bit(_VPF_blocked, &v->pause_flags);
- v->is_polling = 1;
- d->is_polling = 1;
-
+ v->poll_evtchn = -1;
+ set_bit(v->vcpu_id, d->poll_mask);
+
+#ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
/* Check for events /after/ setting flags: avoids wakeup waiting race. */
- smp_wmb();
+ smp_mb();
+
+ /*
+ * Someone may have seen we are blocked but not that we are polling, or
+ * vice versa. We are certainly being woken, so clean up and bail. Beyond
+ * this point others can be guaranteed to clean up for us if they wake us.
+ */
+ rc = 0;
+ if ( (v->poll_evtchn == 0) ||
+ !test_bit(_VPF_blocked, &v->pause_flags) ||
+ !test_bit(v->vcpu_id, d->poll_mask) )
+ goto out;
+#endif

for ( i = 0; i < sched_poll->nr_ports; i++ )
{
@@ -369,6 +465,9 @@ static long do_poll(struct sched_poll *s
goto out;
}

+ if ( sched_poll->nr_ports == 1 )
+ v->poll_evtchn = port;
+
if ( sched_poll->timeout != 0 )
set_timer(&v->poll_timer, sched_poll->timeout);

@@ -378,7 +477,8 @@ static long do_poll(struct sched_poll *s
return 0;

out:
- v->is_polling = 0;
+ v->poll_evtchn = 0;
+ clear_bit(v->vcpu_id, d->poll_mask);
clear_bit(_VPF_blocked, &v->pause_flags);
return rc;
}
@@ -628,7 +728,9 @@ static void vcpu_periodic_timer_work(str
return;

periodic_next_event = v->periodic_last_event + v->periodic_period;
- if ( now > periodic_next_event )
+
+ /* The timer subsystem may call us up to TIME_SLOP ahead of deadline. */
+ if ( (now + TIME_SLOP) > periodic_next_event )
{
send_timer_event(v);
v->periodic_last_event = now;
@@ -758,11 +860,8 @@ static void poll_timer_fn(void *data)
{
struct vcpu *v = data;

- if ( !v->is_polling )
- return;
-
- v->is_polling = 0;
- vcpu_unblock(v);
+ if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
+ vcpu_unblock(v);
}

/* Initialise the data structures. */
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/sysctl.c
--- a/xen/common/sysctl.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/sysctl.c Fri Sep 12 14:47:40 2008 +0900
@@ -149,6 +149,10 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc
char c;
uint32_t i;

+ ret = xsm_debug_keys();
+ if ( ret )
+ break;
+
for ( i = 0; i < op->u.debug_keys.nr_keys; i++ )
{
if ( copy_from_guest_offset(&c, op->u.debug_keys.keys, i, 1) )
@@ -166,6 +170,10 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc

nr_cpus = min_t(uint32_t, op->u.getcpuinfo.max_cpus, NR_CPUS);

+ ret = xsm_getcpuinfo();
+ if ( ret )
+ break;
+
for ( i = 0; i < nr_cpus; i++ )
{
/* Assume no holes in idle-vcpu map. */
@@ -188,6 +196,10 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc

case XEN_SYSCTL_availheap:
{
+ ret = xsm_availheap();
+ if ( ret )
+ break;
+
op->u.availheap.avail_bytes = avail_domheap_pages_region(
op->u.availheap.node,
op->u.availheap.min_bitwidth,
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/trace.c
--- a/xen/common/trace.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/trace.c Fri Sep 12 14:47:40 2008 +0900
@@ -58,6 +58,7 @@ static int t_buf_highwater;

/* Number of records lost due to per-CPU trace buffer being full. */
static DEFINE_PER_CPU(unsigned long, lost_records);
+static DEFINE_PER_CPU(unsigned long, lost_records_first_tsc);

/* a flag recording whether initialization has been done */
/* or more properly, if the tbuf subsystem is enabled right now */
@@ -147,6 +148,31 @@ static int tb_set_size(int size)
return 0;
}

+int trace_will_trace_event(u32 event)
+{
+ if ( !tb_init_done )
+ return 0;
+
+ /*
+ * Copied from __trace_var()
+ */
+ if ( (tb_event_mask & event) == 0 )
+ return 0;
+
+ /* match class */
+ if ( ((tb_event_mask >> TRC_CLS_SHIFT) & (event >> TRC_CLS_SHIFT)) == 0 )
+ return 0;
+
+ /* then match subclass */
+ if ( (((tb_event_mask >> TRC_SUBCLS_SHIFT) & 0xf )
+ & ((event >> TRC_SUBCLS_SHIFT) & 0xf )) == 0 )
+ return 0;
+
+ if ( !cpu_isset(smp_processor_id(), tb_cpu_mask) )
+ return 0;
+
+ return 1;
+}

/**
* init_trace_bufs - performs initialization of the per-cpu trace buffers.
@@ -354,22 +380,27 @@ static inline int insert_wrap_record(str
NULL);
}

-#define LOST_REC_SIZE 8
+#define LOST_REC_SIZE (4 + 8 + 16) /* header + tsc + sizeof(struct ed) */

static inline int insert_lost_records(struct t_buf *buf)
{
struct {
u32 lost_records;
- } ed;
-
+ u32 did:16, vid:16;
+ u64 first_tsc;
+ } __attribute__((packed)) ed;
+
+ ed.vid = current->vcpu_id;
+ ed.did = current->domain->domain_id;
ed.lost_records = this_cpu(lost_records);
+ ed.first_tsc = this_cpu(lost_records_first_tsc);

this_cpu(lost_records) = 0;

return __insert_record(buf,
TRC_LOST_RECORDS,
sizeof(ed),
- 0 /* !cycles */,
+ 1 /* cycles */,
LOST_REC_SIZE,
(unsigned char *)&ed);
}
@@ -401,7 +432,8 @@ void __trace_var(u32 event, int cycles,
int extra_word;
int started_below_highwater;

- ASSERT(tb_init_done);
+ if( !tb_init_done )
+ return;

/* Convert byte count into word count, rounding up */
extra_word = (extra / sizeof(u32));
@@ -479,7 +511,8 @@ void __trace_var(u32 event, int cycles,
/* Do we have enough space for everything? */
if ( total_size > bytes_to_tail )
{
- this_cpu(lost_records)++;
+ if ( ++this_cpu(lost_records) == 1 )
+ this_cpu(lost_records_first_tsc)=(u64)get_cycles();
local_irq_restore(flags);
return;
}
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/drivers/acpi/hwregs.c
--- a/xen/drivers/acpi/hwregs.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/drivers/acpi/hwregs.c Fri Sep 12 14:47:40 2008 +0900
@@ -239,11 +239,13 @@ acpi_status acpi_set_register(u32 regist

case ACPI_REGISTER_PM2_CONTROL:

+#if 0 /* Redundant read in original Linux code. */
status = acpi_hw_register_read(ACPI_REGISTER_PM2_CONTROL,
&register_value);
if (ACPI_FAILURE(status)) {
goto unlock_and_exit;
}
+#endif

ACPI_DEBUG_PRINT((ACPI_DB_IO,
"PM2 control: Read %X from %8.8X%8.8X\n",
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/drivers/passthrough/iommu.c
--- a/xen/drivers/passthrough/iommu.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/drivers/passthrough/iommu.c Fri Sep 12 14:47:40 2008 +0900
@@ -33,11 +33,13 @@ int amd_iov_detect(void);
* pv Enable IOMMU for PV domains
* no-pv Disable IOMMU for PV domains (default)
* force|required Don't boot unless IOMMU is enabled
+ * passthrough Bypass VT-d translation for Dom0
*/
custom_param("iommu", parse_iommu_param);
int iommu_enabled = 0;
int iommu_pv_enabled = 0;
int force_iommu = 0;
+int iommu_passthrough = 0;

static void __init parse_iommu_param(char *s)
{
@@ -58,6 +60,8 @@ static void __init parse_iommu_param(cha
iommu_pv_enabled = 0;
else if ( !strcmp(s, "force") || !strcmp(s, "required") )
force_iommu = 1;
+ else if ( !strcmp(s, "passthrough") )
+ iommu_passthrough = 1;

s = ss + 1;
} while ( ss );
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/drivers/passthrough/vtd/iommu.c
--- a/xen/drivers/passthrough/vtd/iommu.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/drivers/passthrough/vtd/iommu.c Fri Sep 12 14:47:40 2008 +0900
@@ -1090,12 +1090,13 @@ static int domain_context_mapping_one(
}

spin_lock_irqsave(&iommu->lock, flags);
-
-#ifdef CONTEXT_PASSTHRU
- if ( ecap_pass_thru(iommu->ecap) && (domain->domain_id == 0) )
+ if ( iommu_passthrough &&
+ ecap_pass_thru(iommu->ecap) && (domain->domain_id == 0) )
+ {
context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
+ agaw = level_to_agaw(iommu->nr_pt_levels);
+ }
else
-#endif
{
/* Ensure we have pagetables allocated down to leaf PTE. */
if ( hd->pgd_maddr == 0 )
@@ -1459,11 +1460,13 @@ int intel_iommu_map_page(
u64 pg_maddr;
int pte_present;

-#ifdef CONTEXT_PASSTHRU
+ drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+ iommu = drhd->iommu;
+
/* do nothing if dom0 and iommu supports pass thru */
- if ( ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
+ if ( iommu_passthrough &&
+ ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
return 0;
-#endif

pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
if ( pg_maddr == 0 )
@@ -1500,11 +1503,10 @@ int intel_iommu_unmap_page(struct domain
drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
iommu = drhd->iommu;

-#ifdef CONTEXT_PASSTHRU
/* do nothing if dom0 and iommu supports pass thru */
- if ( ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
+ if ( iommu_passthrough &&
+ ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
return 0;
-#endif

dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);

diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-ia64/shadow.h
--- a/xen/include/asm-ia64/shadow.h Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-ia64/shadow.h Fri Sep 12 14:47:40 2008 +0900
@@ -63,8 +63,6 @@ shadow_mark_page_dirty(struct domain *d,
return 0;
}

-#define guest_physmap_max_mem_pages(d, n) (0)
-
#endif // _XEN_SHADOW_H

/*
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/bitops.h
--- a/xen/include/asm-x86/bitops.h Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-x86/bitops.h Fri Sep 12 14:47:40 2008 +0900
@@ -116,8 +116,8 @@ static inline void __clear_bit(int nr, v
__clear_bit(nr, addr); \
})

-#define smp_mb__before_clear_bit() barrier()
-#define smp_mb__after_clear_bit() barrier()
+#define smp_mb__before_clear_bit() ((void)0)
+#define smp_mb__after_clear_bit() ((void)0)

/**
* __change_bit - Toggle a bit in memory
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/guest_access.h
--- a/xen/include/asm-x86/guest_access.h Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-x86/guest_access.h Fri Sep 12 14:47:40 2008 +0900
@@ -8,7 +8,7 @@
#define __ASM_X86_GUEST_ACCESS_H__

#include <asm/uaccess.h>
-#include <asm/shadow.h>
+#include <asm/paging.h>
#include <asm/hvm/support.h>
#include <asm/hvm/guest_access.h>

@@ -87,10 +87,10 @@
* Allows use of faster __copy_* functions.
*/
#define guest_handle_okay(hnd, nr) \
- (shadow_mode_external(current->domain) || \
+ (paging_mode_external(current->domain) || \
array_access_ok((hnd).p, (nr), sizeof(*(hnd).p)))
#define guest_handle_subrange_okay(hnd, first, last) \
- (shadow_mode_external(current->domain) || \
+ (paging_mode_external(current->domain) || \
array_access_ok((hnd).p + (first), \
(last)-(first)+1, \
sizeof(*(hnd).p)))
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/hvm/trace.h
--- a/xen/include/asm-x86/hvm/trace.h Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-x86/hvm/trace.h Fri Sep 12 14:47:40 2008 +0900
@@ -56,16 +56,13 @@
#define TRC_PAR_LONG(par) (par)
#endif

-#define HVMTRACE_ND(evt, cycles, vcpu, count, d1, d2, d3, d4, d5, d6) \
+#define HVMTRACE_ND(evt, cycles, count, d1, d2, d3, d4, d5, d6) \
do { \
if ( unlikely(tb_init_done) && DO_TRC_HVM_ ## evt ) \
{ \
struct { \
- u32 did:16, vid:16; \
u32 d[6]; \
} _d; \
- _d.did=(vcpu)->domain->domain_id; \
- _d.vid=(vcpu)->vcpu_id; \
_d.d[0]=(d1); \
_d.d[1]=(d2); \
_d.d[2]=(d3); \
@@ -77,32 +74,32 @@
} \
} while(0)

-#define HVMTRACE_6D(evt, vcpu, d1, d2, d3, d4, d5, d6) \
- HVMTRACE_ND(evt, 0, vcpu, 6, d1, d2, d3, d4, d5, d6)
-#define HVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5) \
- HVMTRACE_ND(evt, 0, vcpu, 5, d1, d2, d3, d4, d5, 0)
-#define HVMTRACE_4D(evt, vcpu, d1, d2, d3, d4) \
- HVMTRACE_ND(evt, 0, vcpu, 4, d1, d2, d3, d4, 0, 0)
-#define HVMTRACE_3D(evt, vcpu, d1, d2, d3) \
- HVMTRACE_ND(evt, 0, vcpu, 3, d1, d2, d3, 0, 0, 0)
-#define HVMTRACE_2D(evt, vcpu, d1, d2) \
- HVMTRACE_ND(evt, 0, vcpu, 2, d1, d2, 0, 0, 0, 0)
-#define HVMTRACE_1D(evt, vcpu, d1) \
- HVMTRACE_ND(evt, 0, vcpu, 1, d1, 0, 0, 0, 0, 0)
-#define HVMTRACE_0D(evt, vcpu) \
- HVMTRACE_ND(evt, 0, vcpu, 0, 0, 0, 0, 0, 0, 0)
+#define HVMTRACE_6D(evt, d1, d2, d3, d4, d5, d6) \
+ HVMTRACE_ND(evt, 0, 6, d1, d2, d3, d4, d5, d6)
+#define HVMTRACE_5D(evt, d1, d2, d3, d4, d5) \
+ HVMTRACE_ND(evt, 0, 5, d1, d2, d3, d4, d5, 0)
+#define HVMTRACE_4D(evt, d1, d2, d3, d4) \
+ HVMTRACE_ND(evt, 0, 4, d1, d2, d3, d4, 0, 0)
+#define HVMTRACE_3D(evt, d1, d2, d3) \
+ HVMTRACE_ND(evt, 0, 3, d1, d2, d3, 0, 0, 0)
+#define HVMTRACE_2D(evt, d1, d2) \
+ HVMTRACE_ND(evt, 0, 2, d1, d2, 0, 0, 0, 0)
+#define HVMTRACE_1D(evt, d1) \
+ HVMTRACE_ND(evt, 0, 1, d1, 0, 0, 0, 0, 0)
+#define HVMTRACE_0D(evt) \
+ HVMTRACE_ND(evt, 0, 0, 0, 0, 0, 0, 0, 0)



#ifdef __x86_64__
-#define HVMTRACE_LONG_1D(evt, vcpu, d1) \
- HVMTRACE_2D(evt ## 64, vcpu, (d1) & 0xFFFFFFFF, (d1) >> 32)
-#define HVMTRACE_LONG_2D(evt,vcpu,d1,d2, ...) \
- HVMTRACE_3D(evt ## 64, vcpu, d1, d2)
-#define HVMTRACE_LONG_3D(evt, vcpu, d1, d2, d3, ...) \
- HVMTRACE_4D(evt ## 64, vcpu, d1, d2, d3)
-#define HVMTRACE_LONG_4D(evt, vcpu, d1, d2, d3, d4, ...) \
- HVMTRACE_5D(evt ## 64, vcpu, d1, d2, d3, d4)
+#define HVMTRACE_LONG_1D(evt, d1) \
+ HVMTRACE_2D(evt ## 64, (d1) & 0xFFFFFFFF, (d1) >> 32)
+#define HVMTRACE_LONG_2D(evt, d1, d2, ...) \
+ HVMTRACE_3D(evt ## 64, d1, d2)
+#define HVMTRACE_LONG_3D(evt, d1, d2, d3, ...) \
+ HVMTRACE_4D(evt ## 64, d1, d2, d3)
+#define HVMTRACE_LONG_4D(evt, d1, d2, d3, d4, ...) \
+ HVMTRACE_5D(evt ## 64, d1, d2, d3, d4)
#else
#define HVMTRACE_LONG_1D HVMTRACE_1D
#define HVMTRACE_LONG_2D HVMTRACE_2D
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/io_apic.h
--- a/xen/include/asm-x86/io_apic.h Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-x86/io_apic.h Fri Sep 12 14:47:40 2008 +0900
@@ -162,8 +162,6 @@ static inline void io_apic_modify(unsign
/* 1 if "noapic" boot option passed */
extern int skip_ioapic_setup;

-extern int msi_enable;
-
/*
* If we use the IO-APIC for IRQ routing, disable automatic
* assignment of PCI IRQ's.
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-x86/mm.h Fri Sep 12 14:47:40 2008 +0900
@@ -57,6 +57,17 @@ struct page_info
* (except page table pages when the guest is in shadow mode).
*/
u32 tlbflush_timestamp;
+
+ /*
+ * When PGT_partial is true then this field is valid and indicates
+ * that PTEs in the range [0, @nr_validated_ptes) have been validated.
+ * If @partial_pte is true then PTE at @nr_validated_ptes+1 has been
+ * partially validated.
+ */
+ struct {
+ u16 nr_validated_ptes;
+ bool_t partial_pte;
+ };

/*
* Guest pages with a shadow. This does not conflict with
@@ -86,9 +97,12 @@ struct page_info
/* PAE only: is this an L2 page directory containing Xen-private mappings? */
#define _PGT_pae_xen_l2 26
#define PGT_pae_xen_l2 (1U<<_PGT_pae_xen_l2)
-
- /* 26-bit count of uses of this frame as its current type. */
-#define PGT_count_mask ((1U<<26)-1)
+/* Has this page been *partially* validated for use as its current type? */
+#define _PGT_partial 25
+#define PGT_partial (1U<<_PGT_partial)
+
+ /* 25-bit count of uses of this frame as its current type. */
+#define PGT_count_mask ((1U<<25)-1)

/* Cleared when the owning guest 'frees' this page. */
#define _PGC_allocated 31
@@ -154,7 +168,8 @@ extern unsigned long total_pages;
extern unsigned long total_pages;
void init_frametable(void);

-void free_page_type(struct page_info *page, unsigned long type);
+int free_page_type(struct page_info *page, unsigned long type,
+ int preemptible);
int _shadow_mode_refcounts(struct domain *d);

void cleanup_page_cacheattr(struct page_info *page);
@@ -165,6 +180,8 @@ int get_page(struct page_info *page, st
int get_page(struct page_info *page, struct domain *domain);
void put_page_type(struct page_info *page);
int get_page_type(struct page_info *page, unsigned long type);
+int put_page_type_preemptible(struct page_info *page);
+int get_page_type_preemptible(struct page_info *page, unsigned long type);
int get_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d);

@@ -174,6 +191,19 @@ static inline void put_page_and_type(str
put_page(page);
}

+static inline int put_page_and_type_preemptible(struct page_info *page,
+ int preemptible)
+{
+ int rc = 0;
+
+ if ( preemptible )
+ rc = put_page_type_preemptible(page);
+ else
+ put_page_type(page);
+ if ( likely(rc == 0) )
+ put_page(page);
+ return rc;
+}

static inline int get_page_and_type(struct page_info *page,
struct domain *domain,
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/msr-index.h
--- a/xen/include/asm-x86/msr-index.h Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-x86/msr-index.h Fri Sep 12 14:47:40 2008 +0900
@@ -194,10 +194,22 @@
#define _K8_VMCR_SVME_DISABLE 4
#define K8_VMCR_SVME_DISABLE (1 << _K8_VMCR_SVME_DISABLE)

+/* AMD64 MSRs */
+#define MSR_AMD64_NB_CFG 0xc001001f
+#define AMD64_NB_CFG_CF8_EXT_ENABLE_BIT 46
+
/* AMD Family10h machine check MSRs */
#define MSR_F10_MC4_MISC1 0xc0000408
#define MSR_F10_MC4_MISC2 0xc0000409
#define MSR_F10_MC4_MISC3 0xc000040A
+
+/* Other AMD Fam10h MSRs */
+#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
+#define FAM10H_MMIO_CONF_ENABLE_BIT 0
+#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
+#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
+#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff
+#define FAM10H_MMIO_CONF_BASE_SHIFT 20

/* K6 MSRs */
#define MSR_K6_EFER 0xc0000080
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/shadow.h
--- a/xen/include/asm-x86/shadow.h Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-x86/shadow.h Fri Sep 12 14:47:40 2008 +0900
@@ -115,8 +115,6 @@ static inline void shadow_remove_all_sha
sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
}

-#define guest_physmap_max_mem_pages(d, n) (0)
-
#endif /* _XEN_SHADOW_H */

/*
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/public/trace.h
--- a/xen/include/public/trace.h Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/public/trace.h Fri Sep 12 14:47:40 2008 +0900
@@ -37,6 +37,7 @@
#define TRC_HVM 0x0008f000 /* Xen HVM trace */
#define TRC_MEM 0x0010f000 /* Xen memory trace */
#define TRC_PV 0x0020f000 /* Xen PV traces */
+#define TRC_SHADOW 0x0040f000 /* Xen shadow tracing */
#define TRC_ALL 0x0ffff000
#define TRC_HD_TO_EVENT(x) ((x)&0x0fffffff)
#define TRC_HD_CYCLE_FLAG (1UL<<31)
@@ -50,26 +51,30 @@
#define TRC_HVM_ENTRYEXIT 0x00081000 /* VMENTRY and #VMEXIT */
#define TRC_HVM_HANDLER 0x00082000 /* various HVM handlers */

+#define TRC_SCHED_MIN 0x00021000 /* Just runstate changes */
+#define TRC_SCHED_VERBOSE 0x00028000 /* More inclusive scheduling */
+
/* Trace events per class */
#define TRC_LOST_RECORDS (TRC_GEN + 1)
#define TRC_TRACE_WRAP_BUFFER (TRC_GEN + 2)
#define TRC_TRACE_CPU_CHANGE (TRC_GEN + 3)

-#define TRC_SCHED_DOM_ADD (TRC_SCHED + 1)
-#define TRC_SCHED_DOM_REM (TRC_SCHED + 2)
-#define TRC_SCHED_SLEEP (TRC_SCHED + 3)
-#define TRC_SCHED_WAKE (TRC_SCHED + 4)
-#define TRC_SCHED_YIELD (TRC_SCHED + 5)
-#define TRC_SCHED_BLOCK (TRC_SCHED + 6)
-#define TRC_SCHED_SHUTDOWN (TRC_SCHED + 7)
-#define TRC_SCHED_CTL (TRC_SCHED + 8)
-#define TRC_SCHED_ADJDOM (TRC_SCHED + 9)
-#define TRC_SCHED_SWITCH (TRC_SCHED + 10)
-#define TRC_SCHED_S_TIMER_FN (TRC_SCHED + 11)
-#define TRC_SCHED_T_TIMER_FN (TRC_SCHED + 12)
-#define TRC_SCHED_DOM_TIMER_FN (TRC_SCHED + 13)
-#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED + 14)
-#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED + 15)
+#define TRC_SCHED_RUNSTATE_CHANGE (TRC_SCHED_MIN + 1)
+#define TRC_SCHED_DOM_ADD (TRC_SCHED_VERBOSE + 1)
+#define TRC_SCHED_DOM_REM (TRC_SCHED_VERBOSE + 2)
+#define TRC_SCHED_SLEEP (TRC_SCHED_VERBOSE + 3)
+#define TRC_SCHED_WAKE (TRC_SCHED_VERBOSE + 4)
+#define TRC_SCHED_YIELD (TRC_SCHED_VERBOSE + 5)
+#define TRC_SCHED_BLOCK (TRC_SCHED_VERBOSE + 6)
+#define TRC_SCHED_SHUTDOWN (TRC_SCHED_VERBOSE + 7)
+#define TRC_SCHED_CTL (TRC_SCHED_VERBOSE + 8)
+#define TRC_SCHED_ADJDOM (TRC_SCHED_VERBOSE + 9)
+#define TRC_SCHED_SWITCH (TRC_SCHED_VERBOSE + 10)
+#define TRC_SCHED_S_TIMER_FN (TRC_SCHED_VERBOSE + 11)
+#define TRC_SCHED_T_TIMER_FN (TRC_SCHED_VERBOSE + 12)
+#define TRC_SCHED_DOM_TIMER_FN (TRC_SCHED_VERBOSE + 13)
+#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED_VERBOSE + 14)
+#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED_VERBOSE + 15)

#define TRC_MEM_PAGE_GRANT_MAP (TRC_MEM + 1)
#define TRC_MEM_PAGE_GRANT_UNMAP (TRC_MEM + 2)
@@ -88,6 +93,22 @@
#define TRC_PV_PTWR_EMULATION_PAE (TRC_PV + 12)
/* Indicates that addresses in trace record are 64 bits */
#define TRC_64_FLAG (0x100)
+
+#define TRC_SHADOW_NOT_SHADOW (TRC_SHADOW + 1)
+#define TRC_SHADOW_FAST_PROPAGATE (TRC_SHADOW + 2)
+#define TRC_SHADOW_FAST_MMIO (TRC_SHADOW + 3)
+#define TRC_SHADOW_FALSE_FAST_PATH (TRC_SHADOW + 4)
+#define TRC_SHADOW_MMIO (TRC_SHADOW + 5)
+#define TRC_SHADOW_FIXUP (TRC_SHADOW + 6)
+#define TRC_SHADOW_DOMF_DYING (TRC_SHADOW + 7)
+#define TRC_SHADOW_EMULATE (TRC_SHADOW + 8)
+#define TRC_SHADOW_EMULATE_UNSHADOW_USER (TRC_SHADOW + 9)
+#define TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ (TRC_SHADOW + 10)
+#define TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED (TRC_SHADOW + 11)
+#define TRC_SHADOW_WRMAP_BF (TRC_SHADOW + 12)
+#define TRC_SHADOW_PREALLOC_UNPIN (TRC_SHADOW + 13)
+#define TRC_SHADOW_RESYNC_FULL (TRC_SHADOW + 14)
+#define TRC_SHADOW_RESYNC_ONLY (TRC_SHADOW + 15)

/* trace events per subclass */
#define TRC_HVM_VMENTRY (TRC_HVM_ENTRYEXIT + 0x01)
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/xen/cpuidle.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/xen/cpuidle.h Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,82 @@
+/*
+ * cpuidle.h - xen idle state module derived from Linux
+ *
+ * (C) 2007 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * Shaohua Li <shaohua.li@intel.com>
+ * Adam Belay <abelay@novell.com>
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#ifndef _XEN_CPUIDLE_H
+#define _XEN_CPUIDLE_H
+
+#define ACPI_PROCESSOR_MAX_POWER 8
+#define CPUIDLE_NAME_LEN 16
+
+struct acpi_processor_cx
+{
+ u8 valid;
+ u8 type;
+ u32 address;
+ u8 space_id;
+ u32 latency;
+ u32 latency_ticks;
+ u32 power;
+ u32 usage;
+ u64 time;
+ u32 target_residency;
+};
+
+struct acpi_processor_flags
+{
+ u8 bm_control:1;
+ u8 bm_check:1;
+ u8 has_cst:1;
+ u8 power_setup_done:1;
+ u8 bm_rld_set:1;
+};
+
+struct acpi_processor_power
+{
+ unsigned int cpu;
+ struct acpi_processor_flags flags;
+ struct acpi_processor_cx *last_state;
+ struct acpi_processor_cx *safe_state;
+ u32 last_residency;
+ void *gdata; /* governor specific data */
+ u32 count;
+ struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
+};
+
+struct cpuidle_governor
+{
+ char name[CPUIDLE_NAME_LEN];
+ unsigned int rating;
+
+ int (*enable) (struct acpi_processor_power *dev);
+ void (*disable) (struct acpi_processor_power *dev);
+
+ int (*select) (struct acpi_processor_power *dev);
+ void (*reflect) (struct acpi_processor_power *dev);
+};
+
+extern struct cpuidle_governor *cpuidle_current_governor;
+
+#endif /* _XEN_CPUIDLE_H */
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/xen/iommu.h
--- a/xen/include/xen/iommu.h Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/xen/iommu.h Fri Sep 12 14:47:40 2008 +0900
@@ -31,6 +31,7 @@ extern int iommu_enabled;
extern int iommu_enabled;
extern int iommu_pv_enabled;
extern int force_iommu;
+extern int iommu_passthrough;

#define domain_hvm_iommu(d) (&d->arch.hvm_domain.hvm_iommu)

diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/xen/sched.h Fri Sep 12 14:47:40 2008 +0900
@@ -106,8 +106,6 @@ struct vcpu
bool_t fpu_initialised;
/* Has the FPU been used since it was last saved? */
bool_t fpu_dirtied;
- /* Is this VCPU polling any event channels (SCHEDOP_poll)? */
- bool_t is_polling;
/* Initialization completed for this VCPU? */
bool_t is_initialised;
/* Currently running on a CPU? */
@@ -133,6 +131,13 @@ struct vcpu
bool_t paused_for_shutdown;
/* VCPU affinity is temporarily locked from controller changes? */
bool_t affinity_locked;
+
+ /*
+ * > 0: a single port is being polled;
+ * = 0: nothing is being polled (vcpu should be clear in d->poll_mask);
+ * < 0: multiple ports may be being polled.
+ */
+ int poll_evtchn;

unsigned long pause_flags;
atomic_t pause_count;
@@ -209,14 +214,15 @@ struct domain
struct domain *target;
/* Is this guest being debugged by dom0? */
bool_t debugger_attached;
- /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
- bool_t is_polling;
/* Is this guest dying (i.e., a zombie)? */
enum { DOMDYING_alive, DOMDYING_dying, DOMDYING_dead } is_dying;
/* Domain is paused by controller software? */
bool_t is_paused_by_controller;
/* Domain's VCPUs are pinned 1:1 to physical CPUs? */
bool_t is_pinned;
+
+ /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
+ DECLARE_BITMAP(poll_mask, MAX_VIRT_CPUS);

/* Guest has shut down (inc. reason code)? */
spinlock_t shutdown_lock;
@@ -507,6 +513,7 @@ static inline int vcpu_runnable(struct v
atomic_read(&v->domain->pause_count));
}

+void vcpu_unblock(struct vcpu *v);
void vcpu_pause(struct vcpu *v);
void vcpu_pause_nosync(struct vcpu *v);
void domain_pause(struct domain *d);
@@ -517,17 +524,12 @@ void cpu_init(void);
void cpu_init(void);

void vcpu_force_reschedule(struct vcpu *v);
+void cpu_disable_scheduler(void);
int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity);
void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity);

void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
-
-static inline void vcpu_unblock(struct vcpu *v)
-{
- if ( test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
- vcpu_wake(v);
-}

#define IS_PRIV(_d) ((_d)->is_privileged)
#define IS_PRIV_FOR(_d, _t) (IS_PRIV(_d) || ((_d)->target && (_d)->target == (_t)))
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/xen/trace.h
--- a/xen/include/xen/trace.h Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/xen/trace.h Fri Sep 12 14:47:40 2008 +0900
@@ -33,6 +33,8 @@ void init_trace_bufs(void);

/* used to retrieve the physical address of the trace buffers */
int tb_control(struct xen_sysctl_tbuf_op *tbc);
+
+int trace_will_trace_event(u32 event);

void __trace_var(u32 event, int cycles, int extra, unsigned char *extra_data);

diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/xsm/xsm.h
--- a/xen/include/xsm/xsm.h Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/xsm/xsm.h Fri Sep 12 14:47:40 2008 +0900
@@ -64,16 +64,17 @@ struct xsm_operations {
int (*getvcpucontext) (struct domain *d);
int (*getvcpuinfo) (struct domain *d);
int (*domain_settime) (struct domain *d);
+ int (*set_target) (struct domain *d, struct domain *e);
int (*tbufcontrol) (void);
int (*readconsole) (uint32_t clear);
int (*sched_id) (void);
int (*setdomainmaxmem) (struct domain *d);
int (*setdomainhandle) (struct domain *d);
int (*setdebugging) (struct domain *d);
- int (*irq_permission) (struct domain *d, uint8_t pirq, uint8_t access);
- int (*iomem_permission) (struct domain *d, unsigned long mfn,
- uint8_t access);
int (*perfcontrol) (void);
+ int (*debug_keys) (void);
+ int (*getcpuinfo) (void);
+ int (*availheap) (void);

int (*evtchn_unbound) (struct domain *d, struct evtchn *chn, domid_t id2);
int (*evtchn_interdomain) (struct domain *d1, struct evtchn *chn1,
@@ -106,13 +107,13 @@ struct xsm_operations {

int (*kexec) (void);
int (*schedop_shutdown) (struct domain *d1, struct domain *d2);
+ int (*add_range) (struct domain *d, char *name, unsigned long s, unsigned long e);
+ int (*remove_range) (struct domain *d, char *name, unsigned long s, unsigned long e);

long (*__do_xsm_op) (XEN_GUEST_HANDLE(xsm_op_t) op);

#ifdef CONFIG_X86
int (*shadow_control) (struct domain *d, uint32_t op);
- int (*ioport_permission) (struct domain *d, uint32_t ioport,
- uint8_t access);
int (*getpageframeinfo) (struct page_info *page);
int (*getmemlist) (struct domain *d);
int (*hypercall_init) (struct domain *d);
@@ -130,13 +131,26 @@ struct xsm_operations {
int (*microcode) (void);
int (*physinfo) (void);
int (*platform_quirk) (uint32_t);
+ int (*firmware_info) (void);
+ int (*acpi_sleep) (void);
+ int (*change_freq) (void);
+ int (*getidletime) (void);
int (*machine_memory_map) (void);
int (*domain_memory_map) (struct domain *d);
- int (*mmu_normal_update) (struct domain *d, intpte_t fpte);
+ int (*mmu_normal_update) (struct domain *d, struct domain *f,
+ intpte_t fpte);
int (*mmu_machphys_update) (struct domain *d, unsigned long mfn);
- int (*update_va_mapping) (struct domain *d, l1_pgentry_t pte);
+ int (*update_va_mapping) (struct domain *d, struct domain *f,
+ l1_pgentry_t pte);
int (*add_to_physmap) (struct domain *d1, struct domain *d2);
int (*remove_from_physmap) (struct domain *d1, struct domain *d2);
+ int (*sendtrigger) (struct domain *d);
+ int (*test_assign_device) (uint32_t machine_bdf);
+ int (*assign_device) (struct domain *d, uint32_t machine_bdf);
+ int (*deassign_device) (struct domain *d, uint32_t machine_bdf);
+ int (*bind_pt_irq) (struct domain *d, struct xen_domctl_bind_pt_irq *bind);
+ int (*pin_mem_cacheattr) (struct domain *d);
+ int (*ext_vcpucontext) (struct domain *d, uint32_t cmd);
#endif
};

@@ -215,6 +229,11 @@ static inline int xsm_domain_settime (st
return xsm_call(domain_settime(d));
}

+static inline int xsm_set_target (struct domain *d, struct domain *e)
+{
+ return xsm_call(set_target(d, e));
+}
+
static inline int xsm_tbufcontrol (void)
{
return xsm_call(tbufcontrol());
@@ -245,21 +264,24 @@ static inline int xsm_setdebugging (stru
return xsm_call(setdebugging(d));
}

-static inline int xsm_irq_permission (struct domain *d, uint8_t pirq,
- uint8_t access)
-{
- return xsm_call(irq_permission(d, pirq, access));
-}
-
-static inline int xsm_iomem_permission (struct domain *d, unsigned long mfn,
- uint8_t access)
-{
- return xsm_call(iomem_permission(d, mfn, access));
-}
-
static inline int xsm_perfcontrol (void)
{
return xsm_call(perfcontrol());
+}
+
+static inline int xsm_debug_keys (void)
+{
+ return xsm_call(debug_keys());
+}
+
+static inline int xsm_availheap (void)
+{
+ return xsm_call(availheap());
+}
+
+static inline int xsm_getcpuinfo (void)
+{
+ return xsm_call(getcpuinfo());
}

static inline int xsm_evtchn_unbound (struct domain *d1, struct evtchn *chn,
@@ -385,6 +407,18 @@ static inline int xsm_schedop_shutdown (
static inline int xsm_schedop_shutdown (struct domain *d1, struct domain *d2)
{
return xsm_call(schedop_shutdown(d1, d2));
+}
+
+static inline int xsm_add_range (struct domain *d, char *name, unsigned long s,
+ unsigned long e)
+{
+ return xsm_call(add_range(d, name, s, e));
+}
+
+static inline int xsm_remove_range (struct domain *d, char *name, unsigned long s,
+ unsigned long e)
+{
+ return xsm_call(remove_range(d, name, s, e));
}

static inline long __do_xsm_op (XEN_GUEST_HANDLE(xsm_op_t) op)
@@ -413,12 +447,6 @@ static inline int xsm_shadow_control (st
return xsm_call(shadow_control(d, op));
}

-static inline int xsm_ioport_permission (struct domain *d, uint32_t ioport,
- uint8_t access)
-{
- return xsm_call(ioport_permission(d, ioport, access));
-}
-
static inline int xsm_getpageframeinfo (struct page_info *page)
{
return xsm_call(getpageframeinfo(page));
@@ -504,6 +532,26 @@ static inline int xsm_platform_quirk (ui
return xsm_call(platform_quirk(quirk));
}

+static inline int xsm_firmware_info (void)
+{
+ return xsm_call(firmware_info());
+}
+
+static inline int xsm_acpi_sleep (void)
+{
+ return xsm_call(acpi_sleep());
+}
+
+static inline int xsm_change_freq (void)
+{
+ return xsm_call(change_freq());
+}
+
+static inline int xsm_getidletime (void)
+{
+ return xsm_call(getidletime());
+}
+
static inline int xsm_machine_memory_map(void)
{
return xsm_call(machine_memory_map());
@@ -514,9 +562,10 @@ static inline int xsm_domain_memory_map(
return xsm_call(domain_memory_map(d));
}

-static inline int xsm_mmu_normal_update (struct domain *d, intpte_t fpte)
-{
- return xsm_call(mmu_normal_update(d, fpte));
+static inline int xsm_mmu_normal_update (struct domain *d, struct domain *f,
+ intpte_t fpte)
+{
+ return xsm_call(mmu_normal_update(d, f, fpte));
}

static inline int xsm_mmu_machphys_update (struct domain *d, unsigned long mfn)
@@ -524,9 +573,10 @@ static inline int xsm_mmu_machphys_updat
return xsm_call(mmu_machphys_update(d, mfn));
}

-static inline int xsm_update_va_mapping(struct domain *d, l1_pgentry_t pte)
-{
- return xsm_call(update_va_mapping(d, pte));
+static inline int xsm_update_va_mapping(struct domain *d, struct domain *f,
+ l1_pgentry_t pte)
+{
+ return xsm_call(update_va_mapping(d, f, pte));
}

static inline int xsm_add_to_physmap(struct domain *d1, struct domain *d2)
@@ -538,6 +588,42 @@ static inline int xsm_remove_from_physma
{
return xsm_call(remove_from_physmap(d1, d2));
}
+
+static inline int xsm_sendtrigger(struct domain *d)
+{
+ return xsm_call(sendtrigger(d));
+}
+
+static inline int xsm_test_assign_device(uint32_t machine_bdf)
+{
+ return xsm_call(test_assign_device(machine_bdf));
+}
+
+static inline int xsm_assign_device(struct domain *d, uint32_t machine_bdf)
+{
+ return xsm_call(assign_device(d, machine_bdf));
+}
+
+static inline int xsm_deassign_device(struct domain *d, uint32_t machine_bdf)
+{
+ return xsm_call(deassign_device(d, machine_bdf));
+}
+
+static inline int xsm_bind_pt_irq(struct domain *d,
+ struct xen_domctl_bind_pt_irq *bind)
+{
+ return xsm_call(bind_pt_irq(d, bind));
+}
+
+static inline int xsm_pin_mem_cacheattr(struct domain *d)
+{
+ return xsm_call(pin_mem_cacheattr(d));
+}
+
+static inline int xsm_ext_vcpucontext(struct domain *d, uint32_t cmd)
+{
+ return xsm_call(ext_vcpucontext(d, cmd));
+}
#endif /* CONFIG_X86 */

#endif /* __XSM_H */
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/xsm/dummy.c
--- a/xen/xsm/dummy.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/xsm/dummy.c Fri Sep 12 14:47:40 2008 +0900
@@ -84,6 +84,11 @@ static int dummy_domain_settime (struct
return 0;
}

+static int dummy_set_target (struct domain *d, struct domain *e)
+{
+ return 0;
+}
+
static int dummy_tbufcontrol (void)
{
return 0;
@@ -114,18 +119,22 @@ static int dummy_setdebugging (struct do
return 0;
}

-static int dummy_irq_permission (struct domain *d, uint8_t pirq, uint8_t access)
-{
- return 0;
-}
-
-static int dummy_iomem_permission (struct domain *d, unsigned long mfn,
- uint8_t access)
-{
- return 0;
-}
-
static int dummy_perfcontrol (void)
+{
+ return 0;
+}
+
+static int dummy_debug_keys (void)
+{
+ return 0;
+}
+
+static int dummy_getcpuinfo (void)
+{
+ return 0;
+}
+
+static int dummy_availheap (void)
{
return 0;
}
@@ -259,18 +268,23 @@ static long dummy___do_xsm_op(XEN_GUEST_
return -ENOSYS;
}

+static int dummy_add_range (struct domain *d, char *name, unsigned long s, unsigned long e)
+{
+ return 0;
+}
+
+static int dummy_remove_range (struct domain *d, char *name, unsigned long s,
+ unsigned long e)
+{
+ return 0;
+}
+
#ifdef CONFIG_X86
static int dummy_shadow_control (struct domain *d, uint32_t op)
{
return 0;
}

-static int dummy_ioport_permission (struct domain *d, uint32_t ioport,
- uint8_t access)
-{
- return 0;
-}
-
static int dummy_getpageframeinfo (struct page_info *page)
{
return 0;
@@ -356,6 +370,26 @@ static int dummy_platform_quirk (uint32_
return 0;
}

+static int dummy_firmware_info (void)
+{
+ return 0;
+}
+
+static int dummy_acpi_sleep (void)
+{
+ return 0;
+}
+
+static int dummy_change_freq (void)
+{
+ return 0;
+}
+
+static int dummy_getidletime (void)
+{
+ return 0;
+}
+
static int dummy_machine_memory_map (void)
{
return 0;
@@ -366,7 +400,8 @@ static int dummy_domain_memory_map (stru
return 0;
}

-static int dummy_mmu_normal_update (struct domain *d, intpte_t fpte)
+static int dummy_mmu_normal_update (struct domain *d, struct domain *f,
+ intpte_t fpte)
{
return 0;
}
@@ -376,12 +411,48 @@ static int dummy_mmu_machphys_update (st
return 0;
}


_______________________________________________
Xen-changelog mailing list
Xen-changelog@lists.xensource.com
http://lists.xensource.com/xen-changelog
[xen-unstable] merge with xen-unstable.hg [ In reply to ]
# HG changeset patch
# User Isaku Yamahata <yamahata@valinux.co.jp>
# Date 1220255983 -32400
# Node ID d0a544d8a3f3194dda7c928572ac191358cc2204
# Parent 48db4eee7d588ea340535ae3ef60862686207210
# Parent 05c7efee10a1d253b9b7f8b52464627aff441931
merge with xen-unstable.hg
---
docs/misc/vtpm.txt | 4
docs/src/user.tex | 71 +++-----
docs/xen-api/xenapi-datamodel-graph.dot | 2
extras/mini-os/include/posix/dirent.h | 2
extras/mini-os/include/posix/err.h | 15 +
extras/mini-os/include/posix/pthread.h | 7
extras/mini-os/include/posix/syslog.h | 37 ++++
extras/mini-os/include/xenbus.h | 7
extras/mini-os/lib/sys.c | 153 +++++++++++++++---
extras/mini-os/main.c | 35 +++-
extras/mini-os/xenbus/xenbus.c | 6
stubdom/Makefile | 4
tools/Makefile | 3
tools/Rules.mk | 3
tools/cross-install | 2
tools/examples/xmexample.pv-grub | 28 ---
tools/firmware/hvmloader/32bitbios_support.c | 2
tools/firmware/hvmloader/acpi/Makefile | 6
tools/firmware/hvmloader/acpi/build.c | 4
tools/firmware/hvmloader/hvmloader.c | 25 ++
tools/firmware/hvmloader/util.c | 21 +-
tools/firmware/hvmloader/util.h | 2
tools/ioemu/hw/cirrus_vga.c | 218 +++++++++-----------------
tools/ioemu/hw/ide.c | 28 +--
tools/ioemu/hw/pass-through.c | 5
tools/ioemu/hw/vga.c | 150 +++++++++++++----
tools/ioemu/hw/vga_int.h | 4
tools/ioemu/hw/xen_platform.c | 38 ++++
tools/ioemu/vl.c | 32 ---
tools/ioemu/vl.h | 3
tools/libxc/xc_dom_boot.c | 9 -
tools/libxc/xc_domain.c | 27 +++
tools/libxc/xc_domain_save.c | 6
tools/libxc/xc_minios.c | 10 -
tools/libxc/xenctrl.h | 6
tools/pygrub/src/pygrub | 32 ++-
tools/python/xen/util/pci.py | 37 ++++
tools/python/xen/xend/XendConfig.py | 6
tools/python/xen/xend/XendDomain.py | 4
tools/python/xen/xend/XendDomainInfo.py | 60 ++++---
tools/python/xen/xend/image.py | 3
tools/python/xen/xend/server/DevController.py | 8
tools/python/xen/xend/server/pciif.py | 25 --
tools/xenmon/Makefile | 4
xen/Makefile | 6
xen/arch/ia64/xen/mm.c | 70 ++++++++
xen/arch/x86/acpi/power.c | 5
xen/arch/x86/cpu/amd.c | 138 ++++++++++++++++
xen/arch/x86/cpu/amd.h | 103 ++++++++++++
xen/arch/x86/hvm/emulate.c | 58 +++++-
xen/arch/x86/irq.c | 2
xen/arch/x86/microcode.c | 4
xen/arch/x86/mm.c | 58 ++++++
xen/arch/x86/platform_hypercall.c | 3
xen/arch/x86/time.c | 20 --
xen/arch/x86/x86_64/compat/mm.c | 14 +
xen/common/softirq.c | 1
xen/common/timer.c | 177 ++++++++++++++++-----
xen/common/xmalloc.c | 17 --
xen/drivers/passthrough/vtd/intremap.c | 89 +++++++---
xen/drivers/passthrough/vtd/iommu.c | 70 +++-----
xen/include/asm-x86/io_apic.h | 4
xen/include/asm-x86/msr-index.h | 3
xen/include/asm-x86/processor.h | 2
xen/include/asm-x86/softirq.h | 5
xen/include/public/memory.h | 17 ++
xen/include/public/platform.h | 2
xen/include/xen/compat.h | 4
xen/include/xen/iommu.h | 4
xen/include/xen/timer.h | 33 ++-
xen/include/xlat.lst | 1
xen/include/xsm/xsm.h | 6
xen/xsm/dummy.c | 6
xen/xsm/flask/hooks.c | 6
74 files changed, 1466 insertions(+), 616 deletions(-)

diff -r 48db4eee7d58 -r d0a544d8a3f3 docs/misc/vtpm.txt
--- a/docs/misc/vtpm.txt Mon Aug 25 19:04:37 2008 +0900
+++ b/docs/misc/vtpm.txt Mon Sep 01 16:59:43 2008 +0900
@@ -92,8 +92,8 @@ can be different. This is the case if fo
can be different. This is the case if for example that particular
instance is already used by another virtual machine. The association
of which TPM instance number is used by which virtual machine is
-kept in the file /etc/xen/vtpm.db. Associations are maintained by
-domain name and instance number.
+kept in the file /var/vtpm/vtpm.db. Associations are maintained by
+a xend-internal vTPM UUID and vTPM instance number.

Note: If you do not want TPM functionality for your user domain simply
leave out the 'vtpm' line in the configuration file.
diff -r 48db4eee7d58 -r d0a544d8a3f3 docs/src/user.tex
--- a/docs/src/user.tex Mon Aug 25 19:04:37 2008 +0900
+++ b/docs/src/user.tex Mon Sep 01 16:59:43 2008 +0900
@@ -22,7 +22,7 @@
\vfill
\begin{tabular}{l}
{\Huge \bf Users' Manual} \\[4mm]
-{\huge Xen v3.0} \\[80mm]
+{\huge Xen v3.3} \\[80mm]
\end{tabular}
\end{center}

@@ -42,9 +42,7 @@ welcome.}

\vspace*{\fill}

-Xen is Copyright \copyright 2002-2005, University of Cambridge, UK, XenSource
-Inc., IBM Corp., Hewlett-Packard Co., Intel Corp., AMD Inc., and others. All
-rights reserved.
+Xen is Copyright \copyright 2002-2008, Citrix Systems, Inc., University of Cambridge, UK, XenSource Inc., IBM Corp., Hewlett-Packard Co., Intel Corp., AMD Inc., and others. All rights reserved.

Xen is an open-source project. Most portions of Xen are licensed for copying
under the terms of the GNU General Public License, version 2. Other portions
@@ -116,16 +114,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE


Xen is an open-source \emph{para-virtualizing} virtual machine monitor
-(VMM), or ``hypervisor'', for the x86 processor architecture. Xen can
-securely execute multiple virtual machines on a single physical system
-with close-to-native performance. Xen facilitates enterprise-grade
-functionality, including:
+(VMM), or ``hypervisor'', for a variety of processor architectures including x86. Xen can securely execute multiple virtual machines on a single physical system with near native performance. Xen facilitates enterprise-grade functionality, including:

\begin{itemize}
\item Virtual machines with performance close to native hardware.
\item Live migration of running virtual machines between physical hosts.
\item Up to 32\footnote{IA64 supports up to 64 virtual CPUs per guest virtual machine} virtual CPUs per guest virtual machine, with VCPU hotplug.
-\item x86/32, x86/32 with PAE, x86/64, IA64 and Power platform support.
+\item x86/32 with PAE, x86/64, and IA64 platform support.
\item Intel and AMD Virtualization Technology for unmodified guest operating systems (including Microsoft Windows).
\item Excellent hardware support (supports almost all Linux device
drivers).
@@ -182,22 +177,20 @@ unmodified guests running natively on th

Paravirtualized Xen support is available for increasingly many
operating systems: currently, mature Linux support is available and
-included in the standard distribution. Other OS ports---including
-NetBSD, FreeBSD and Solaris x86 v10---are nearing completion.
+included in the standard distribution. Other OS ports, including
+NetBSD, FreeBSD and Solaris are also complete.


\section{Hardware Support}

-Xen currently runs on the x86 architecture, requiring a ``P6'' or
-newer processor (e.g.\ Pentium Pro, Celeron, Pentium~II, Pentium~III,
-Pentium~IV, Xeon, AMD~Athlon, AMD~Duron). Multiprocessor machines are
-supported, and there is support for HyperThreading (SMT). In
-addition, ports to IA64 and Power architectures are supported.
-
-The default 32-bit Xen supports for Intel's Physical Addressing Extensions (PAE), which enable x86/32 machines to address up to 64 GB of physical memory.
-It also supports non-PAE 32-bit Xen up to 4GB of memory.
-Xen also supports x86/64 platforms such as Intel EM64T and AMD Opteron
-which can currently address up to 1TB of physical memory.
+Xen currently runs on the IA64 and x86 architectures. Multiprocessor
+machines are supported, and there is support for HyperThreading (SMT).
+
+The default 32-bit Xen requires processor support for Physical
+Addressing Extensions (PAE), which enables the hypervisor to address
+up to 16GB of physical memory. Xen also supports x86/64 platforms
+such as Intel EM64T and AMD Opteron which can currently address up to
+1TB of physical memory.

Xen offloads most of the hardware support issues to the guest OS
running in the \emph{Domain~0} management virtual machine. Xen itself
@@ -253,8 +246,8 @@ Xen has grown into a fully-fledged proje
Xen has grown into a fully-fledged project in its own right, enabling us
to investigate interesting research issues regarding the best techniques
for virtualizing resources such as the CPU, memory, disk and network.
-Project contributors now include XenSource, Intel, IBM, HP, AMD, Novell,
-RedHat.
+Project contributors now include Citrix, Intel, IBM, HP, AMD, Novell,
+RedHat, Sun, Fujitsu, and Samsung.

Xen was first described in a paper presented at SOSP in
2003\footnote{\tt
@@ -265,25 +258,20 @@ sites.

\section{What's New}

-Xen 3.0.0 offers:
+Xen 3.3.0 offers:

\begin{itemize}
-\item Support for up to 32-way SMP guest operating systems
-\item Intel (Physical Addressing Extensions) PAE to support 32-bit
- servers with more than 4GB physical memory
-\item x86/64 support (Intel EM64T, AMD Opteron)
-\item Intel VT-x support to enable the running of unmodified guest
-operating systems (Windows XP/2003, Legacy Linux)
-\item Enhanced control tools
-\item Improved ACPI support
-\item AGP/DRM graphics
+\item IO Emulation (stub domains) for HVM IO performance and scalability
+\item Replacement of Intel VT vmxassist by new 16b emulation code
+\item Improved VT-d device pass-through e.g. for graphics devices
+\item Enhanced C and P state power management
+\item Exploitation of multi-queue support on modern NICs
+\item Removal of domain lock for improved PV guest scalability
+\item 2MB page support for HVM and PV guests
+\item CPU Portability
\end{itemize}

-
-Xen 3.0 features greatly enhanced hardware support, configuration
-flexibility, usability and a larger complement of supported operating
-systems. This latest release takes Xen a step closer to being the
-definitive open source solution for virtualization.
+Xen 3.3 delivers the capabilities needed by enterprise customers and gives computing industry leaders a solid, secure platform to build upon for their virtualization solutions. This latest release establishes Xen as the definitive open source solution for virtualization.



@@ -295,7 +283,7 @@ The Xen distribution includes three main
The Xen distribution includes three main components: Xen itself, ports
of Linux and NetBSD to run on Xen, and the userspace tools required to
manage a Xen-based system. This chapter describes how to install the
-Xen~3.0 distribution from source. Alternatively, there may be pre-built
+Xen~3.3 distribution from source. Alternatively, there may be pre-built
packages available as part of your operating system distribution.


@@ -4029,9 +4017,8 @@ files: \path{Config.mk} and \path{Makefi

The former allows the overall build target architecture to be
specified. You will typically not need to modify this unless
-you are cross-compiling or if you wish to build a non-PAE
-Xen system. Additional configuration options are documented
-in the \path{Config.mk} file.
+you are cross-compiling. Additional configuration options are
+documented in the \path{Config.mk} file.

The top-level \path{Makefile} is chiefly used to customize the set of
kernels built. Look for the line:
diff -r 48db4eee7d58 -r d0a544d8a3f3 docs/xen-api/xenapi-datamodel-graph.dot
--- a/docs/xen-api/xenapi-datamodel-graph.dot Mon Aug 25 19:04:37 2008 +0900
+++ b/docs/xen-api/xenapi-datamodel-graph.dot Mon Sep 01 16:59:43 2008 +0900
@@ -14,7 +14,7 @@ fontname="Verdana";

node [ shape=box ]; session VM host network VIF PIF SR VDI VBD PBD user XSPolicy ACMPolicy;
node [shape=ellipse]; PIF_metrics VIF_metrics VM_metrics VBD_metrics PBD_metrics VM_guest_metrics host_metrics;
-node [shape=box]; DPCI PPCI host_cpu console
+node [shape=box]; DPCI PPCI host_cpu console VTPM
session -> host [ arrowhead="none" ]
session -> user [ arrowhead="none" ]
VM -> VM_metrics [ arrowhead="none" ]
diff -r 48db4eee7d58 -r d0a544d8a3f3 extras/mini-os/include/posix/dirent.h
--- a/extras/mini-os/include/posix/dirent.h Mon Aug 25 19:04:37 2008 +0900
+++ b/extras/mini-os/include/posix/dirent.h Mon Sep 01 16:59:43 2008 +0900
@@ -1,7 +1,7 @@
#ifndef _POSIX_DIRENT_H
#define _POSIX_DIRENT_H

-#include <sys/types.h>
+#include <stdint.h>

struct dirent {
char *d_name;
diff -r 48db4eee7d58 -r d0a544d8a3f3 extras/mini-os/include/posix/err.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extras/mini-os/include/posix/err.h Mon Sep 01 16:59:43 2008 +0900
@@ -0,0 +1,15 @@
+#ifndef _POSIX_ERR_H
+#define _POSIX_ERR_H
+
+#include <stdarg.h>
+
+void err(int eval, const char *fmt, ...);
+void errx(int eval, const char *fmt, ...);
+void warn(const char *fmt, ...);
+void warnx(const char *fmt, ...);
+void verr(int eval, const char *fmt, va_list args);
+void verrx(int eval, const char *fmt, va_list args);
+void vwarn(const char *fmt, va_list args);
+void vwarnx(const char *fmt, va_list args);
+
+#endif /* _POSIX_ERR_H */
diff -r 48db4eee7d58 -r d0a544d8a3f3 extras/mini-os/include/posix/pthread.h
--- a/extras/mini-os/include/posix/pthread.h Mon Aug 25 19:04:37 2008 +0900
+++ b/extras/mini-os/include/posix/pthread.h Mon Sep 01 16:59:43 2008 +0900
@@ -31,8 +31,15 @@ static inline int pthread_key_delete(pth



+typedef struct {} pthread_mutexattr_t;
+static inline int pthread_mutexattr_init(pthread_mutexattr_t *mattr) { return 0; }
+#define PTHREAD_MUTEX_NORMAL 0
+#define PTHREAD_MUTEX_RECURSIVE 1
+static inline int pthread_mutexattr_settype(pthread_mutexattr_t *mattr, int kind) { return 0; }
+static inline int pthread_mutexattr_destroy(pthread_mutexattr_t *mattr) { return 0; }
typedef struct {} pthread_mutex_t;
#define PTHREAD_MUTEX_INITIALIZER {}
+static inline int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *mattr) { return 0; }
static inline int pthread_mutex_lock(pthread_mutex_t *mutex) { return 0; }
static inline int pthread_mutex_unlock(pthread_mutex_t *mutex) { return 0; }

diff -r 48db4eee7d58 -r d0a544d8a3f3 extras/mini-os/include/posix/syslog.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extras/mini-os/include/posix/syslog.h Mon Sep 01 16:59:43 2008 +0900
@@ -0,0 +1,37 @@
+#ifndef _POSIX_SYSLOG_H
+#define _POSIX_SYSLOG_H
+
+#include <stdarg.h>
+
+#define LOG_PID 0
+#define LOG_CONS 0
+#define LOG_NDELAY 0
+#define LOG_ODELAY 0
+#define LOG_NOWAIT 0
+
+#define LOG_KERN 0
+#define LOG_USER 0
+#define LOG_MAIL 0
+#define LOG_NEWS 0
+#define LOG_UUCP 0
+#define LOG_DAEMON 0
+#define LOG_AUTH 0
+#define LOG_CRON 0
+#define LOG_LPR 0
+
+/* TODO: support */
+#define LOG_EMERG 0
+#define LOG_ALERT 1
+#define LOG_CRIT 2
+#define LOG_ERR 3
+#define LOG_WARNING 4
+#define LOG_NOTICE 5
+#define LOG_INFO 6
+#define LOG_DEBUG 7
+
+void openlog(const char *ident, int option, int facility);
+void syslog(int priority, const char *format, ...);
+void closelog(void);
+void vsyslog(int priority, const char *format, va_list ap);
+
+#endif /* _POSIX_SYSLOG_H */
diff -r 48db4eee7d58 -r d0a544d8a3f3 extras/mini-os/include/xenbus.h
--- a/extras/mini-os/include/xenbus.h Mon Aug 25 19:04:37 2008 +0900
+++ b/extras/mini-os/include/xenbus.h Mon Sep 01 16:59:43 2008 +0900
@@ -83,12 +83,13 @@ char *xenbus_transaction_end(xenbus_tran
int *retry);

/* Read path and parse it as an integer. Returns -1 on error. */
-int xenbus_read_integer(char *path);
+int xenbus_read_integer(const char *path);

/* Contraction of snprintf and xenbus_write(path/node). */
char* xenbus_printf(xenbus_transaction_t xbt,
- char* node, char* path,
- char* fmt, ...);
+ const char* node, const char* path,
+ const char* fmt, ...)
+ __attribute__((__format__(printf, 4, 5)));

/* Reset the XenBus system. */
void fini_xenbus(void);
diff -r 48db4eee7d58 -r d0a544d8a3f3 extras/mini-os/lib/sys.c
--- a/extras/mini-os/lib/sys.c Mon Aug 25 19:04:37 2008 +0900
+++ b/extras/mini-os/lib/sys.c Mon Sep 01 16:59:43 2008 +0900
@@ -1007,6 +1007,96 @@ LWIP_STUB(int, getsockname, (int s, stru
LWIP_STUB(int, getsockname, (int s, struct sockaddr *name, socklen_t *namelen), (s, name, namelen))
#endif

+static char *syslog_ident;
+void openlog(const char *ident, int option, int facility)
+{
+ if (syslog_ident)
+ free(syslog_ident);
+ syslog_ident = strdup(ident);
+}
+
+void vsyslog(int priority, const char *format, va_list ap)
+{
+ printk("%s: ", syslog_ident);
+ print(0, format, ap);
+}
+
+void syslog(int priority, const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ vsyslog(priority, format, ap);
+ va_end(ap);
+}
+
+void closelog(void)
+{
+ free(syslog_ident);
+ syslog_ident = NULL;
+}
+
+void vwarn(const char *format, va_list ap)
+{
+ int the_errno = errno;
+ printk("stubdom: ");
+ if (format) {
+ print(0, format, ap);
+ printk(", ");
+ }
+ printk("%s", strerror(the_errno));
+}
+
+void warn(const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ vwarn(format, ap);
+ va_end(ap);
+}
+
+void verr(int eval, const char *format, va_list ap)
+{
+ vwarn(format, ap);
+ exit(eval);
+}
+
+void err(int eval, const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ verr(eval, format, ap);
+ va_end(ap);
+}
+
+void vwarnx(const char *format, va_list ap)
+{
+ printk("stubdom: ");
+ if (format)
+ print(0, format, ap);
+}
+
+void warnx(const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ vwarnx(format, ap);
+ va_end(ap);
+}
+
+void verrx(int eval, const char *format, va_list ap)
+{
+ vwarnx(format, ap);
+ exit(eval);
+}
+
+void errx(int eval, const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ verrx(eval, format, ap);
+ va_end(ap);
+}
+
int nanosleep(const struct timespec *req, struct timespec *rem)
{
s_time_t start = NOW();
@@ -1115,34 +1205,47 @@ void *mmap(void *start, size_t length, i
} else ASSERT(0);
}

+#define UNMAP_BATCH ((STACK_SIZE / 2) / sizeof(multicall_entry_t))
int munmap(void *start, size_t length)
{
- int i, n = length / PAGE_SIZE;
- multicall_entry_t call[n];
- unsigned char (*data)[PAGE_SIZE] = start;
- int ret;
+ int total = length / PAGE_SIZE;
ASSERT(!((unsigned long)start & ~PAGE_MASK));
- ASSERT(!(length & ~PAGE_MASK));
-
- for (i = 0; i < n; i++) {
- call[i].op = __HYPERVISOR_update_va_mapping;
- call[i].args[0] = (unsigned long) &data[i];
- call[i].args[1] = 0;
- call[i].args[2] = 0;
- call[i].args[3] = UVMF_INVLPG;
- }
-
- ret = HYPERVISOR_multicall(call, n);
- if (ret) {
- errno = -ret;
- return -1;
- }
-
- for (i = 0; i < n; i++) {
- if (call[i].result) {
- errno = call[i].result;
- return -1;
- }
+ while (total) {
+ int n = UNMAP_BATCH;
+ if (n > total)
+ n = total;
+ {
+ int i;
+ multicall_entry_t call[n];
+ unsigned char (*data)[PAGE_SIZE] = start;
+ int ret;
+
+ for (i = 0; i < n; i++) {
+ int arg = 0;
+ call[i].op = __HYPERVISOR_update_va_mapping;
+ call[i].args[arg++] = (unsigned long) &data[i];
+ call[i].args[arg++] = 0;
+#ifdef __i386__
+ call[i].args[arg++] = 0;
+#endif
+ call[i].args[arg++] = UVMF_INVLPG;
+ }
+
+ ret = HYPERVISOR_multicall(call, n);
+ if (ret) {
+ errno = -ret;
+ return -1;
+ }
+
+ for (i = 0; i < n; i++) {
+ if (call[i].result) {
+ errno = call[i].result;
+ return -1;
+ }
+ }
+ }
+ start = (char *)start + n * PAGE_SIZE;
+ total -= n;
}
return 0;
}
diff -r 48db4eee7d58 -r d0a544d8a3f3 extras/mini-os/main.c
--- a/extras/mini-os/main.c Mon Aug 25 19:04:37 2008 +0900
+++ b/extras/mini-os/main.c Mon Sep 01 16:59:43 2008 +0900
@@ -42,7 +42,7 @@ extern char __app_bss_start, __app_bss_e
extern char __app_bss_start, __app_bss_end;
static void call_main(void *p)
{
- char *c;
+ char *c, quote;
#ifdef CONFIG_QEMU
char *domargs, *msg;
#endif
@@ -101,32 +101,53 @@ static void call_main(void *p)

argc = 1;

-#define PARSE_ARGS(ARGS,START,END) \
+#define PARSE_ARGS(ARGS,START,QUOTE,END) \
c = ARGS; \
+ quote = 0; \
while (*c) { \
if (*c != ' ') { \
START; \
- while (*c && *c != ' ') \
+ while (*c) { \
+ if (quote) { \
+ if (*c == quote) { \
+ quote = 0; \
+ QUOTE; \
+ continue; \
+ } \
+ } else if (*c == ' ') \
+ break; \
+ if (*c == '"' || *c == '\'') { \
+ quote = *c; \
+ QUOTE; \
+ continue; \
+ } \
c++; \
+ } \
} else { \
END; \
while (*c == ' ') \
c++; \
} \
+ } \
+ if (quote) {\
+ printk("Warning: unterminated quotation %c\n", quote); \
+ quote = 0; \
}
+#define PARSE_ARGS_COUNT(ARGS) PARSE_ARGS(ARGS, argc++, c++, )
+#define PARSE_ARGS_STORE(ARGS) PARSE_ARGS(ARGS, argv[argc++] = c, memmove(c, c + 1, strlen(c + 1) + 1), *c++ = 0)

- PARSE_ARGS((char*)start_info.cmd_line, argc++, );
+ PARSE_ARGS_COUNT((char*)start_info.cmd_line);
#ifdef CONFIG_QEMU
- PARSE_ARGS(domargs, argc++, );
+ PARSE_ARGS_COUNT(domargs);
#endif

argv = alloca((argc + 1) * sizeof(char *));
argv[0] = "main";
argc = 1;

- PARSE_ARGS((char*)start_info.cmd_line, argv[argc++] = c, *c++ = 0)
+ PARSE_ARGS_STORE((char*)start_info.cmd_line)
#ifdef CONFIG_QEMU
- PARSE_ARGS(domargs, argv[argc++] = c, *c++ = 0)
+ PARSE_ARGS_STORE(domargs)
#endif

argv[argc] = NULL;
diff -r 48db4eee7d58 -r d0a544d8a3f3 extras/mini-os/xenbus/xenbus.c
--- a/extras/mini-os/xenbus/xenbus.c Mon Aug 25 19:04:37 2008 +0900
+++ b/extras/mini-os/xenbus/xenbus.c Mon Sep 01 16:59:43 2008 +0900
@@ -633,7 +633,7 @@ xenbus_transaction_end(xenbus_transactio
return NULL;
}

-int xenbus_read_integer(char *path)
+int xenbus_read_integer(const char *path)
{
char *res, *buf;
int t;
@@ -650,8 +650,8 @@ int xenbus_read_integer(char *path)
}

char* xenbus_printf(xenbus_transaction_t xbt,
- char* node, char* path,
- char* fmt, ...)
+ const char* node, const char* path,
+ const char* fmt, ...)
{
#define BUFFER_SIZE 256
char fullpath[BUFFER_SIZE];
diff -r 48db4eee7d58 -r d0a544d8a3f3 stubdom/Makefile
--- a/stubdom/Makefile Mon Aug 25 19:04:37 2008 +0900
+++ b/stubdom/Makefile Mon Sep 01 16:59:43 2008 +0900
@@ -9,7 +9,7 @@ include $(XEN_ROOT)/Config.mk

override CONFIG_QEMU=ioemu

-IOEMU_OPTIONS=--disable-sdl --disable-opengl --disable-gfx-check --disable-vnc-tls --disable-brlapi --disable-kqemu
+IOEMU_OPTIONS=--disable-sdl --disable-opengl --disable-vnc-tls --disable-brlapi --disable-kqemu
ZLIB_URL?=http://www.zlib.net
ZLIB_VERSION=1.2.3
LIBPCI_URL?=http://www.kernel.org/pub/software/utils/pciutils
@@ -53,7 +53,7 @@ TARGET_CFLAGS += $(call cc-option,$(CC),
TARGET_CFLAGS += $(call cc-option,$(CC),-fno-stack-protector-all,)

# Do not use host headers and libs
-GCC_INSTALL = $(shell gcc -print-search-dirs | sed -n -e 's/install: \(.*\)/\1/p')
+GCC_INSTALL = $(shell LANG=C gcc -print-search-dirs | sed -n -e 's/install: \(.*\)/\1/p')
TARGET_CPPFLAGS += -U __linux__ -U __FreeBSD__ -U __sun__
TARGET_CPPFLAGS += -nostdinc
TARGET_CPPFLAGS += -isystem $(CURDIR)/$(MINI_OS)/include/posix
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/Makefile
--- a/tools/Makefile Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/Makefile Mon Sep 01 16:59:43 2008 +0900
@@ -59,8 +59,7 @@ ifneq ($(XEN_COMPILE_ARCH),$(XEN_TARGET_
ifneq ($(XEN_COMPILE_ARCH),$(XEN_TARGET_ARCH))
IOEMU_CONFIGURE_CROSS ?= --cpu=$(XEN_TARGET_ARCH) \
--cross-prefix=$(CROSS_COMPILE) \
- --interp-prefix=$(CROSS_SYS_ROOT) \
- --install=$(CURDIR)/cross-install
+ --interp-prefix=$(CROSS_SYS_ROOT)
endif

ioemu/config-host.mak:
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/Rules.mk
--- a/tools/Rules.mk Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/Rules.mk Mon Sep 01 16:59:43 2008 +0900
@@ -4,6 +4,9 @@ all:
all:

include $(XEN_ROOT)/Config.mk
+
+export _INSTALL := $(INSTALL)
+INSTALL = $(XEN_ROOT)/tools/cross-install

XEN_INCLUDE = $(XEN_ROOT)/tools/include
XEN_XC = $(XEN_ROOT)/tools/python/xen/lowlevel/xc
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/cross-install
--- a/tools/cross-install Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/cross-install Mon Sep 01 16:59:43 2008 +0900
@@ -5,4 +5,4 @@ if [ -n "$CROSS_BIN_PATH" ]; then
PATH="$CROSS_BIN_PATH:$PATH"
fi

-exec install "$@"
+exec $_INSTALL "$@"
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/examples/xmexample.pv-grub
--- a/tools/examples/xmexample.pv-grub Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/examples/xmexample.pv-grub Mon Sep 01 16:59:43 2008 +0900
@@ -25,7 +25,7 @@ extra = "(hd0,0)/boot/grub/menu.lst"
# WARNING: Creating a domain with insufficient memory may cause out of
# memory errors. The domain needs enough memory to boot kernel
# and modules. Allocating less than 32MBs is not recommended.
-memory = 64
+memory = 128

# A name for your domain. All domains must have different names.
name = "ExampleDomain"
@@ -119,32 +119,6 @@ disk = [ 'phy:hda1,hda1,w' ]
#vtpm = [ 'instance=1,backend=0' ]

#----------------------------------------------------------------------------
-# Set the kernel command line for the new domain.
-# You only need to define the IP parameters and hostname if the domain's
-# IP config doesn't, e.g. in ifcfg-eth0 or via DHCP.
-# You can use 'extra' to set the runlevel and custom environment
-# variables used by custom rc scripts (e.g. VMID=, usr= ).
-
-# Set if you want dhcp to allocate the IP address.
-#dhcp="dhcp"
-# Set netmask.
-#netmask=
-# Set default gateway.
-#gateway=
-# Set the hostname.
-#hostname= "vm%d" % vmid
-
-# Set root device.
-root = "/dev/hda1 ro"
-
-# Root device for nfs.
-#root = "/dev/nfs"
-# The nfs server.
-#nfs_server = '192.0.2.1'
-# Root directory on the nfs server.
-#nfs_root = '/full/path/to/root/directory'
-
-#----------------------------------------------------------------------------
# Configure the behaviour when a domain exits. There are three 'reasons'
# for a domain to stop: poweroff, reboot, and crash. For each of these you
# may specify:
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/firmware/hvmloader/32bitbios_support.c
--- a/tools/firmware/hvmloader/32bitbios_support.c Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/firmware/hvmloader/32bitbios_support.c Mon Sep 01 16:59:43 2008 +0900
@@ -76,7 +76,7 @@ static void relocate_32bitbios(char *elf
*/
reloc_size = reloc_off;
printf("%d bytes of ROMBIOS high-memory extensions:\n", reloc_size);
- highbiosarea = (char *)(long)e820_malloc(reloc_size);
+ highbiosarea = (char *)(long)e820_malloc(reloc_size, 0);
BUG_ON(highbiosarea == NULL);
printf(" Relocating to 0x%x-0x%x ... ",
(uint32_t)&highbiosarea[0],
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/firmware/hvmloader/acpi/Makefile
--- a/tools/firmware/hvmloader/acpi/Makefile Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/firmware/hvmloader/acpi/Makefile Mon Sep 01 16:59:43 2008 +0900
@@ -22,8 +22,8 @@ H_SRC = $(wildcard *.h)
H_SRC = $(wildcard *.h)
OBJS = $(patsubst %.c,%.o,$(C_SRC))

-IASL_VER = acpica-unix-20060707
-IASL_URL = http://developer.intel.com/technology/iapc/acpi/downloads/$(IASL_VER).tar.gz
+IASL_VER = acpica-unix-20080729
+IASL_URL = http://acpica.org/download/$(IASL_VER).tar.gz

CFLAGS += -I. -I.. $(CFLAGS_include)

@@ -48,7 +48,7 @@ iasl:
@echo "ACPI ASL compiler(iasl) is needed"
@echo "Download Intel ACPI CA"
@echo "If wget failed, please download and compile manually from"
- @echo "http://developer.intel.com/technology/iapc/acpi/downloads.htm"
+ @echo "http://acpica.org/downloads/"
@echo
wget $(IASL_URL)
tar xzf $(IASL_VER).tar.gz
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/firmware/hvmloader/acpi/build.c
--- a/tools/firmware/hvmloader/acpi/build.c Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/firmware/hvmloader/acpi/build.c Mon Sep 01 16:59:43 2008 +0900
@@ -233,7 +233,7 @@ static int construct_secondary_tables(ui
tcpa->header.oem_revision = ACPI_OEM_REVISION;
tcpa->header.creator_id = ACPI_CREATOR_ID;
tcpa->header.creator_revision = ACPI_CREATOR_REVISION;
- tcpa->lasa = e820_malloc(ACPI_2_0_TCPA_LAML_SIZE);
+ tcpa->lasa = e820_malloc(ACPI_2_0_TCPA_LAML_SIZE, 0);
if ( tcpa->lasa )
{
tcpa->laml = ACPI_2_0_TCPA_LAML_SIZE;
@@ -363,7 +363,7 @@ void acpi_build_tables(void)
memset(buf, 0, high_sz);

/* Allocate data area and set up ACPI tables there. */
- buf = (uint8_t *)e820_malloc(high_sz);
+ buf = (uint8_t *)e820_malloc(high_sz, 0);
__acpi_build_tables(buf, &low_sz, &high_sz);

printf(" - Lo data: %08lx-%08lx\n"
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/firmware/hvmloader/hvmloader.c
--- a/tools/firmware/hvmloader/hvmloader.c Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/firmware/hvmloader/hvmloader.c Mon Sep 01 16:59:43 2008 +0900
@@ -243,6 +243,13 @@ static void pci_setup(void)
bars[i].bar_sz = bar_sz;

nr_bars++;
+
+ /* Skip the upper-half of the address for a 64-bit BAR. */
+ if ( (bar_data & (PCI_BASE_ADDRESS_SPACE |
+ PCI_BASE_ADDRESS_MEM_TYPE_MASK)) ==
+ (PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_TYPE_64) )
+ bar++;
}

/* Map the interrupt. */
@@ -430,11 +437,13 @@ static void cmos_write_memory_size(void)
cmos_outb(0x35, (uint8_t)( alt_mem >> 8));
}

-static void init_xen_platform_io_base(void)
+static uint16_t init_xen_platform_io_base(void)
{
struct bios_info *bios_info = (struct bios_info *)ACPI_PHYSICAL_ADDRESS;
uint32_t devfn, bar_data;
uint16_t vendor_id, device_id;
+
+ bios_info->xen_pfiob = 0;

for ( devfn = 0; devfn < 128; devfn++ )
{
@@ -445,12 +454,16 @@ static void init_xen_platform_io_base(vo
bar_data = pci_readl(devfn, PCI_BASE_ADDRESS_0);
bios_info->xen_pfiob = bar_data & PCI_BASE_ADDRESS_IO_MASK;
}
+
+ return bios_info->xen_pfiob;
}

int main(void)
{
int vgabios_sz = 0, etherboot_sz = 0, rombios_sz, smbios_sz;
int extboot_sz = 0;
+ uint32_t vga_ram = 0;
+ uint16_t xen_pfiob;

printf("HVM Loader\n");

@@ -495,6 +508,12 @@ int main(void)
default:
printf("No emulated VGA adaptor ...\n");
break;
+ }
+
+ if ( virtual_vga != VGA_none )
+ {
+ vga_ram = e820_malloc(8 << 20, 4096);
+ printf("VGA RAM at %08x\n", vga_ram);
}

etherboot_sz = scan_etherboot_nic((void*)ETHERBOOT_PHYSICAL_ADDRESS);
@@ -537,7 +556,9 @@ int main(void)
ROMBIOS_PHYSICAL_ADDRESS,
ROMBIOS_PHYSICAL_ADDRESS + rombios_sz - 1);

- init_xen_platform_io_base();
+ xen_pfiob = init_xen_platform_io_base();
+ if ( xen_pfiob && vga_ram )
+ outl(xen_pfiob + 4, vga_ram);

printf("Invoking ROMBIOS ...\n");
return 0;
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/firmware/hvmloader/util.c
--- a/tools/firmware/hvmloader/util.c Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/firmware/hvmloader/util.c Mon Sep 01 16:59:43 2008 +0900
@@ -325,35 +325,34 @@ static void e820_collapse(void)
}
}

-uint32_t e820_malloc(uint32_t size)
+uint32_t e820_malloc(uint32_t size, uint32_t align)
{
uint32_t addr;
int i;
struct e820entry *ent = (struct e820entry *)HVM_E820;

- /* Align allocation request to a reasonable boundary (1kB). */
- size = (size + 1023) & ~1023;
+ /* Align to at least one kilobyte. */
+ if ( align < 1024 )
+ align = 1024;

for ( i = *HVM_E820_NR - 1; i >= 0; i-- )
{
- addr = ent[i].addr;
+ addr = (ent[i].addr + ent[i].size - size) & ~(align-1);
if ( (ent[i].type != E820_RAM) || /* not ram? */
- (ent[i].size < size) || /* too small? */
- (addr != ent[i].addr) || /* starts above 4gb? */
+ (addr < ent[i].addr) || /* too small or starts above 4gb? */
((addr + size) < addr) ) /* ends above 4gb? */
continue;

- if ( ent[i].size != size )
+ if ( addr != ent[i].addr )
{
memmove(&ent[i+1], &ent[i], (*HVM_E820_NR-i) * sizeof(*ent));
(*HVM_E820_NR)++;
- ent[i].size -= size;
- addr += ent[i].size;
+ ent[i].size = addr - ent[i].addr;
+ ent[i+1].addr = addr;
+ ent[i+1].size -= ent[i].size;
i++;
}

- ent[i].addr = addr;
- ent[i].size = size;
ent[i].type = E820_RESERVED;

e820_collapse();
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/firmware/hvmloader/util.h
--- a/tools/firmware/hvmloader/util.h Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/firmware/hvmloader/util.h Mon Sep 01 16:59:43 2008 +0900
@@ -132,7 +132,7 @@ int vprintf(const char *fmt, va_list ap)
int vprintf(const char *fmt, va_list ap);

/* Reserve a RAM region in the e820 table. */
-uint32_t e820_malloc(uint32_t size);
+uint32_t e820_malloc(uint32_t size, uint32_t align);

/* Prepare the 32bit BIOS */
void highbios_setup(void);
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/ioemu/hw/cirrus_vga.c
--- a/tools/ioemu/hw/cirrus_vga.c Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/ioemu/hw/cirrus_vga.c Mon Sep 01 16:59:43 2008 +0900
@@ -2543,34 +2543,28 @@ static CPUWriteMemoryFunc *cirrus_linear
cirrus_linear_bitblt_writel,
};

-static void *set_vram_mapping(unsigned long begin, unsigned long end)
-{
- xen_pfn_t *extent_start = NULL;
- unsigned long nr_extents;
- void *vram_pointer = NULL;
- int i;
-
- /* align begin and end address */
- begin = begin & TARGET_PAGE_MASK;
- end = begin + VGA_RAM_SIZE;
- end = (end + TARGET_PAGE_SIZE -1 ) & TARGET_PAGE_MASK;
- nr_extents = (end - begin) >> TARGET_PAGE_BITS;
-
- extent_start = malloc(sizeof(xen_pfn_t) * nr_extents);
- if (extent_start == NULL) {
- fprintf(stderr, "Failed malloc on set_vram_mapping\n");
- return NULL;
- }
-
- memset(extent_start, 0, sizeof(xen_pfn_t) * nr_extents);
-
- for (i = 0; i < nr_extents; i++)
- extent_start[i] = (begin + i * TARGET_PAGE_SIZE) >> TARGET_PAGE_BITS;
-
- if (set_mm_mapping(xc_handle, domid, nr_extents, 0, extent_start) < 0) {
- fprintf(logfile, "Failed set_mm_mapping\n");
- free(extent_start);
- return NULL;
+static void set_vram_mapping(CirrusVGAState *s, unsigned long begin, unsigned long end)
+{
+ unsigned long i;
+ struct xen_add_to_physmap xatp;
+ int rc;
+
+ if (end > begin + VGA_RAM_SIZE)
+ end = begin + VGA_RAM_SIZE;
+
+ fprintf(logfile,"mapping vram to %lx - %lx\n", begin, end);
+
+ xatp.domid = domid;
+ xatp.space = XENMAPSPACE_mfn;
+
+ for (i = 0; i < (end - begin) >> TARGET_PAGE_BITS; i++) {
+ xatp.idx = s->vram_mfns[i];
+ xatp.gpfn = (begin >> TARGET_PAGE_BITS) + i;
+ rc = xc_memory_op(xc_handle, XENMEM_add_to_physmap, &xatp);
+ if (rc) {
+ fprintf(stderr, "add_to_physmap MFN %"PRI_xen_pfn" to PFN %"PRI_xen_pfn" failed: %d\n", xatp.idx, xatp.gpfn, rc);
+ return;
+ }
}

(void)xc_domain_pin_memory_cacheattr(
@@ -2578,61 +2572,42 @@ static void *set_vram_mapping(unsigned l
begin >> TARGET_PAGE_BITS,
end >> TARGET_PAGE_BITS,
XEN_DOMCTL_MEM_CACHEATTR_WB);
-
- vram_pointer = xc_map_foreign_pages(xc_handle, domid,
- PROT_READ|PROT_WRITE,
- extent_start, nr_extents);
- if (vram_pointer == NULL) {
- fprintf(logfile, "xc_map_foreign_batch vgaram returned error %d\n",
- errno);
- free(extent_start);
- return NULL;
- }
-
- memset(vram_pointer, 0, nr_extents * TARGET_PAGE_SIZE);
-
-#ifdef CONFIG_STUBDOM
- xenfb_pv_display_start(vram_pointer);
-#endif
-
- free(extent_start);
-
- return vram_pointer;
-}
-
-static int unset_vram_mapping(unsigned long begin, unsigned long end,
- void *mapping)
-{
- xen_pfn_t *extent_start = NULL;
- unsigned long nr_extents;
- int i;
-
- /* align begin and end address */
-
- end = begin + VGA_RAM_SIZE;
- begin = begin & TARGET_PAGE_MASK;
- end = (end + TARGET_PAGE_SIZE -1 ) & TARGET_PAGE_MASK;
- nr_extents = (end - begin) >> TARGET_PAGE_BITS;
-
- extent_start = malloc(sizeof(xen_pfn_t) * nr_extents);
-
- if (extent_start == NULL) {
- fprintf(stderr, "Failed malloc on set_mm_mapping\n");
- return -1;
- }
-
- /* Drop our own references to the vram pages */
- munmap(mapping, nr_extents * TARGET_PAGE_SIZE);
-
- /* Now drop the guest's mappings */
- memset(extent_start, 0, sizeof(xen_pfn_t) * nr_extents);
- for (i = 0; i < nr_extents; i++)
- extent_start[i] = (begin + (i * TARGET_PAGE_SIZE)) >> TARGET_PAGE_BITS;
- unset_mm_mapping(xc_handle, domid, nr_extents, 0, extent_start);
-
- free(extent_start);
-
- return 0;
+}
+
+static void unset_vram_mapping(CirrusVGAState *s, unsigned long begin, unsigned long end)
+{
+ if (s->stolen_vram_addr) {
+ /* We can put it there for xend to save it efficiently */
+ set_vram_mapping(s, s->stolen_vram_addr, s->stolen_vram_addr + VGA_RAM_SIZE);
+ } else {
+ /* Old image, we have to unmap them completely */
+ struct xen_remove_from_physmap xrfp;
+ unsigned long i;
+ int rc;
+
+ if (end > begin + VGA_RAM_SIZE)
+ end = begin + VGA_RAM_SIZE;
+
+ fprintf(logfile,"unmapping vram from %lx - %lx\n", begin, end);
+
+ xrfp.domid = domid;
+
+ for (i = 0; i < (end - begin) >> TARGET_PAGE_BITS; i++) {
+ xrfp.gpfn = (begin >> TARGET_PAGE_BITS) + i;
+ rc = xc_memory_op(xc_handle, XENMEM_remove_from_physmap, &xrfp);
+ if (rc) {
+ fprintf(stderr, "remove_from_physmap PFN %"PRI_xen_pfn" failed: %d\n", xrfp.gpfn, rc);
+ return;
+ }
+ }
+ }
+}
+
+void cirrus_restart_acc(CirrusVGAState *s)
+{
+ set_vram_mapping(s, s->lfb_addr, s->lfb_end);
+ s->map_addr = s->lfb_addr;
+ s->map_end = s->lfb_end;
}

/* Compute the memory access functions */
@@ -2654,17 +2629,7 @@ static void cirrus_update_memory_access(
mode = s->gr[0x05] & 0x7;
if (mode < 4 || mode > 5 || ((s->gr[0x0B] & 0x4) == 0)) {
if (s->lfb_addr && s->lfb_end && !s->map_addr) {
- void *vram_pointer, *old_vram;
-
- vram_pointer = set_vram_mapping(s->lfb_addr,
- s->lfb_end);
- if (!vram_pointer)
- fprintf(stderr, "NULL vram_pointer\n");
- else {
- old_vram = vga_update_vram((VGAState *)s, vram_pointer,
- VGA_RAM_SIZE);
- qemu_free(old_vram);
- }
+ set_vram_mapping(s, s->lfb_addr, s->lfb_end);
s->map_addr = s->lfb_addr;
s->map_end = s->lfb_end;
}
@@ -2674,14 +2639,7 @@ static void cirrus_update_memory_access(
} else {
generic_io:
if (s->lfb_addr && s->lfb_end && s->map_addr) {
- void *old_vram;
-
- old_vram = vga_update_vram((VGAState *)s, NULL, VGA_RAM_SIZE);
-
- unset_vram_mapping(s->lfb_addr,
- s->lfb_end,
- old_vram);
-
+ unset_vram_mapping(s, s->map_addr, s->map_end);
s->map_addr = s->map_end = 0;
}
s->cirrus_linear_write[0] = cirrus_linear_writeb;
@@ -3040,36 +2998,6 @@ static CPUWriteMemoryFunc *cirrus_mmio_w
cirrus_mmio_writel,
};

-void cirrus_stop_acc(CirrusVGAState *s)
-{
- if (s->map_addr){
- int error;
- s->map_addr = 0;
- error = unset_vram_mapping(s->lfb_addr,
- s->lfb_end, s->vram_ptr);
- fprintf(stderr, "cirrus_stop_acc:unset_vram_mapping.\n");
- }
-}
-
-void cirrus_restart_acc(CirrusVGAState *s)
-{
- if (s->lfb_addr && s->lfb_end) {
- void *vram_pointer, *old_vram;
- fprintf(stderr, "cirrus_vga_load:re-enable vga acc.lfb_addr=0x%lx, lfb_end=0x%lx.\n",
- s->lfb_addr, s->lfb_end);
- vram_pointer = set_vram_mapping(s->lfb_addr ,s->lfb_end);
- if (!vram_pointer){
- fprintf(stderr, "cirrus_vga_load:NULL vram_pointer\n");
- } else {
- old_vram = vga_update_vram((VGAState *)s, vram_pointer,
- VGA_RAM_SIZE);
- qemu_free(old_vram);
- s->map_addr = s->lfb_addr;
- s->map_end = s->lfb_end;
- }
- }
-}
-
/* load/save state */

static void cirrus_vga_save(QEMUFile *f, void *opaque)
@@ -3118,7 +3046,10 @@ static void cirrus_vga_save(QEMUFile *f,
qemu_put_8s(f, &vga_acc);
qemu_put_be64s(f, (uint64_t*)&s->lfb_addr);
qemu_put_be64s(f, (uint64_t*)&s->lfb_end);
- qemu_put_buffer(f, s->vram_ptr, VGA_RAM_SIZE);
+ qemu_put_be64s(f, &s->stolen_vram_addr);
+ if (!s->stolen_vram_addr && !vga_acc)
+ /* Old guest: VRAM is not mapped, we have to save it ourselves */
+ qemu_put_buffer(f, s->vram_ptr, VGA_RAM_SIZE);
}

static int cirrus_vga_load(QEMUFile *f, void *opaque, int version_id)
@@ -3127,7 +3058,7 @@ static int cirrus_vga_load(QEMUFile *f,
uint8_t vga_acc = 0;
int ret;

- if (version_id > 2)
+ if (version_id > 3)
return -EINVAL;

if (s->pci_dev && version_id >= 2) {
@@ -3173,9 +3104,20 @@ static int cirrus_vga_load(QEMUFile *f,
qemu_get_8s(f, &vga_acc);
qemu_get_be64s(f, (uint64_t*)&s->lfb_addr);
qemu_get_be64s(f, (uint64_t*)&s->lfb_end);
- qemu_get_buffer(f, s->vram_ptr, VGA_RAM_SIZE);
- if (vga_acc){
- cirrus_restart_acc(s);
+ if (version_id >= 3) {
+ qemu_get_be64s(f, &s->stolen_vram_addr);
+ if (!s->stolen_vram_addr && !vga_acc) {
+ /* Old guest, VRAM is not mapped, we have to restore it ourselves */
+ qemu_get_buffer(f, s->vram_ptr, VGA_RAM_SIZE);
+ xen_vga_populate_vram(s->lfb_addr);
+ } else
+ xen_vga_vram_map(vga_acc ? s->lfb_addr : s->stolen_vram_addr, 0);
+ } else {
+ /* Old image, we have to populate and restore VRAM ourselves */
+ xen_vga_populate_vram(s->lfb_addr);
+ qemu_get_buffer(f, s->vram_ptr, VGA_RAM_SIZE);
+ if (vga_acc)
+ cirrus_restart_acc(s);
}

/* force refresh */
@@ -3297,7 +3239,7 @@ static void cirrus_init_common(CirrusVGA
s->cursor_invalidate = cirrus_cursor_invalidate;
s->cursor_draw_line = cirrus_cursor_draw_line;

- register_savevm("cirrus_vga", 0, 2, cirrus_vga_save, cirrus_vga_load, s);
+ register_savevm("cirrus_vga", 0, 3, cirrus_vga_save, cirrus_vga_load, s);
}

/***************************************
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/ioemu/hw/ide.c
--- a/tools/ioemu/hw/ide.c Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/ioemu/hw/ide.c Mon Sep 01 16:59:43 2008 +0900
@@ -1108,14 +1108,14 @@ static void ide_flush_cb(void *opaque, i
return;
}
else
- s->status = READY_STAT;
+ s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s);
}

static void ide_atapi_cmd_ok(IDEState *s)
{
s->error = 0;
- s->status = READY_STAT;
+ s->status = READY_STAT | SEEK_STAT;
s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
ide_set_irq(s);
}
@@ -1229,7 +1229,7 @@ static void ide_atapi_cmd_reply_end(IDES
if (s->packet_transfer_size <= 0) {
/* end of transfer */
ide_transfer_stop(s);
- s->status = READY_STAT;
+ s->status = READY_STAT | SEEK_STAT;
s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
ide_set_irq(s);
#ifdef DEBUG_IDE_ATAPI
@@ -1307,10 +1307,10 @@ static void ide_atapi_cmd_reply(IDEState
s->io_buffer_index = 0;

if (s->atapi_dma) {
- s->status = READY_STAT | DRQ_STAT;
+ s->status = READY_STAT | SEEK_STAT | DRQ_STAT;
ide_dma_start(s, ide_atapi_cmd_read_dma_cb);
} else {
- s->status = READY_STAT;
+ s->status = READY_STAT | SEEK_STAT;
ide_atapi_cmd_reply_end(s);
}
}
@@ -1325,7 +1325,7 @@ static void ide_atapi_cmd_read_pio(IDESt
s->io_buffer_index = sector_size;
s->cd_sector_size = sector_size;

- s->status = READY_STAT;
+ s->status = READY_STAT | SEEK_STAT;
ide_atapi_cmd_reply_end(s);
}

@@ -1368,7 +1368,7 @@ static void ide_atapi_cmd_read_dma_cb(vo
}

if (s->packet_transfer_size <= 0) {
- s->status = READY_STAT;
+ s->status = READY_STAT | SEEK_STAT;
s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
ide_set_irq(s);
eot:
@@ -1418,7 +1418,7 @@ static void ide_atapi_cmd_read_dma(IDESt
s->cd_sector_size = sector_size;

/* XXX: check if BUSY_STAT should be set */
- s->status = READY_STAT | DRQ_STAT | BUSY_STAT;
+ s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
ide_dma_start(s, ide_atapi_cmd_read_dma_cb);
}

@@ -1886,7 +1886,7 @@ static void ide_ioport_write(void *opaqu
ide_abort_command(s);
} else {
s->mult_sectors = s->nsector;
- s->status = READY_STAT;
+ s->status = READY_STAT | SEEK_STAT;
}
ide_set_irq(s);
break;
@@ -1896,7 +1896,7 @@ static void ide_ioport_write(void *opaqu
case WIN_VERIFY_ONCE:
/* do sector number check ? */
ide_cmd_lba48_transform(s, lba48);
- s->status = READY_STAT;
+ s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s);
break;
case WIN_READ_EXT:
@@ -1965,12 +1965,12 @@ static void ide_ioport_write(void *opaqu
case WIN_READ_NATIVE_MAX:
ide_cmd_lba48_transform(s, lba48);
ide_set_sector(s, s->nb_sectors - 1);
- s->status = READY_STAT;
+ s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s);
break;
case WIN_CHECKPOWERMODE1:
s->nsector = 0xff; /* device active or idle */
- s->status = READY_STAT;
+ s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s);
break;
case WIN_SETFEATURES:
@@ -2070,7 +2070,7 @@ static void ide_ioport_write(void *opaqu
/* overlapping commands not supported */
if (s->feature & 0x02)
goto abort_cmd;
- s->status = READY_STAT;
+ s->status = READY_STAT | SEEK_STAT;
s->atapi_dma = s->feature & 1;
s->nsector = 1;
ide_transfer_start(s, s->io_buffer, ATAPI_PACKET_SIZE,
@@ -2289,7 +2289,7 @@ static void ide_reset(IDEState *s)
s->mult_sectors = MAX_MULT_SECTORS;
s->cur_drive = s;
s->select = 0xa0;
- s->status = READY_STAT;
+ s->status = READY_STAT | SEEK_STAT;
ide_set_signature(s);
/* init the transfer handler so that 0xffff is returned on data
accesses */
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/ioemu/hw/pass-through.c
--- a/tools/ioemu/hw/pass-through.c Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/ioemu/hw/pass-through.c Mon Sep 01 16:59:43 2008 +0900
@@ -2340,11 +2340,6 @@ static int pt_bar_reg_write(struct pt_de
return -1;
}

- /* always keep the emulate register value to 0,
- * because hvmloader does not support high MMIO for now.
- */
- cfg_entry->data = 0;
-
/* never mapping the 'empty' upper region,
* because we'll do it enough for the lower region.
*/
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/ioemu/hw/vga.c
--- a/tools/ioemu/hw/vga.c Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/ioemu/hw/vga.c Mon Sep 01 16:59:43 2008 +0900
@@ -23,6 +23,7 @@
*/
#include "vl.h"
#include "vga_int.h"
+#include <sys/mman.h>

//#define DEBUG_VGA
//#define DEBUG_VGA_MEM
@@ -1776,7 +1777,10 @@ static void vga_save(QEMUFile *f, void *
#endif
vram_size = s->vram_size;
qemu_put_be32s(f, &vram_size);
- qemu_put_buffer(f, s->vram_ptr, s->vram_size);
+ qemu_put_be64s(f, &s->stolen_vram_addr);
+ if (!s->stolen_vram_addr)
+ /* Old guest: VRAM is not mapped, we have to save it ourselves */
+ qemu_put_buffer(f, s->vram_ptr, VGA_RAM_SIZE);
}

static int vga_load(QEMUFile *f, void *opaque, int version_id)
@@ -1788,7 +1792,7 @@ static int vga_load(QEMUFile *f, void *o
int i;
#endif

- if (version_id > 3)
+ if (version_id > 4)
return -EINVAL;

if (s->pci_dev && version_id >= 2) {
@@ -1839,7 +1843,14 @@ static int vga_load(QEMUFile *f, void *o
qemu_get_be32s(f, &vram_size);
if (vram_size != s->vram_size)
return -EINVAL;
- qemu_get_buffer(f, s->vram_ptr, s->vram_size);
+ if (version_id >= 4) {
+ qemu_get_be64s(f, &s->stolen_vram_addr);
+ if (s->stolen_vram_addr)
+ xen_vga_vram_map(s->stolen_vram_addr, 0);
+ }
+ /* Old guest, VRAM is not mapped, we have to restore it ourselves */
+ if (!s->stolen_vram_addr)
+ qemu_get_buffer(f, s->vram_ptr, s->vram_size);
}

/* force refresh */
@@ -1994,6 +2005,100 @@ void vga_bios_init(VGAState *s)
/* TODO: add vbe support if enabled */
}

+
+static VGAState *xen_vga_state;
+
+/* When loading old images we have to populate the video ram ourselves */
+void xen_vga_populate_vram(uint64_t vram_addr)
+{
+ unsigned long nr_pfn;
+ struct xen_remove_from_physmap xrfp;
+ xen_pfn_t *pfn_list;
+ int i;
+ int rc;
+
+ fprintf(logfile, "populating video RAM at %lx\n", vram_addr);
+
+ nr_pfn = VGA_RAM_SIZE >> TARGET_PAGE_BITS;
+
+ pfn_list = malloc(sizeof(*pfn_list) * nr_pfn);
+
+ for (i = 0; i < nr_pfn; i++)
+ pfn_list[i] = (vram_addr >> TARGET_PAGE_BITS) + i;
+
+ if (xc_domain_memory_populate_physmap(xc_handle, domid, nr_pfn, 0, 0, pfn_list)) {
+ fprintf(stderr, "Failed to populate video ram\n");
+ exit(1);
+ }
+ free(pfn_list);
+
+ xen_vga_vram_map(vram_addr, 0);
+
+ /* Unmap them from the guest for now. */
+ xrfp.domid = domid;
+ for (i = 0; i < nr_pfn; i++) {
+ xrfp.gpfn = (vram_addr >> TARGET_PAGE_BITS) + i;
+ rc = xc_memory_op(xc_handle, XENMEM_remove_from_physmap, &xrfp);
+ if (rc) {
+ fprintf(stderr, "remove_from_physmap PFN %"PRI_xen_pfn" failed: %d\n", xrfp.gpfn, rc);
+ break;
+ }
+ }
+}
+
+/* Called once video memory has been allocated in the GPFN space */
+void xen_vga_vram_map(uint64_t vram_addr, int copy)
+{
+ unsigned long nr_pfn;
+ xen_pfn_t *pfn_list;
+ int i;
+ void *vram;
+
+ fprintf(logfile, "mapping video RAM from %lx\n", vram_addr);
+
+ nr_pfn = VGA_RAM_SIZE >> TARGET_PAGE_BITS;
+
+ pfn_list = malloc(sizeof(*pfn_list) * nr_pfn);
+
+ for (i = 0; i < nr_pfn; i++)
+ pfn_list[i] = (vram_addr >> TARGET_PAGE_BITS) + i;
+
+ vram = xc_map_foreign_pages(xc_handle, domid,
+ PROT_READ|PROT_WRITE,
+ pfn_list, nr_pfn);
+
+ if (!vram) {
+ fprintf(stderr, "Failed to map vram\n");
+ exit(1);
+ }
+
+ if (xc_domain_memory_translate_gpfn_list(xc_handle, domid, nr_pfn,
+ pfn_list, pfn_list)) {
+ fprintf(stderr, "Failed translation in xen_vga_vram_addr\n");
+ exit(1);
+ }
+
+ if (copy)
+ memcpy(vram, xen_vga_state->vram_ptr, VGA_RAM_SIZE);
+ qemu_free(xen_vga_state->vram_ptr);
+ xen_vga_state->vram_ptr = vram;
+ xen_vga_state->vram_mfns = pfn_list;
+#ifdef CONFIG_STUBDOM
+ xenfb_pv_display_start(vram);
+#endif
+}
+
+/* Called at boot time when the BIOS has allocated video RAM */
+void xen_vga_stolen_vram_addr(uint64_t stolen_vram_addr)
+{
+ fprintf(logfile, "stolen video RAM at %lx\n", stolen_vram_addr);
+
+ xen_vga_state->stolen_vram_addr = stolen_vram_addr;
+
+ /* And copy from the initialization value */
+ xen_vga_vram_map(stolen_vram_addr, 1);
+}
+
/* when used on xen environment, the vga_ram_base is not used */
void vga_common_init(VGAState *s, DisplayState *ds, uint8_t *vga_ram_base,
unsigned long vga_ram_offset, int vga_ram_size)
@@ -2025,13 +2130,9 @@ void vga_common_init(VGAState *s, Displa

vga_reset(s);

- /* Video RAM must be page-aligned for PVFB memory sharing */
- s->vram_ptr = s->vram_alloc = qemu_memalign(TARGET_PAGE_SIZE, vga_ram_size);
-
-#ifdef CONFIG_STUBDOM
- if (!cirrus_vga_enabled)
- xenfb_pv_display_start(s->vram_ptr);
-#endif
+ s->vram_ptr = qemu_malloc(vga_ram_size);
+ s->vram_mfns = NULL;
+ xen_vga_state = s;

s->vram_offset = vga_ram_offset;
s->vram_size = vga_ram_size;
@@ -2051,7 +2152,7 @@ static void vga_init(VGAState *s)
{
int vga_io_memory;

- register_savevm("vga", 0, 3, vga_save, vga_load, s);
+ register_savevm("vga", 0, 4, vga_save, vga_load, s);

register_ioport_write(0x3c0, 16, 1, vga_ioport_write, s);

@@ -2163,33 +2264,6 @@ int pci_vga_init(PCIBus *bus, DisplaySta
return 0;
}

-void *vga_update_vram(VGAState *s, void *vga_ram_base, int vga_ram_size)
-{
- uint8_t *old_pointer;
-
- if (s->vram_size != vga_ram_size) {
- fprintf(stderr, "No support to change vga_ram_size\n");
- return NULL;
- }
-
- if (!vga_ram_base) {
- vga_ram_base = qemu_memalign(TARGET_PAGE_SIZE, vga_ram_size + TARGET_PAGE_SIZE + 1);
- if (!vga_ram_base) {
- fprintf(stderr, "reallocate error\n");
- return NULL;
- }
- }
-
- /* XXX lock needed? */
- old_pointer = s->vram_alloc;
- s->vram_alloc = vga_ram_base;
- vga_ram_base = (uint8_t *)((long)(vga_ram_base + 15) & ~15L);
- memcpy(vga_ram_base, s->vram_ptr, vga_ram_size);
- s->vram_ptr = vga_ram_base;
-
- return old_pointer;
-}
-
/********************************************************/
/* vga screen dump */

diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/ioemu/hw/vga_int.h
--- a/tools/ioemu/hw/vga_int.h Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/ioemu/hw/vga_int.h Mon Sep 01 16:59:43 2008 +0900
@@ -80,9 +80,9 @@
#define VGA_MAX_HEIGHT 2048

#define VGA_STATE_COMMON \
- uint8_t *vram_alloc; \
uint8_t *vram_ptr; \
- uint8_t *vram_shadow; \
+ xen_pfn_t *vram_mfns; \
+ uint64_t stolen_vram_addr; /* Address of stolen RAM */ \
unsigned long vram_offset; \
unsigned int vram_size; \
unsigned long bios_offset; \
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/ioemu/hw/xen_platform.c
--- a/tools/ioemu/hw/xen_platform.c Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/ioemu/hw/xen_platform.c Mon Sep 01 16:59:43 2008 +0900
@@ -34,6 +34,7 @@ typedef struct PCIXenPlatformState
{
PCIDevice pci_dev;
uint8_t platform_flags;
+ uint64_t vga_stolen_ram;
} PCIXenPlatformState;

static uint32_t xen_platform_ioport_readb(void *opaque, uint32_t addr)
@@ -69,11 +70,46 @@ static void xen_platform_ioport_writeb(v
}


+static uint32_t xen_platform_ioport_readl(void *opaque, uint32_t addr)
+{
+ PCIXenPlatformState *d = opaque;
+
+ addr &= 0xff;
+
+ switch (addr) {
+ case 4: /* VGA stolen memory address */
+ return d->vga_stolen_ram;
+ default:
+ return ~0u;
+ }
+}
+
+static void xen_platform_ioport_writel(void *opaque, uint32_t addr, uint32_t val)
+{
+ PCIXenPlatformState *d = opaque;
+
+ addr &= 0xff;
+ val &= 0xffffffff;
+
+ switch (addr) {
+ case 4: /* VGA stolen memory address */
+ d->vga_stolen_ram = val;
+ xen_vga_stolen_vram_addr(val);
+ break;
+ default:
+ break;
+ }
+}
+
+
+
static void platform_ioport_map(PCIDevice *pci_dev, int region_num, uint32_t addr, uint32_t size, int type)
{
PCIXenPlatformState *d = (PCIXenPlatformState *)pci_dev;
register_ioport_write(addr, size, 1, xen_platform_ioport_writeb, d);
+ register_ioport_write(addr, size, 4, xen_platform_ioport_writel, d);
register_ioport_read(addr, size, 1, xen_platform_ioport_readb, d);
+ register_ioport_read(addr, size, 4, xen_platform_ioport_readl, d);
}

static uint32_t platform_mmio_read(void *opaque, target_phys_addr_t addr)
@@ -155,6 +191,7 @@ void xen_pci_save(QEMUFile *f, void *opa

pci_device_save(&d->pci_dev, f);
qemu_put_8s(f, &d->platform_flags);
+ qemu_put_be64s(f, &d->vga_stolen_ram);
}

int xen_pci_load(QEMUFile *f, void *opaque, int version_id)
@@ -173,6 +210,7 @@ int xen_pci_load(QEMUFile *f, void *opaq
uint8_t flags;
qemu_get_8s(f, &flags);
xen_platform_ioport_writeb(d, 0, flags);
+ qemu_get_be64s(f, &d->vga_stolen_ram);
}

return 0;
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/ioemu/vl.c
--- a/tools/ioemu/vl.c Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/ioemu/vl.c Mon Sep 01 16:59:43 2008 +0900
@@ -7022,38 +7022,6 @@ static BOOL WINAPI qemu_ctrl_handler(DWO
#define MAX_NET_CLIENTS 32

#include <xg_private.h>
-
-/* FIXME Flush the shadow page */
-int unset_mm_mapping(int xc_handle, uint32_t domid,
- unsigned long nr_pages, unsigned int address_bits,
- xen_pfn_t *extent_start)
-{
- int err = 0;
-
- err = xc_domain_memory_decrease_reservation(xc_handle, domid,
- nr_pages, 0, extent_start);
- if (err)
- fprintf(stderr, "Failed to decrease physmap\n");
-
- return err;
-}
-
-int set_mm_mapping(int xc_handle, uint32_t domid,
- unsigned long nr_pages, unsigned int address_bits,
- xen_pfn_t *extent_start)
-{
- int err = 0;
-
- err = xc_domain_memory_populate_physmap(
- xc_handle, domid, nr_pages, 0,
- XENMEMF_address_bits(address_bits), extent_start);
- if (err) {
- fprintf(stderr, "Failed to populate physmap\n");
- return -1;
- }
-
- return 0;
-}


int main(int argc, char **argv)
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/ioemu/vl.h
--- a/tools/ioemu/vl.h Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/ioemu/vl.h Mon Sep 01 16:59:43 2008 +0900
@@ -1560,6 +1560,9 @@ void timeoffset_get(void);
/* xen_platform.c */
#ifndef QEMU_TOOL
void pci_xen_platform_init(PCIBus *bus);
+void xen_vga_stolen_vram_addr(uint64_t vram_addr);
+void xen_vga_populate_vram(uint64_t vram_addr);
+void xen_vga_vram_map(uint64_t vram_addr, int copy);
#endif

/* pci_emulation.c */
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/libxc/xc_dom_boot.c
--- a/tools/libxc/xc_dom_boot.c Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/libxc/xc_dom_boot.c Mon Sep 01 16:59:43 2008 +0900
@@ -187,7 +187,7 @@ int xc_dom_boot_image(struct xc_dom_imag
int xc_dom_boot_image(struct xc_dom_image *dom)
{
DECLARE_DOMCTL;
- void *ctxt;
+ vcpu_guest_context_any_t ctxt;
int rc;

xc_dom_printf("%s: called\n", __FUNCTION__);
@@ -245,12 +245,11 @@ int xc_dom_boot_image(struct xc_dom_imag
return rc;

/* let the vm run */
- ctxt = xc_dom_malloc(dom, PAGE_SIZE * 2 /* FIXME */ );
- memset(ctxt, 0, PAGE_SIZE * 2);
- if ( (rc = dom->arch_hooks->vcpu(dom, ctxt)) != 0 )
+ memset(&ctxt, 0, sizeof(ctxt));
+ if ( (rc = dom->arch_hooks->vcpu(dom, &ctxt)) != 0 )
return rc;
xc_dom_unmap_all(dom);
- rc = launch_vm(dom->guest_xc, dom->guest_domid, ctxt);
+ rc = launch_vm(dom->guest_xc, dom->guest_domid, &ctxt);

return rc;
}
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/libxc/xc_domain.c Mon Sep 01 16:59:43 2008 +0900
@@ -531,6 +531,33 @@ int xc_domain_memory_populate_physmap(in
DPRINTF("Failed allocation for dom %d: %ld extents of order %d\n",
domid, nr_extents, extent_order);
errno = EBUSY;
+ err = -1;
+ }
+
+ return err;
+}
+
+int xc_domain_memory_translate_gpfn_list(int xc_handle,
+ uint32_t domid,
+ unsigned long nr_gpfns,
+ xen_pfn_t *gpfn_list,
+ xen_pfn_t *mfn_list)
+{
+ int err;
+ struct xen_translate_gpfn_list translate_gpfn_list = {
+ .domid = domid,
+ .nr_gpfns = nr_gpfns,
+ };
+ set_xen_guest_handle(translate_gpfn_list.gpfn_list, gpfn_list);
+ set_xen_guest_handle(translate_gpfn_list.mfn_list, mfn_list);
+
+ err = xc_memory_op(xc_handle, XENMEM_translate_gpfn_list, &translate_gpfn_list);
+
+ if ( err != 0 )
+ {
+ DPRINTF("Failed translation for dom %d (%ld PFNs)\n",
+ domid, nr_gpfns);
+ errno = -err;
err = -1;
}

diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/libxc/xc_domain_save.c Mon Sep 01 16:59:43 2008 +0900
@@ -1109,12 +1109,6 @@ int xc_domain_save(int xc_handle, int io
if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
(test_bit(n, to_send) && last_iter) ||
(test_bit(n, to_fix) && last_iter)) )
- continue;
-
- /* Skip PFNs that aren't really there */
- if ( hvm && ((n >= 0xa0 && n < 0xc0) /* VGA hole */
- || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT)
- && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ )
continue;

/*
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/libxc/xc_minios.c
--- a/tools/libxc/xc_minios.c Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/libxc/xc_minios.c Mon Sep 01 16:59:43 2008 +0900
@@ -64,7 +64,6 @@ void *xc_map_foreign_range(int xc_handle
unsigned long mfn)
{
unsigned long pt_prot = 0;
- printf("xc_map_foreign_range(%lx, %d)\n", mfn, size);
#ifdef __ia64__
/* TODO */
#else
@@ -81,9 +80,10 @@ void *xc_map_foreign_ranges(int xc_handl
size_t size, int prot, size_t chunksize,
privcmd_mmap_entry_t entries[], int nentries)
{
- unsigned long mfns[size / PAGE_SIZE];
+ unsigned long *mfns;
int i, j, n;
unsigned long pt_prot = 0;
+ void *ret;
#ifdef __ia64__
/* TODO */
#else
@@ -93,12 +93,16 @@ void *xc_map_foreign_ranges(int xc_handl
pt_prot = L1_PROT;
#endif

+ mfns = malloc((size / PAGE_SIZE) * sizeof(*mfns));
+
n = 0;
for (i = 0; i < nentries; i++)
for (j = 0; j < chunksize / PAGE_SIZE; j++)
mfns[n++] = entries[i].mfn + j;

- return map_frames_ex(mfns, n, 1, 0, 1, dom, 0, pt_prot);
+ ret = map_frames_ex(mfns, n, 1, 0, 1, dom, 0, pt_prot);
+ free(mfns);
+ return ret;
}


diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/libxc/xenctrl.h Mon Sep 01 16:59:43 2008 +0900
@@ -628,6 +628,12 @@ int xc_domain_memory_populate_physmap(in
unsigned int mem_flags,
xen_pfn_t *extent_start);

+int xc_domain_memory_translate_gpfn_list(int xc_handle,
+ uint32_t domid,
+ unsigned long nr_gpfns,
+ xen_pfn_t *gpfn_list,
+ xen_pfn_t *mfn_list);
+
int xc_domain_ioport_permission(int xc_handle,
uint32_t domid,
uint32_t first_port,
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/pygrub/src/pygrub
--- a/tools/pygrub/src/pygrub Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/pygrub/src/pygrub Mon Sep 01 16:59:43 2008 +0900
@@ -124,7 +124,7 @@ class GrubLineEditor(curses.textpad.Text
class GrubLineEditor(curses.textpad.Textbox):
def __init__(self, screen, startx, starty, line = ""):
screen.addstr(startx, starty, "> ")
- screen.refresh()
+ screen.noutrefresh()
win = curses.newwin(1, 74, startx, starty + 2)
curses.textpad.Textbox.__init__(self, win)

@@ -137,7 +137,7 @@ class GrubLineEditor(curses.textpad.Text
"""Show the text. One of our advantages over standard textboxes
is that we can handle lines longer than the window."""

- self.win.clear()
+ self.win.erase()
p = self.pos
off = 0
while p > 70:
@@ -188,6 +188,7 @@ class GrubLineEditor(curses.textpad.Text
return 1

def edit(self):
+ curses.doupdate()
r = curses.textpad.Textbox.edit(self)
if self.cancelled:
return None
@@ -217,16 +218,15 @@ class Grub:
curses.def_prog_mode()

curses.reset_prog_mode()
- self.screen.clear()
- self.screen.refresh()
+ self.screen.erase()

# create basic grub screen with a box of entries and a textbox
self.screen.addstr(1, 4, "pyGRUB version %s" %(PYGRUB_VER,))
self.entry_win.box()
- self.screen.refresh()
+ self.screen.noutrefresh()

def fill_entry_list(self):
- self.entry_win.clear()
+ self.entry_win.erase()
self.entry_win.box()

maxy = self.entry_win.getmaxyx()[0]-3 # maxy - 2 for the frame + index
@@ -244,7 +244,7 @@ class Grub:
self.entry_win.addstr(y + 1 - self.start_image, 2, i.title.ljust(70))
if y == self.selected_image:
self.entry_win.attroff(curses.A_REVERSE)
- self.entry_win.refresh()
+ self.entry_win.noutrefresh()

def edit_entry(self, origimg):
def draw():
@@ -259,13 +259,13 @@ class Grub:
self.text_win.addch(0, 14, curses.ACS_DARROW)
(y, x) = self.text_win.getmaxyx()
self.text_win.move(y - 1, x - 1)
- self.text_win.refresh()
+ self.text_win.noutrefresh()

curline = 1
img = copy.deepcopy(origimg)
while 1:
draw()
- self.entry_win.clear()
+ self.entry_win.erase()
self.entry_win.box()
for idx in range(1, len(img.lines)):
# current line should be highlighted
@@ -280,7 +280,8 @@ class Grub:
self.entry_win.addstr(idx, 2, l)
if idx == curline:
self.entry_win.attroff(curses.A_REVERSE)
- self.entry_win.refresh()
+ self.entry_win.noutrefresh()
+ curses.doupdate()

c = self.screen.getch()
if c in (ord('q'), 27): # 27 == esc
@@ -318,10 +319,10 @@ class Grub:
origimg.reset(img.lines)

def edit_line(self, line):
- self.screen.clear()
+ self.screen.erase()
self.screen.addstr(1, 2, "[. Minimal BASH-like line editing is supported. ")
self.screen.addstr(2, 2, " ESC at any time cancels. ENTER at any time accepts your changes. ]")
- self.screen.refresh()
+ self.screen.noutrefresh()

t = GrubLineEditor(self.screen, 5, 2, line)
enable_cursor(True)
@@ -331,10 +332,10 @@ class Grub:
return None

def command_line_mode(self):
- self.screen.clear()
+ self.screen.erase()
self.screen.addstr(1, 2, "[. Minimal BASH-like line editing is supported. ESC at any time ")
self.screen.addstr(2, 2, " exits. Typing 'boot' will boot with your entered commands. ] ")
- self.screen.refresh()
+ self.screen.noutrefresh()

y = 5
lines = []
@@ -420,7 +421,7 @@ class Grub:
self.text_win.addch(0, 14, curses.ACS_DARROW)
(y, x) = self.text_win.getmaxyx()
self.text_win.move(y - 1, x - 1)
- self.text_win.refresh()
+ self.text_win.noutrefresh()

# now loop until we hit the timeout or get a go from the user
mytime = 0
@@ -433,6 +434,7 @@ class Grub:
else:
self.screen.addstr(20, 5, " " * 80)
self.fill_entry_list()
+ curses.doupdate()

c = self.screen.getch()
if c == -1:
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/python/xen/util/pci.py
--- a/tools/python/xen/util/pci.py Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/python/xen/util/pci.py Mon Sep 01 16:59:43 2008 +0900
@@ -40,6 +40,7 @@ DEV_TYPE_PCI_BRIDGE = 2
DEV_TYPE_PCI_BRIDGE = 2
DEV_TYPE_PCI = 3

+PCI_VENDOR_ID = 0x0
PCI_STATUS = 0x6
PCI_CLASS_DEVICE = 0x0a
PCI_CLASS_BRIDGE_PCI = 0x0604
@@ -68,6 +69,11 @@ PCI_PM_CTRL_NO_SOFT_RESET = 0x0004
PCI_PM_CTRL_NO_SOFT_RESET = 0x0004
PCI_PM_CTRL_STATE_MASK = 0x0003
PCI_D3hot = 3
+
+VENDOR_INTEL = 0x8086
+PCI_CAP_ID_VENDOR_SPECIFIC_CAP = 0x09
+PCI_CLASS_ID_USB = 0x0c03
+PCI_USB_FLRCTRL = 0x4

PCI_CAP_ID_AF = 0x13
PCI_AF_CAPs = 0x3
@@ -487,7 +493,7 @@ class PciDevice:
def do_Dstate_transition(self):
pos = self.find_cap_offset(PCI_CAP_ID_PM)
if pos == 0:
- return
+ return False

(pci_list, cfg_list) = save_pci_conf_space([self.name])

@@ -504,6 +510,31 @@ class PciDevice:
time.sleep(0.010)

restore_pci_conf_space((pci_list, cfg_list))
+ return True
+
+ def do_vendor_specific_FLR_method(self):
+ pos = self.find_cap_offset(PCI_CAP_ID_VENDOR_SPECIFIC_CAP)
+ if pos == 0:
+ return
+
+ vendor_id = self.pci_conf_read16(PCI_VENDOR_ID)
+ if vendor_id != VENDOR_INTEL:
+ return
+
+ class_id = self.pci_conf_read16(PCI_CLASS_DEVICE)
+ if class_id != PCI_CLASS_ID_USB:
+ return
+
+ (pci_list, cfg_list) = save_pci_conf_space([self.name])
+
+ self.pci_conf_write8(pos + PCI_USB_FLRCTRL, 1)
+ time.sleep(0.010)
+
+ restore_pci_conf_space((pci_list, cfg_list))
+
+ def do_FLR_for_integrated_device(self):
+ if not self.do_Dstate_transition():
+ self.do_vendor_specific_FLR_method()

def find_all_the_multi_functions(self):
sysfs_mnt = find_sysfs_mnt()
@@ -676,7 +707,7 @@ class PciDevice:
restore_pci_conf_space((pci_list, cfg_list))
else:
if self.bus == 0:
- self.do_Dstate_transition()
+ self.do_FLR_for_integrated_device()
else:
funcs = self.find_all_the_multi_functions()
self.devs_check_driver(funcs)
@@ -697,7 +728,7 @@ class PciDevice:
restore_pci_conf_space((pci_list, cfg_list))
else:
if self.bus == 0:
- self.do_Dstate_transition()
+ self.do_FLR_for_integrated_device()
else:
devs = self.find_coassigned_devices(False)
# Remove the element 0 which is a bridge
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/python/xen/xend/XendConfig.py Mon Sep 01 16:59:43 2008 +0900
@@ -1538,9 +1538,9 @@ class XendConfig(dict):
pci_dev_info[opt] = val
except TypeError:
pass
- # append uuid for each pci device.
- dpci_uuid = pci_dev_info.get('uuid', uuid.createString())
- pci_dev_info['uuid'] = dpci_uuid
+ # append uuid for each pci device.
+ dpci_uuid = pci_dev_info.get('uuid', uuid.createString())
+ pci_dev_info['uuid'] = dpci_uuid
pci_devs.append(pci_dev_info)
dev_config['devs'] = pci_devs

diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/python/xen/xend/XendDomain.py Mon Sep 01 16:59:43 2008 +0900
@@ -419,6 +419,8 @@ class XendDomain:
except VmError:
log.exception("Unable to recreate domain")
try:
+ xc.domain_pause(domid)
+ do_FLR(domid)
xc.domain_destroy(domid)
except:
log.exception("Hard destruction of domain failed: %d" %
@@ -1255,6 +1257,8 @@ class XendDomain:
val = dominfo.destroy()
else:
try:
+ xc.domain_pause(int(domid))
+ do_FLR(int(domid))
val = xc.domain_destroy(int(domid))
except ValueError:
raise XendInvalidDomain(domid)
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/python/xen/xend/XendDomainInfo.py Mon Sep 01 16:59:43 2008 +0900
@@ -287,6 +287,28 @@ def dom_get(dom):
log.trace("domain_getinfo(%d) failed, ignoring: %s", dom, str(err))
return None

+def do_FLR(domid):
+ from xen.xend.server.pciif import parse_pci_name, PciDevice
+ path = '/local/domain/0/backend/pci/%u/0/' % domid
+ num_devs = xstransact.Read(path + 'num_devs');
+ if num_devs is None or num_devs == "":
+ return;
+
+ num_devs = int(xstransact.Read(path + 'num_devs'));
+
+ dev_str_list = []
+ for i in range(num_devs):
+ dev_str = xstransact.Read(path + 'dev-%i' % i)
+ dev_str_list = dev_str_list + [dev_str]
+
+ for dev_str in dev_str_list:
+ (dom, b, d, f) = parse_pci_name(dev_str)
+ try:
+ dev = PciDevice(dom, b, d, f)
+ except Exception, e:
+ raise VmError("pci: failed to locate device and "+
+ "parse it's resources - "+str(e))
+ dev.do_FLR()

class XendDomainInfo:
"""An object represents a domain.
@@ -2386,42 +2408,32 @@ class XendDomainInfo:
if self.domid is None:
return

+ from xen.xend import XendDomain
log.debug("XendDomainInfo.destroy: domid=%s", str(self.domid))

paths = self._prepare_phantom_paths()

self._cleanupVm()
if self.dompath is not None:
- self.destroyDomain()
-
- self._cleanup_phantom_devs(paths)
-
- if "transient" in self.info["other_config"] \
- and bool(self.info["other_config"]["transient"]):
- from xen.xend import XendDomain
- XendDomain.instance().domain_delete_by_dominfo(self)
-
-
- def destroyDomain(self):
- log.debug("XendDomainInfo.destroyDomain(%s)", str(self.domid))
-
- paths = self._prepare_phantom_paths()
-
- try:
- if self.domid is not None:
+ try:
xc.domain_destroy_hook(self.domid)
+ xc.domain_pause(self.domid)
+ do_FLR(self.domid)
xc.domain_destroy(self.domid)
for state in DOM_STATES_OLD:
self.info[state] = 0
self._stateSet(DOM_STATE_HALTED)
- except:
- log.exception("XendDomainInfo.destroy: xc.domain_destroy failed.")
-
- from xen.xend import XendDomain
- XendDomain.instance().remove_domain(self)
-
- self.cleanupDomain()
+ except:
+ log.exception("XendDomainInfo.destroy: domain destruction failed.")
+
+ XendDomain.instance().remove_domain(self)
+ self.cleanupDomain()
+
self._cleanup_phantom_devs(paths)
+
+ if "transient" in self.info["other_config"] \
+ and bool(self.info["other_config"]["transient"]):
+ XendDomain.instance().domain_delete_by_dominfo(self)


def resetDomain(self):
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/python/xen/xend/image.py Mon Sep 01 16:59:43 2008 +0900
@@ -637,8 +637,9 @@ class LinuxImageHandler(ImageHandler):
log.debug("ramdisk = %s", self.ramdisk)
log.debug("vcpus = %d", self.vm.getVCpuCount())
log.debug("features = %s", self.vm.getFeatures())
+ log.debug("flags = %d", self.flags)
if arch.type == "ia64":
- log.debug("vhpt = %d", self.flags)
+ log.debug("vhpt = %d", self.vhpt)

return xc.linux_build(domid = self.vm.getDomid(),
memsize = mem_mb,
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/python/xen/xend/server/DevController.py
--- a/tools/python/xen/xend/server/DevController.py Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/python/xen/xend/server/DevController.py Mon Sep 01 16:59:43 2008 +0900
@@ -223,12 +223,6 @@ class DevController:
raise VmError('%s devices may not be reconfigured' % self.deviceClass)


- def cleanupDeviceOnDomainDestroy(self, devid):
- """ Some devices may need special cleanup when the guest domain
- is destroyed.
- """
- return
-
def destroyDevice(self, devid, force):
"""Destroy the specified device.

@@ -244,8 +238,6 @@ class DevController:
"""

dev = self.convertToDeviceNumber(devid)
-
- self.cleanupDeviceOnDomainDestroy(dev)

# Modify online status /before/ updating state (latter is watched by
# drivers, so this ordering avoids a race).
diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/python/xen/xend/server/pciif.py
--- a/tools/python/xen/xend/server/pciif.py Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/python/xen/xend/server/pciif.py Mon Sep 01 16:59:43 2008 +0900
@@ -383,10 +383,10 @@ class PciController(DevController):
if (dev.dev_type == DEV_TYPE_PCIe_ENDPOINT) and not dev.pcie_flr:
if dev.bus == 0:
# We cope with this case by using the Dstate transition
- # method for now.
+ # method or some vendor specific methods for now.
err_msg = 'pci: %s: it is on bus 0, but has no PCIe' +\
' FLR Capability. Will try the Dstate transition'+\
- ' method if available.'
+ ' method or some vendor specific methods if available.'
log.warn(err_msg % dev.name)
else:
funcs = dev.find_all_the_multi_functions()
@@ -404,10 +404,11 @@ class PciController(DevController):
if dev.bus == 0 or arch.type == "ia64":
if not dev.pci_af_flr:
# We cope with this case by using the Dstate transition
- # method for now.
+ # method or some vendor specific methods for now.
err_msg = 'pci: %s: it is on bus 0, but has no PCI' +\
' Advanced Capabilities for FLR. Will try the'+\
- ' Dstate transition method if available.'
+ ' Dstate transition method or some vendor' +\
+ ' specific methods if available.'
log.warn(err_msg % dev.name)
else:
# All devices behind the uppermost PCI/PCI-X bridge must be\
@@ -543,22 +544,6 @@ class PciController(DevController):

return new_num_devs

- def cleanupDeviceOnDomainDestroy(self, devid):
- num_devs = int(self.readBackend(devid, 'num_devs'))
- dev_str_list = []
- for i in range(num_devs):
- dev_str = self.readBackend(devid, 'dev-%i' % i)
- dev_str_list = dev_str_list + [dev_str]
-
- for dev_str in dev_str_list:
- (dom, b, d, f) = parse_pci_name(dev_str)
- try:
- dev = PciDevice(dom, b, d, f)
- except Exception, e:
- raise VmError("pci: failed to locate device and "+
- "parse it's resources - "+str(e))
- dev.do_FLR()
-
def waitForBackend(self,devid):
return (0, "ok - no hotplug")

diff -r 48db4eee7d58 -r d0a544d8a3f3 tools/xenmon/Makefile
--- a/tools/xenmon/Makefile Mon Aug 25 19:04:37 2008 +0900
+++ b/tools/xenmon/Makefile Mon Sep 01 16:59:43 2008 +0900
@@ -42,6 +42,6 @@ clean:


%: %.c Makefile
- $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $<
+ $(CC) $(CFLAGS) $< $(LDFLAGS) -o $@
xentrace_%: %.c Makefile
- $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $<
+ $(CC) $(CFLAGS) $< $(LDFLAGS) -o $@
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/Makefile
--- a/xen/Makefile Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/Makefile Mon Sep 01 16:59:43 2008 +0900
@@ -1,8 +1,8 @@
# This is the correct place to edit the build version.
# All other places this is stored (eg. compile.h) should be autogenerated.
export XEN_VERSION = 3
-export XEN_SUBVERSION = 3
-export XEN_EXTRAVERSION ?= .0-rc8-pre$(XEN_VENDORVERSION)
+export XEN_SUBVERSION = 4
+export XEN_EXTRAVERSION ?= -unstable$(XEN_VENDORVERSION)
export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION)
-include xen-version

@@ -87,7 +87,7 @@ include/xen/compile.h: include/xen/compi
-e 's/@@whoami@@/$(XEN_WHOAMI)/g' \
-e 's/@@domain@@/$(XEN_DOMAIN)/g' \
-e 's/@@hostname@@/$(shell hostname)/g' \
- -e 's!@@compiler@@!$(shell $(CC) $(CFLAGS) -v 2>&1 | grep -i "gcc.*version")!g' \
+ -e 's!@@compiler@@!$(shell $(CC) $(CFLAGS) -v 2>&1 | tail -1)!g' \
-e 's/@@version@@/$(XEN_VERSION)/g' \
-e 's/@@subversion@@/$(XEN_SUBVERSION)/g' \
-e 's/@@extraversion@@/$(XEN_EXTRAVERSION)/g' \
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/arch/ia64/xen/mm.c
--- a/xen/arch/ia64/xen/mm.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/arch/ia64/xen/mm.c Mon Sep 01 16:59:43 2008 +0900
@@ -2698,6 +2698,20 @@ void put_page_type(struct page_info *pag
}


+static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
+{
+ struct page_info *page = mfn_to_page(page_nr);
+
+ if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
+ {
+ MEM_LOG("Could not get page ref for pfn %lx", page_nr);
+ return 0;
+ }
+
+ return 1;
+}
+
+
int get_page_type(struct page_info *page, u32 type)
{
u64 nx, x, y = page->u.inuse.type_info;
@@ -2792,6 +2806,8 @@ long
long
arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
{
+ struct page_info *page = NULL;
+
switch (op) {
case XENMEM_add_to_physmap:
{
@@ -2836,11 +2852,21 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(

spin_unlock(&d->grant_table->lock);
break;
+ case XENMAPSPACE_mfn:
+ {
+ if ( get_page_from_pagenr(xatp.idx, d) ) {
+ mfn = xatp.idx;
+ page = mfn_to_page(mfn);
+ }
+ break;
+ }
default:
break;
}

if (mfn == 0) {
+ if ( page )
+ put_page(page);
rcu_unlock_domain(d);
return -EINVAL;
}
@@ -2872,11 +2898,53 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(

out:
domain_unlock(d);
-
+
+ if ( page )
+ put_page(page);
+
rcu_unlock_domain(d);

break;
}
+
+ case XENMEM_remove_from_physmap:
+ {
+ struct xen_remove_from_physmap xrfp;
+ unsigned long mfn;
+ struct domain *d;
+
+ if ( copy_from_guest(&xrfp, arg, 1) )
+ return -EFAULT;
+
+ if ( xrfp.domid == DOMID_SELF )
+ {
+ d = rcu_lock_current_domain();
+ }
+ else
+ {
+ if ( (d = rcu_lock_domain_by_id(xrfp.domid)) == NULL )
+ return -ESRCH;
+ if ( !IS_PRIV_FOR(current->domain, d) )
+ {
+ rcu_unlock_domain(d);
+ return -EPERM;
+ }
+ }
+
+ domain_lock(d);
+
+ mfn = gmfn_to_mfn(d, xrfp.gpfn);
+
+ if ( mfn_valid(mfn) )
+ guest_physmap_remove_page(d, xrfp.gpfn, mfn, 0);
+
+ domain_unlock(d);
+
+ rcu_unlock_domain(d);
+
+ break;
+ }
+

case XENMEM_machine_memory_map:
{
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/arch/x86/acpi/power.c
--- a/xen/arch/x86/acpi/power.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/arch/x86/acpi/power.c Mon Sep 01 16:59:43 2008 +0900
@@ -24,6 +24,7 @@
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/console.h>
+#include <xen/iommu.h>
#include <public/platform.h>
#include <asm/tboot.h>

@@ -41,6 +42,8 @@ void do_suspend_lowlevel(void);

static int device_power_down(void)
{
+ iommu_suspend();
+
console_suspend();

time_suspend();
@@ -65,6 +68,8 @@ static void device_power_up(void)
time_resume();

console_resume();
+
+ iommu_resume();
}

static void freeze_domains(void)
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/arch/x86/cpu/amd.c
--- a/xen/arch/x86/cpu/amd.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/arch/x86/cpu/amd.c Mon Sep 01 16:59:43 2008 +0900
@@ -10,8 +10,142 @@
#include <asm/hvm/support.h>

#include "cpu.h"
+#include "amd.h"

int start_svm(struct cpuinfo_x86 *c);
+
+/*
+ * Pre-canned values for overriding the CPUID features
+ * and extended features masks.
+ *
+ * Currently supported processors:
+ *
+ * "fam_0f_rev_c"
+ * "fam_0f_rev_d"
+ * "fam_0f_rev_e"
+ * "fam_0f_rev_f"
+ * "fam_0f_rev_g"
+ * "fam_10_rev_b"
+ * "fam_10_rev_c"
+ * "fam_11_rev_b"
+ */
+static char opt_famrev[14];
+string_param("cpuid_mask_cpu", opt_famrev);
+
+/* Finer-grained CPUID feature control. */
+static unsigned int opt_cpuid_mask_ecx, opt_cpuid_mask_edx;
+integer_param("cpuid_mask_ecx", opt_cpuid_mask_ecx);
+integer_param("cpuid_mask_edx", opt_cpuid_mask_edx);
+static unsigned int opt_cpuid_mask_ext_ecx, opt_cpuid_mask_ext_edx;
+integer_param("cpuid_mask_ext_ecx", opt_cpuid_mask_ext_ecx);
+integer_param("cpuid_mask_ext_edx", opt_cpuid_mask_ext_edx);
+
+static inline void wrmsr_amd(unsigned int index, unsigned int lo,
+ unsigned int hi)
+{
+ asm volatile (
+ "wrmsr"
+ : /* No outputs */
+ : "c" (index), "a" (lo),
+ "d" (hi), "D" (0x9c5a203a)
+ );
+}
+
+/*
+ * Mask the features and extended features returned by CPUID. Parameters are
+ * set from the boot line via two methods:
+ *
+ * 1) Specific processor revision string
+ * 2) User-defined masks
+ *
+ * The processor revision string parameter has precedence.
+ */
+static void __devinit set_cpuidmask(struct cpuinfo_x86 *c)
+{
+ static unsigned int feat_ecx, feat_edx;
+ static unsigned int extfeat_ecx, extfeat_edx;
+ static enum { not_parsed, no_mask, set_mask } status;
+
+ if (status == no_mask)
+ return;
+
+ if (status == set_mask)
+ goto setmask;
+
+ ASSERT((status == not_parsed) && (smp_processor_id() == 0));
+ status = no_mask;
+
+ if (opt_cpuid_mask_ecx | opt_cpuid_mask_edx |
+ opt_cpuid_mask_ext_ecx | opt_cpuid_mask_ext_edx) {
+ feat_ecx = opt_cpuid_mask_ecx ? : ~0U;
+ feat_edx = opt_cpuid_mask_edx ? : ~0U;
+ extfeat_ecx = opt_cpuid_mask_ext_ecx ? : ~0U;
+ extfeat_edx = opt_cpuid_mask_ext_edx ? : ~0U;
+ } else if (*opt_famrev == '\0') {
+ return;
+ } else if (!strcmp(opt_famrev, "fam_0f_rev_c")) {
+ feat_ecx = AMD_FEATURES_K8_REV_C_ECX;
+ feat_edx = AMD_FEATURES_K8_REV_C_EDX;
+ extfeat_ecx = AMD_EXTFEATURES_K8_REV_C_ECX;
+ extfeat_edx = AMD_EXTFEATURES_K8_REV_C_EDX;
+ } else if (!strcmp(opt_famrev, "fam_0f_rev_d")) {
+ feat_ecx = AMD_FEATURES_K8_REV_D_ECX;
+ feat_edx = AMD_FEATURES_K8_REV_D_EDX;
+ extfeat_ecx = AMD_EXTFEATURES_K8_REV_D_ECX;
+ extfeat_edx = AMD_EXTFEATURES_K8_REV_D_EDX;
+ } else if (!strcmp(opt_famrev, "fam_0f_rev_e")) {
+ feat_ecx = AMD_FEATURES_K8_REV_E_ECX;
+ feat_edx = AMD_FEATURES_K8_REV_E_EDX;
+ extfeat_ecx = AMD_EXTFEATURES_K8_REV_E_ECX;
+ extfeat_edx = AMD_EXTFEATURES_K8_REV_E_EDX;
+ } else if (!strcmp(opt_famrev, "fam_0f_rev_f")) {
+ feat_ecx = AMD_FEATURES_K8_REV_F_ECX;
+ feat_edx = AMD_FEATURES_K8_REV_F_EDX;
+ extfeat_ecx = AMD_EXTFEATURES_K8_REV_F_ECX;
+ extfeat_edx = AMD_EXTFEATURES_K8_REV_F_EDX;
+ } else if (!strcmp(opt_famrev, "fam_0f_rev_g")) {
+ feat_ecx = AMD_FEATURES_K8_REV_G_ECX;
+ feat_edx = AMD_FEATURES_K8_REV_G_EDX;
+ extfeat_ecx = AMD_EXTFEATURES_K8_REV_G_ECX;
+ extfeat_edx = AMD_EXTFEATURES_K8_REV_G_EDX;
+ } else if (!strcmp(opt_famrev, "fam_10_rev_b")) {
+ feat_ecx = AMD_FEATURES_FAM10h_REV_B_ECX;
+ feat_edx = AMD_FEATURES_FAM10h_REV_B_EDX;
+ extfeat_ecx = AMD_EXTFEATURES_FAM10h_REV_B_ECX;
+ extfeat_edx = AMD_EXTFEATURES_FAM10h_REV_B_EDX;
+ } else if (!strcmp(opt_famrev, "fam_10_rev_c")) {
+ feat_ecx = AMD_FEATURES_FAM10h_REV_C_ECX;
+ feat_edx = AMD_FEATURES_FAM10h_REV_C_EDX;
+ extfeat_ecx = AMD_EXTFEATURES_FAM10h_REV_C_ECX;
+ extfeat_edx = AMD_EXTFEATURES_FAM10h_REV_C_EDX;
+ } else if (!strcmp(opt_famrev, "fam_11_rev_b")) {
+ feat_ecx = AMD_FEATURES_FAM11h_REV_B_ECX;
+ feat_edx = AMD_FEATURES_FAM11h_REV_B_EDX;
+ extfeat_ecx = AMD_EXTFEATURES_FAM11h_REV_B_ECX;
+ extfeat_edx = AMD_EXTFEATURES_FAM11h_REV_B_EDX;
+ } else {
+ printk("Invalid processor string: %s\n", opt_famrev);
+ printk("CPUID will not be masked\n");
+ return;
+ }
+
+ status = set_mask;
+ printk("Writing CPUID feature mask ECX:EDX -> %08Xh:%08Xh\n",
+ feat_ecx, feat_edx);
+ printk("Writing CPUID extended feature mask ECX:EDX -> %08Xh:%08Xh\n",
+ extfeat_ecx, extfeat_edx);
+
+ setmask:
+ /* FIXME check if processor supports CPUID masking */
+ /* AMD processors prior to family 10h required a 32-bit password */
+ if (c->x86 >= 0x10) {
+ wrmsr(MSR_K8_FEATURE_MASK, feat_edx, feat_ecx);
+ wrmsr(MSR_K8_EXT_FEATURE_MASK, extfeat_edx, extfeat_ecx);
+ } else if (c->x86 == 0x0f) {
+ wrmsr_amd(MSR_K8_FEATURE_MASK, feat_edx, feat_ecx);
+ wrmsr_amd(MSR_K8_EXT_FEATURE_MASK, extfeat_edx, extfeat_ecx);
+ }
+}

/*
* amd_flush_filter={on,off}. Forcibly Enable or disable the TLB flush
@@ -115,7 +249,7 @@ static void check_disable_c1e(unsigned i
on_each_cpu(disable_c1e, NULL, 1, 1);
}

-static void __init init_amd(struct cpuinfo_x86 *c)
+static void __devinit init_amd(struct cpuinfo_x86 *c)
{
u32 l, h;
int mbytes = num_physpages >> (20-PAGE_SHIFT);
@@ -367,6 +501,8 @@ static void __init init_amd(struct cpuin
/* Prevent TSC drift in non single-processor, single-core platforms. */
if ((smp_processor_id() == 1) && c1_ramping_may_cause_clock_drift(c))
disable_c1_ramping();
+
+ set_cpuidmask(c);

start_svm(c);
}
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/arch/x86/cpu/amd.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/amd.h Mon Sep 01 16:59:43 2008 +0900
@@ -0,0 +1,103 @@
+/*
+ * amd.h - AMD processor specific definitions
+ */
+
+#ifndef __AMD_H__
+#define __AMD_H__
+
+#include <asm/cpufeature.h>
+
+/* CPUID masked for use by AMD-V Extended Migration */
+
+#define X86_FEATURE_BITPOS(_feature_) ((_feature_) % 32)
+#define __bit(_x_) (1U << X86_FEATURE_BITPOS(_x_))
+
+/* Family 0Fh, Revision C */
+#define AMD_FEATURES_K8_REV_C_ECX 0
+#define AMD_FEATURES_K8_REV_C_EDX ( \
+ __bit(X86_FEATURE_FPU) | __bit(X86_FEATURE_VME) | \
+ __bit(X86_FEATURE_DE) | __bit(X86_FEATURE_PSE) | \
+ __bit(X86_FEATURE_TSC) | __bit(X86_FEATURE_MSR) | \
+ __bit(X86_FEATURE_PAE) | __bit(X86_FEATURE_MCE) | \
+ __bit(X86_FEATURE_CX8) | __bit(X86_FEATURE_APIC) | \
+ __bit(X86_FEATURE_SEP) | __bit(X86_FEATURE_MTRR) | \
+ __bit(X86_FEATURE_PGE) | __bit(X86_FEATURE_MCA) | \
+ __bit(X86_FEATURE_CMOV) | __bit(X86_FEATURE_PAT) | \
+ __bit(X86_FEATURE_PSE36) | __bit(X86_FEATURE_CLFLSH)| \
+ __bit(X86_FEATURE_MMX) | __bit(X86_FEATURE_FXSR) | \
+ __bit(X86_FEATURE_XMM) | __bit(X86_FEATURE_XMM2))
+#define AMD_EXTFEATURES_K8_REV_C_ECX 0
+#define AMD_EXTFEATURES_K8_REV_C_EDX ( \
+ __bit(X86_FEATURE_FPU) | __bit(X86_FEATURE_VME) | \
+ __bit(X86_FEATURE_DE) | __bit(X86_FEATURE_PSE) | \
+ __bit(X86_FEATURE_TSC) | __bit(X86_FEATURE_MSR) | \
+ __bit(X86_FEATURE_PAE) | __bit(X86_FEATURE_MCE) | \
+ __bit(X86_FEATURE_CX8) | __bit(X86_FEATURE_APIC) | \
+ __bit(X86_FEATURE_SYSCALL) | __bit(X86_FEATURE_MTRR) | \
+ __bit(X86_FEATURE_PGE) | __bit(X86_FEATURE_MCA) | \
+ __bit(X86_FEATURE_CMOV) | __bit(X86_FEATURE_PAT) | \
+ __bit(X86_FEATURE_PSE36) | __bit(X86_FEATURE_NX) | \
+ __bit(X86_FEATURE_MMXEXT) | __bit(X86_FEATURE_MMX) | \
+ __bit(X86_FEATURE_FXSR) | __bit(X86_FEATURE_LM) | \
+ __bit(X86_FEATURE_3DNOWEXT) | __bit(X86_FEATURE_3DNOW))
+
+/* Family 0Fh, Revision D */
+#define AMD_FEATURES_K8_REV_D_ECX AMD_FEATURES_K8_REV_C_ECX
+#define AMD_FEATURES_K8_REV_D_EDX AMD_FEATURES_K8_REV_C_EDX
+#define AMD_EXTFEATURES_K8_REV_D_ECX (AMD_EXTFEATURES_K8_REV_C_ECX |\
+ __bit(X86_FEATURE_LAHF_LM))
+#define AMD_EXTFEATURES_K8_REV_D_EDX (AMD_EXTFEATURES_K8_REV_C_EDX |\
+ __bit(X86_FEATURE_FFXSR))
+
+/* Family 0Fh, Revision E */
+#define AMD_FEATURES_K8_REV_E_ECX (AMD_FEATURES_K8_REV_D_ECX | \
+ __bit(X86_FEATURE_XMM3))
+#define AMD_FEATURES_K8_REV_E_EDX (AMD_FEATURES_K8_REV_D_EDX | \
+ __bit(X86_FEATURE_HT))
+#define AMD_EXTFEATURES_K8_REV_E_ECX (AMD_EXTFEATURES_K8_REV_D_ECX |\
+ __bit(X86_FEATURE_CMP_LEGACY))
+#define AMD_EXTFEATURES_K8_REV_E_EDX AMD_EXTFEATURES_K8_REV_D_EDX
+
+/* Family 0Fh, Revision F */
+#define AMD_FEATURES_K8_REV_F_ECX (AMD_FEATURES_K8_REV_E_ECX | \
+ __bit(X86_FEATURE_CX16))
+#define AMD_FEATURES_K8_REV_F_EDX AMD_FEATURES_K8_REV_E_EDX
+#define AMD_EXTFEATURES_K8_REV_F_ECX (AMD_EXTFEATURES_K8_REV_E_ECX |\
+ __bit(X86_FEATURE_SVME) | __bit(X86_FEATURE_EXTAPICSPACE) | \
+ __bit(X86_FEATURE_ALTMOVCR))
+#define AMD_EXTFEATURES_K8_REV_F_EDX (AMD_EXTFEATURES_K8_REV_E_EDX |\
+ __bit(X86_FEATURE_RDTSCP))
+
+/* Family 0Fh, Revision G */
+#define AMD_FEATURES_K8_REV_G_ECX AMD_FEATURES_K8_REV_F_ECX
+#define AMD_FEATURES_K8_REV_G_EDX AMD_FEATURES_K8_REV_F_EDX
+#define AMD_EXTFEATURES_K8_REV_G_ECX (AMD_EXTFEATURES_K8_REV_F_ECX |\
+ __bit(X86_FEATURE_3DNOWPF))
+#define AMD_EXTFEATURES_K8_REV_G_EDX AMD_EXTFEATURES_K8_REV_F_EDX
+
+/* Family 10h, Revision B */
+#define AMD_FEATURES_FAM10h_REV_B_ECX (AMD_FEATURES_K8_REV_F_ECX | \
+ __bit(X86_FEATURE_POPCNT) | __bit(X86_FEATURE_MWAIT))
+#define AMD_FEATURES_FAM10h_REV_B_EDX AMD_FEATURES_K8_REV_F_EDX
+#define AMD_EXTFEATURES_FAM10h_REV_B_ECX (AMD_EXTFEATURES_K8_REV_F_ECX |\
+ __bit(X86_FEATURE_ABM) | __bit(X86_FEATURE_SSE4A) | \
+ __bit(X86_FEATURE_MISALIGNSSE) | __bit(X86_FEATURE_OSVW) | \
+ __bit(X86_FEATURE_IBS))
+#define AMD_EXTFEATURES_FAM10h_REV_B_EDX (AMD_EXTFEATURES_K8_REV_F_EDX |\
+ __bit(X86_FEATURE_PAGE1GB))
+
+/* Family 10h, Revision C */
+#define AMD_FEATURES_FAM10h_REV_C_ECX AMD_FEATURES_FAM10h_REV_B_ECX
+#define AMD_FEATURES_FAM10h_REV_C_EDX AMD_FEATURES_FAM10h_REV_B_EDX
+#define AMD_EXTFEATURES_FAM10h_REV_C_ECX (AMD_EXTFEATURES_FAM10h_REV_B_ECX |\
+ __bit(X86_FEATURE_SKINIT) | __bit(X86_FEATURE_WDT))
+#define AMD_EXTFEATURES_FAM10h_REV_C_EDX AMD_EXTFEATURES_FAM10h_REV_B_EDX
+
+/* Family 11h, Revision B */
+#define AMD_FEATURES_FAM11h_REV_B_ECX AMD_FEATURES_K8_REV_G_ECX
+#define AMD_FEATURES_FAM11h_REV_B_EDX AMD_FEATURES_K8_REV_G_EDX
+#define AMD_EXTFEATURES_FAM11h_REV_B_ECX (AMD_EXTFEATURES_K8_REV_G_ECX |\
+ __bit(X86_FEATURE_SKINIT))
+#define AMD_EXTFEATURES_FAM11h_REV_B_EDX AMD_EXTFEATURES_K8_REV_G_EDX
+
+#endif /* __AMD_H__ */
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/arch/x86/hvm/emulate.c
--- a/xen/arch/x86/hvm/emulate.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/arch/x86/hvm/emulate.c Mon Sep 01 16:59:43 2008 +0900
@@ -571,11 +571,12 @@ static int hvmemul_rep_movs(
{
struct hvm_emulate_ctxt *hvmemul_ctxt =
container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
- unsigned long saddr, daddr;
+ unsigned long saddr, daddr, bytes;
paddr_t sgpa, dgpa;
uint32_t pfec = PFEC_page_present;
p2m_type_t p2mt;
- int rc;
+ int rc, df = !!(ctxt->regs->eflags & X86_EFLAGS_DF);
+ char *buf;

rc = hvmemul_virtual_to_linear(
src_seg, src_offset, bytes_per_rep, reps, hvm_access_read,
@@ -606,15 +607,56 @@ static int hvmemul_rep_movs(
(void)gfn_to_mfn_current(sgpa >> PAGE_SHIFT, &p2mt);
if ( !p2m_is_ram(p2mt) )
return hvmemul_do_mmio(
- sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ,
- !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
+ sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ, df, NULL);

(void)gfn_to_mfn_current(dgpa >> PAGE_SHIFT, &p2mt);
- if ( p2m_is_ram(p2mt) )
+ if ( !p2m_is_ram(p2mt) )
+ return hvmemul_do_mmio(
+ dgpa, reps, bytes_per_rep, sgpa, IOREQ_WRITE, df, NULL);
+
+ /* RAM-to-RAM copy: emulate as equivalent of memmove(dgpa, sgpa, bytes). */
+ bytes = *reps * bytes_per_rep;
+
+ /* Adjust source address for reverse copy. */
+ if ( df )
+ sgpa -= bytes - bytes_per_rep;
+
+ /*
+ * Will first iteration copy fall within source range? If not then entire
+ * copy does not corrupt itself. If so, then this is more complex than
+ * can be emulated by a source-to-buffer-to-destination block copy.
+ */
+ if ( ((dgpa + bytes_per_rep) > sgpa) && (dgpa < (sgpa + bytes)) )
return X86EMUL_UNHANDLEABLE;
- return hvmemul_do_mmio(
- dgpa, reps, bytes_per_rep, sgpa, IOREQ_WRITE,
- !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
+
+ /* Adjust destination address for reverse copy. */
+ if ( df )
+ dgpa -= bytes - bytes_per_rep;
+
+ /* Allocate temporary buffer. Fall back to slow emulation if this fails. */
+ buf = xmalloc_bytes(bytes);
+ if ( buf == NULL )
+ return X86EMUL_UNHANDLEABLE;
+
+ /*
+ * We do a modicum of checking here, just for paranoia's sake and to
+ * definitely avoid copying an uninitialised buffer into guest address space.
+ */
+ rc = hvm_copy_from_guest_phys(buf, sgpa, bytes);
+ if ( rc == HVMCOPY_okay )
+ rc = hvm_copy_to_guest_phys(dgpa, buf, bytes);
+
+ xfree(buf);
+
+ if ( rc != HVMCOPY_okay )
+ {
+ gdprintk(XENLOG_WARNING, "Failed memory-to-memory REP MOVS: sgpa=%"
+ PRIpaddr" dgpa=%"PRIpaddr" reps=%lu bytes_per_rep=%u\n",
+ sgpa, dgpa, *reps, bytes_per_rep);
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ return X86EMUL_OKAY;
}

static int hvmemul_read_segment(
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/arch/x86/irq.c
--- a/xen/arch/x86/irq.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/arch/x86/irq.c Mon Sep 01 16:59:43 2008 +0900
@@ -63,7 +63,9 @@ asmlinkage void do_IRQ(struct cpu_user_r

if ( likely(desc->status & IRQ_GUEST) )
{
+ irq_enter();
__do_IRQ_guest(vector);
+ irq_exit();
spin_unlock(&desc->lock);
return;
}
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/arch/x86/microcode.c
--- a/xen/arch/x86/microcode.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/arch/x86/microcode.c Mon Sep 01 16:59:43 2008 +0900
@@ -124,7 +124,7 @@ static DEFINE_SPINLOCK(microcode_update_
/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
static DEFINE_MUTEX(microcode_mutex);

-static void __user *user_buffer; /* user area microcode data buffer */
+static const void __user *user_buffer; /* user area microcode data buffer */
static unsigned int user_buffer_size; /* it's size */

typedef enum mc_error_code {
@@ -455,7 +455,7 @@ out:
return error;
}

-int microcode_update(XEN_GUEST_HANDLE(void) buf, unsigned long len)
+int microcode_update(XEN_GUEST_HANDLE(const_void) buf, unsigned long len)
{
int ret;

diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/arch/x86/mm.c Mon Sep 01 16:59:43 2008 +0900
@@ -3339,6 +3339,7 @@ DEFINE_XEN_GUEST_HANDLE(e820entry_t);

long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
{
+ struct page_info *page = NULL;
switch ( op )
{
case XENMEM_add_to_physmap:
@@ -3389,12 +3390,22 @@ long arch_memory_op(int op, XEN_GUEST_HA

spin_unlock(&d->grant_table->lock);
break;
+ case XENMAPSPACE_mfn:
+ {
+ if ( get_page_from_pagenr(xatp.idx, d) ) {
+ mfn = xatp.idx;
+ page = mfn_to_page(mfn);
+ }
+ break;
+ }
default:
break;
}

if ( !paging_mode_translate(d) || (mfn == 0) )
{
+ if ( page )
+ put_page(page);
rcu_unlock_domain(d);
return -EINVAL;
}
@@ -3420,6 +3431,53 @@ long arch_memory_op(int op, XEN_GUEST_HA

/* Map at new location. */
guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
+
+ domain_unlock(d);
+
+ if ( page )
+ put_page(page);
+
+ rcu_unlock_domain(d);
+
+ break;
+ }
+
+ case XENMEM_remove_from_physmap:
+ {
+ struct xen_remove_from_physmap xrfp;
+ unsigned long mfn;
+ struct domain *d;
+
+ if ( copy_from_guest(&xrfp, arg, 1) )
+ return -EFAULT;
+
+ if ( xrfp.domid == DOMID_SELF )
+ {
+ d = rcu_lock_current_domain();
+ }
+ else
+ {
+ if ( (d = rcu_lock_domain_by_id(xrfp.domid)) == NULL )
+ return -ESRCH;
+ if ( !IS_PRIV_FOR(current->domain, d) )
+ {
+ rcu_unlock_domain(d);
+ return -EPERM;
+ }
+ }
+
+ if ( xsm_remove_from_physmap(current->domain, d) )
+ {
+ rcu_unlock_domain(d);
+ return -EPERM;
+ }
+
+ domain_lock(d);
+
+ mfn = gmfn_to_mfn(d, xrfp.gpfn);
+
+ if ( mfn_valid(mfn) )
+ guest_physmap_remove_page(d, xrfp.gpfn, mfn, 0);

domain_unlock(d);

diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/arch/x86/platform_hypercall.c Mon Sep 01 16:59:43 2008 +0900
@@ -147,8 +147,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe

case XENPF_microcode_update:
{
- extern int microcode_update(XEN_GUEST_HANDLE(void), unsigned long len);
- XEN_GUEST_HANDLE(void) data;
+ XEN_GUEST_HANDLE(const_void) data;

ret = xsm_microcode();
if ( ret )
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/arch/x86/time.c
--- a/xen/arch/x86/time.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/arch/x86/time.c Mon Sep 01 16:59:43 2008 +0900
@@ -840,12 +840,11 @@ struct cpu_calibration {
u64 local_tsc_stamp;
s_time_t stime_local_stamp;
s_time_t stime_master_stamp;
- struct timer softirq_callback;
};
static DEFINE_PER_CPU(struct cpu_calibration, cpu_calibration);

/* Softirq handler for per-CPU time calibration. */
-static void local_time_calibration(void *unused)
+static void local_time_calibration(void)
{
struct cpu_time *t = &this_cpu(cpu_time);
struct cpu_calibration *c = &this_cpu(cpu_calibration);
@@ -1004,13 +1003,12 @@ static void time_calibration_rendezvous(
struct cpu_calibration *c = &this_cpu(cpu_calibration);
struct calibration_rendezvous *r = _r;

- local_irq_disable();
-
if ( smp_processor_id() == 0 )
{
while ( atomic_read(&r->nr_cpus) != (total_cpus - 1) )
cpu_relax();
r->master_stime = read_platform_stime();
+ mb(); /* write r->master_stime /then/ signal */
atomic_inc(&r->nr_cpus);
}
else
@@ -1018,16 +1016,14 @@ static void time_calibration_rendezvous(
atomic_inc(&r->nr_cpus);
while ( atomic_read(&r->nr_cpus) != total_cpus )
cpu_relax();
+ mb(); /* receive signal /then/ read r->master_stime */
}

rdtscll(c->local_tsc_stamp);
c->stime_local_stamp = get_s_time();
c->stime_master_stamp = r->master_stime;

- local_irq_enable();
-
- /* Callback in softirq context as soon as possible. */
- set_timer(&c->softirq_callback, c->stime_local_stamp);
+ raise_softirq(TIME_CALIBRATE_SOFTIRQ);
}

static void time_calibration(void *unused)
@@ -1036,6 +1032,7 @@ static void time_calibration(void *unuse
.nr_cpus = ATOMIC_INIT(0)
};

+ /* @wait=1 because we must wait for all cpus before freeing @r. */
on_each_cpu(time_calibration_rendezvous, &r, 0, 1);
}

@@ -1053,9 +1050,6 @@ void init_percpu_time(void)
t->stime_master_stamp = now;
t->stime_local_stamp = now;

- init_timer(&this_cpu(cpu_calibration).softirq_callback,
- local_time_calibration, NULL, smp_processor_id());
-
if ( smp_processor_id() == 0 )
{
init_timer(&calibration_timer, time_calibration, NULL, 0);
@@ -1072,6 +1066,8 @@ int __init init_xen_time(void)
this is a new feature introduced by Nehalem*/
if ( cpuid_edx(0x80000007) & (1u<<8) )
tsc_invariant = 1;
+
+ open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);

init_percpu_time();

@@ -1180,7 +1176,7 @@ int time_suspend(void)
}

/* Better to cancel calibration timer for accuracy. */
- kill_timer(&this_cpu(cpu_calibration).softirq_callback);
+ clear_bit(TIME_CALIBRATE_SOFTIRQ, &softirq_pending(smp_processor_id()));

return 0;
}
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/arch/x86/x86_64/compat/mm.c
--- a/xen/arch/x86/x86_64/compat/mm.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/arch/x86/x86_64/compat/mm.c Mon Sep 01 16:59:43 2008 +0900
@@ -64,6 +64,20 @@ int compat_arch_memory_op(int op, XEN_GU
return -EFAULT;

XLAT_add_to_physmap(nat, &cmp);
+ rc = arch_memory_op(op, guest_handle_from_ptr(nat, void));
+
+ break;
+ }
+
+ case XENMEM_remove_from_physmap:
+ {
+ struct compat_remove_from_physmap cmp;
+ struct xen_remove_from_physmap *nat = (void *)COMPAT_ARG_XLAT_VIRT_BASE;
+
+ if ( copy_from_guest(&cmp, arg, 1) )
+ return -EFAULT;
+
+ XLAT_remove_from_physmap(nat, &cmp);
rc = arch_memory_op(op, guest_handle_from_ptr(nat, void));

break;
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/common/softirq.c
--- a/xen/common/softirq.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/common/softirq.c Mon Sep 01 16:59:43 2008 +0900
@@ -49,6 +49,7 @@ asmlinkage void do_softirq(void)

void open_softirq(int nr, softirq_handler handler)
{
+ ASSERT(nr < NR_SOFTIRQS);
softirq_handlers[nr] = handler;
}

diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/common/timer.c
--- a/xen/common/timer.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/common/timer.c Mon Sep 01 16:59:43 2008 +0900
@@ -30,6 +30,7 @@ struct timers {
struct timers {
spinlock_t lock;
struct timer **heap;
+ struct timer *list;
struct timer *running;
} __cacheline_aligned;

@@ -86,12 +87,10 @@ static void up_heap(struct timer **heap,


/* Delete @t from @heap. Return TRUE if new top of heap. */
-static int remove_entry(struct timer **heap, struct timer *t)
+static int remove_from_heap(struct timer **heap, struct timer *t)
{
int sz = GET_HEAP_SIZE(heap);
int pos = t->heap_offset;
-
- t->heap_offset = 0;

if ( unlikely(pos == sz) )
{
@@ -115,7 +114,7 @@ static int remove_entry(struct timer **h


/* Add new entry @t to @heap. Return TRUE if new top of heap. */
-static int add_entry(struct timer ***pheap, struct timer *t)
+static int add_to_heap(struct timer ***pheap, struct timer *t)
{
struct timer **heap = *pheap;
int sz = GET_HEAP_SIZE(heap);
@@ -126,8 +125,11 @@ static int add_entry(struct timer ***phe
/* old_limit == (2^n)-1; new_limit == (2^(n+4))-1 */
int old_limit = GET_HEAP_LIMIT(heap);
int new_limit = ((old_limit + 1) << 4) - 1;
+ if ( in_irq() )
+ goto out;
heap = xmalloc_array(struct timer *, new_limit + 1);
- BUG_ON(heap == NULL);
+ if ( heap == NULL )
+ goto out;
memcpy(heap, *pheap, (old_limit + 1) * sizeof(*heap));
SET_HEAP_LIMIT(heap, new_limit);
if ( old_limit != 0 )
@@ -139,7 +141,38 @@ static int add_entry(struct timer ***phe
heap[sz] = t;
t->heap_offset = sz;
up_heap(heap, sz);
+ out:
return (t->heap_offset == 1);
+}
+
+
+/****************************************************************************
+ * LINKED LIST OPERATIONS.
+ */
+
+static int remove_from_list(struct timer **pprev, struct timer *t)
+{
+ struct timer *curr, **_pprev = pprev;
+
+ while ( (curr = *_pprev) != t )
+ _pprev = &curr->list_next;
+
+ *_pprev = t->list_next;
+
+ return (_pprev == pprev);
+}
+
+static int add_to_list(struct timer **pprev, struct timer *t)
+{
+ struct timer *curr, **_pprev = pprev;
+
+ while ( ((curr = *_pprev) != NULL) && (curr->expires <= t->expires) )
+ _pprev = &curr->list_next;
+
+ t->list_next = curr;
+ *_pprev = t;
+
+ return (_pprev == pprev);
}


@@ -147,18 +180,56 @@ static int add_entry(struct timer ***phe
* TIMER OPERATIONS.
*/

+static int remove_entry(struct timers *timers, struct timer *t)
+{
+ int rc;
+
+ switch ( t->status )
+ {
+ case TIMER_STATUS_in_heap:
+ rc = remove_from_heap(timers->heap, t);
+ break;
+ case TIMER_STATUS_in_list:
+ rc = remove_from_list(&timers->list, t);
+ break;
+ default:
+ rc = 0;
+ BUG();
+ }
+
+ t->status = TIMER_STATUS_inactive;
+ return rc;
+}
+
+static int add_entry(struct timers *timers, struct timer *t)
+{
+ int rc;
+
+ ASSERT(t->status == TIMER_STATUS_inactive);
+
+ /* Try to add to heap. t->heap_offset indicates whether we succeed. */
+ t->heap_offset = 0;
+ t->status = TIMER_STATUS_in_heap;
+ rc = add_to_heap(&timers->heap, t);
+ if ( t->heap_offset != 0 )
+ return rc;
+
+ /* Fall back to adding to the slower linked list. */
+ t->status = TIMER_STATUS_in_list;
+ return add_to_list(&timers->list, t);
+}
+
static inline void __add_timer(struct timer *timer)
{
int cpu = timer->cpu;
- if ( add_entry(&per_cpu(timers, cpu).heap, timer) )
+ if ( add_entry(&per_cpu(timers, cpu), timer) )
cpu_raise_softirq(cpu, TIMER_SOFTIRQ);
}

-
static inline void __stop_timer(struct timer *timer)
{
int cpu = timer->cpu;
- if ( remove_entry(per_cpu(timers, cpu).heap, timer) )
+ if ( remove_entry(&per_cpu(timers, cpu), timer) )
cpu_raise_softirq(cpu, TIMER_SOFTIRQ);
}

@@ -203,7 +274,7 @@ void set_timer(struct timer *timer, s_ti

timer->expires = expires;

- if ( likely(!timer->killed) )
+ if ( likely(timer->status != TIMER_STATUS_killed) )
__add_timer(timer);

timer_unlock_irqrestore(timer, flags);
@@ -278,7 +349,7 @@ void kill_timer(struct timer *timer)

if ( active_timer(timer) )
__stop_timer(timer);
- timer->killed = 1;
+ timer->status = TIMER_STATUS_killed;

timer_unlock_irqrestore(timer, flags);

@@ -290,43 +361,76 @@ void kill_timer(struct timer *timer)

static void timer_softirq_action(void)
{
- struct timer *t, **heap;
+ struct timer *t, **heap, *next;
struct timers *ts;
- s_time_t now;
+ s_time_t now, deadline;
void (*fn)(void *);
void *data;

ts = &this_cpu(timers);

spin_lock_irq(&ts->lock);
+
+ /* Try to move timers from overflow linked list to more efficient heap. */
+ next = ts->list;
+ ts->list = NULL;
+ while ( unlikely((t = next) != NULL) )
+ {
+ next = t->list_next;
+ t->status = TIMER_STATUS_inactive;
+ add_entry(ts, t);
+ }

- do {
+ heap = ts->heap;
+ now = NOW();
+
+ while ( (GET_HEAP_SIZE(heap) != 0) &&
+ ((t = heap[1])->expires < (now + TIMER_SLOP)) )
+ {
+ remove_entry(ts, t);
+
+ ts->running = t;
+
+ fn = t->function;
+ data = t->data;
+
+ spin_unlock_irq(&ts->lock);
+ (*fn)(data);
+ spin_lock_irq(&ts->lock);
+
+ /* Heap may have grown while the lock was released. */
heap = ts->heap;
- now = NOW();
-
- while ( (GET_HEAP_SIZE(heap) != 0) &&
- ((t = heap[1])->expires < (now + TIMER_SLOP)) )
+ }
+
+ deadline = GET_HEAP_SIZE(heap) ? heap[1]->expires : 0;
+
+ while ( unlikely((t = ts->list) != NULL) )
+ {
+ if ( t->expires >= (now + TIMER_SLOP) )
{
- remove_entry(heap, t);
-
- ts->running = t;
-
- fn = t->function;
- data = t->data;
-
- spin_unlock_irq(&ts->lock);
- (*fn)(data);
- spin_lock_irq(&ts->lock);
-
- /* Heap may have grown while the lock was released. */
- heap = ts->heap;
+ if ( (deadline == 0) || (deadline > t->expires) )
+ deadline = t->expires;
+ break;
}

- ts->running = NULL;
-
- this_cpu(timer_deadline) = GET_HEAP_SIZE(heap) ? heap[1]->expires : 0;
- }
- while ( !reprogram_timer(this_cpu(timer_deadline)) );
+ ts->list = t->list_next;
+ t->status = TIMER_STATUS_inactive;
+
+ ts->running = t;
+
+ fn = t->function;
+ data = t->data;
+
+ spin_unlock_irq(&ts->lock);
+ (*fn)(data);
+ spin_lock_irq(&ts->lock);
+ }
+
+ ts->running = NULL;
+
+ this_cpu(timer_deadline) = deadline;
+ if ( !reprogram_timer(deadline) )
+ raise_softirq(TIMER_SOFTIRQ);

spin_unlock_irq(&ts->lock);
}
@@ -364,6 +468,9 @@ static void dump_timerq(unsigned char ke
printk (" %d : %p ex=0x%08X%08X %p\n",
j, t, (u32)(t->expires>>32), (u32)t->expires, t->data);
}
+ for ( t = ts->list, j = 0; t != NULL; t = t->list_next, j++ )
+ printk (" L%d : %p ex=0x%08X%08X %p\n",
+ j, t, (u32)(t->expires>>32), (u32)t->expires, t->data);
spin_unlock_irqrestore(&ts->lock, flags);
printk("\n");
}
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/common/xmalloc.c
--- a/xen/common/xmalloc.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/common/xmalloc.c Mon Sep 01 16:59:43 2008 +0900
@@ -136,15 +136,14 @@ static void *xmalloc_new_page(size_t siz
static void *xmalloc_new_page(size_t size)
{
struct xmalloc_hdr *hdr;
- unsigned long flags;

hdr = alloc_xenheap_page();
if ( hdr == NULL )
return NULL;

- spin_lock_irqsave(&freelist_lock, flags);
+ spin_lock(&freelist_lock);
maybe_split(hdr, size, PAGE_SIZE);
- spin_unlock_irqrestore(&freelist_lock, flags);
+ spin_unlock(&freelist_lock);

return data_from_header(hdr);
}
@@ -175,7 +174,6 @@ void *_xmalloc(size_t size, size_t align
void *_xmalloc(size_t size, size_t align)
{
struct xmalloc_hdr *i;
- unsigned long flags;

ASSERT(!in_irq());

@@ -196,17 +194,17 @@ void *_xmalloc(size_t size, size_t align
return xmalloc_whole_pages(size);

/* Search free list. */
- spin_lock_irqsave(&freelist_lock, flags);
+ spin_lock(&freelist_lock);
list_for_each_entry( i, &freelist, freelist )
{
if ( i->size < size )
continue;
del_from_freelist(i);
maybe_split(i, size, i->size);
- spin_unlock_irqrestore(&freelist_lock, flags);
+ spin_unlock(&freelist_lock);
return data_from_header(i);
}
- spin_unlock_irqrestore(&freelist_lock, flags);
+ spin_unlock(&freelist_lock);

/* Alloc a new page and return from that. */
return xmalloc_new_page(size);
@@ -214,7 +212,6 @@ void *_xmalloc(size_t size, size_t align

void xfree(void *p)
{
- unsigned long flags;
struct xmalloc_hdr *i, *tmp, *hdr;

ASSERT(!in_irq());
@@ -238,7 +235,7 @@ void xfree(void *p)
}

/* Merge with other free block, or put in list. */
- spin_lock_irqsave(&freelist_lock, flags);
+ spin_lock(&freelist_lock);
list_for_each_entry_safe( i, tmp, &freelist, freelist )
{
unsigned long _i = (unsigned long)i;
@@ -275,7 +272,7 @@ void xfree(void *p)
add_to_freelist(hdr);
}

- spin_unlock_irqrestore(&freelist_lock, flags);
+ spin_unlock(&freelist_lock);
}

/*
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/drivers/passthrough/vtd/intremap.c
--- a/xen/drivers/passthrough/vtd/intremap.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/drivers/passthrough/vtd/intremap.c Mon Sep 01 16:59:43 2008 +0900
@@ -43,7 +43,7 @@ u16 apicid_to_bdf(int apic_id)
return 0;
}

-static void remap_entry_to_ioapic_rte(
+static int remap_entry_to_ioapic_rte(
struct iommu *iommu, struct IO_APIC_route_entry *old_rte)
{
struct iremap_entry *iremap_entry = NULL, *iremap_entries;
@@ -56,15 +56,19 @@ static void remap_entry_to_ioapic_rte(
{
dprintk(XENLOG_ERR VTDPREFIX,
"remap_entry_to_ioapic_rte: ir_ctl is not ready\n");
- return;
+ return -EFAULT;
}

remap_rte = (struct IO_APIC_route_remap_entry *) old_rte;
index = (remap_rte->index_15 << 15) | remap_rte->index_0_14;

if ( index > ir_ctrl->iremap_index )
- panic("%s: index (%d) is larger than remap table entry size (%d)!\n",
- __func__, index, ir_ctrl->iremap_index);
+ {
+ dprintk(XENLOG_ERR VTDPREFIX,
+ "%s: index (%d) is larger than remap table entry size (%d)!\n",
+ __func__, index, ir_ctrl->iremap_index);
+ return -EFAULT;
+ }

spin_lock_irqsave(&ir_ctrl->iremap_lock, flags);

@@ -82,9 +86,10 @@ static void remap_entry_to_ioapic_rte(

unmap_vtd_domain_page(iremap_entries);
spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
-}
-
-static void ioapic_rte_to_remap_entry(struct iommu *iommu,
+ return 0;
+}
+
+static int ioapic_rte_to_remap_entry(struct iommu *iommu,
int apic_id, struct IO_APIC_route_entry *old_rte,
unsigned int rte_upper, unsigned int value)
{
@@ -108,7 +113,14 @@ static void ioapic_rte_to_remap_entry(st
index = (remap_rte->index_15 << 15) | remap_rte->index_0_14;

if ( index > IREMAP_ENTRY_NR - 1 )
- panic("ioapic_rte_to_remap_entry: intremap index is more than 256!\n");
+ {
+ dprintk(XENLOG_ERR VTDPREFIX,
+ "%s: intremap index (%d) is larger than"
+ " the maximum index (%ld)!\n",
+ __func__, index, IREMAP_ENTRY_NR - 1);
+ spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
+ return -EFAULT;
+ }

iremap_entries =
(struct iremap_entry *)map_vtd_domain_page(ir_ctrl->iremap_maddr);
@@ -159,7 +171,7 @@ static void ioapic_rte_to_remap_entry(st

unmap_vtd_domain_page(iremap_entries);
spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
- return;
+ return 0;
}

unsigned int io_apic_read_remap_rte(
@@ -189,23 +201,22 @@ unsigned int io_apic_read_remap_rte(

remap_rte = (struct IO_APIC_route_remap_entry *) &old_rte;

- if ( remap_rte->mask || (remap_rte->format == 0) )
- {
- *IO_APIC_BASE(apic) = reg;
+ if ( remap_rte->format == 0 )
+ {
+ *IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg;
return *(IO_APIC_BASE(apic)+4);
}

- remap_entry_to_ioapic_rte(iommu, &old_rte);
+ if ( remap_entry_to_ioapic_rte(iommu, &old_rte) )
+ {
+ *IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg;
+ return *(IO_APIC_BASE(apic)+4);
+ }
+
if ( rte_upper )
- {
- *IO_APIC_BASE(apic) = reg + 1;
return (*(((u32 *)&old_rte) + 1));
- }
else
- {
- *IO_APIC_BASE(apic) = reg;
return (*(((u32 *)&old_rte) + 0));
- }
}

void io_apic_write_remap_rte(
@@ -243,8 +254,13 @@ void io_apic_write_remap_rte(
*(IO_APIC_BASE(apic)+4) = *(((int *)&old_rte)+0);
remap_rte->mask = saved_mask;

- ioapic_rte_to_remap_entry(iommu, mp_ioapics[apic].mpc_apicid,
- &old_rte, rte_upper, value);
+ if ( ioapic_rte_to_remap_entry(iommu, mp_ioapics[apic].mpc_apicid,
+ &old_rte, rte_upper, value) )
+ {
+ *IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg;
+ *(IO_APIC_BASE(apic)+4) = value;
+ return;
+ }

/* write new entry to ioapic */
*IO_APIC_BASE(apic) = reg;
@@ -253,7 +269,7 @@ void io_apic_write_remap_rte(
*(IO_APIC_BASE(apic)+4) = *(((u32 *)&old_rte)+1);
}

-static void remap_entry_to_msi_msg(
+static int remap_entry_to_msi_msg(
struct iommu *iommu, struct msi_msg *msg)
{
struct iremap_entry *iremap_entry = NULL, *iremap_entries;
@@ -266,7 +282,7 @@ static void remap_entry_to_msi_msg(
{
dprintk(XENLOG_ERR VTDPREFIX,
"remap_entry_to_msi_msg: ir_ctl == NULL");
- return;
+ return -EFAULT;
}

remap_rte = (struct msi_msg_remap_entry *) msg;
@@ -274,8 +290,12 @@ static void remap_entry_to_msi_msg(
remap_rte->address_lo.index_0_14;

if ( index > ir_ctrl->iremap_index )
- panic("%s: index (%d) is larger than remap table entry size (%d)\n",
- __func__, index, ir_ctrl->iremap_index);
+ {
+ dprintk(XENLOG_ERR VTDPREFIX,
+ "%s: index (%d) is larger than remap table entry size (%d)\n",
+ __func__, index, ir_ctrl->iremap_index);
+ return -EFAULT;
+ }

spin_lock_irqsave(&ir_ctrl->iremap_lock, flags);

@@ -304,9 +324,10 @@ static void remap_entry_to_msi_msg(

unmap_vtd_domain_page(iremap_entries);
spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
-}
-
-static void msi_msg_to_remap_entry(
+ return 0;
+}
+
+static int msi_msg_to_remap_entry(
struct iommu *iommu, struct pci_dev *pdev, struct msi_msg *msg)
{
struct iremap_entry *iremap_entry = NULL, *iremap_entries;
@@ -343,7 +364,15 @@ static void msi_msg_to_remap_entry(
index = i;

if ( index > IREMAP_ENTRY_NR - 1 )
- panic("msi_msg_to_remap_entry: intremap index is more than 256!\n");
+ {
+ dprintk(XENLOG_ERR VTDPREFIX,
+ "%s: intremap index (%d) is larger than"
+ " the maximum index (%ld)!\n",
+ __func__, index, IREMAP_ENTRY_NR - 1);
+ unmap_vtd_domain_page(iremap_entries);
+ spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
+ return -EFAULT;
+ }

iremap_entry = &iremap_entries[index];
memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry));
@@ -385,7 +414,7 @@ static void msi_msg_to_remap_entry(

unmap_vtd_domain_page(iremap_entries);
spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
- return;
+ return 0;
}

void msi_msg_read_remap_rte(
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/drivers/passthrough/vtd/iommu.c
--- a/xen/drivers/passthrough/vtd/iommu.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/drivers/passthrough/vtd/iommu.c Mon Sep 01 16:59:43 2008 +0900
@@ -624,15 +624,10 @@ static int iommu_set_root_entry(struct i
unsigned long flags;
s_time_t start_time;

- if ( iommu->root_maddr != 0 )
- {
- free_pgtable_maddr(iommu->root_maddr);
- iommu->root_maddr = 0;
- }
-
spin_lock_irqsave(&iommu->register_lock, flags);

- iommu->root_maddr = alloc_pgtable_maddr();
+ if ( iommu->root_maddr == 0 )
+ iommu->root_maddr = alloc_pgtable_maddr();
if ( iommu->root_maddr == 0 )
{
spin_unlock_irqrestore(&iommu->register_lock, flags);
@@ -1864,37 +1859,31 @@ static int intel_iommu_group_id(u8 bus,
return -1;
}

-u8 iommu_state[MAX_IOMMU_REGS * MAX_IOMMUS];
+static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
int iommu_suspend(void)
{
struct acpi_drhd_unit *drhd;
struct iommu *iommu;
- int i = 0;
+ u32 i;
+
+ if ( !vtd_enabled )
+ return 0;

iommu_flush_all();

for_each_drhd_unit ( drhd )
{
iommu = drhd->iommu;
- iommu_state[DMAR_RTADDR_REG * i] =
- (u64) dmar_readq(iommu->reg, DMAR_RTADDR_REG);
- iommu_state[DMAR_FECTL_REG * i] =
+ i = iommu->index;
+
+ iommu_state[i][DMAR_FECTL_REG] =
(u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
- iommu_state[DMAR_FEDATA_REG * i] =
+ iommu_state[i][DMAR_FEDATA_REG] =
(u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
- iommu_state[DMAR_FEADDR_REG * i] =
+ iommu_state[i][DMAR_FEADDR_REG] =
(u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
- iommu_state[DMAR_FEUADDR_REG * i] =
+ iommu_state[i][DMAR_FEUADDR_REG] =
(u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
- iommu_state[DMAR_PLMBASE_REG * i] =
- (u32) dmar_readl(iommu->reg, DMAR_PLMBASE_REG);
- iommu_state[DMAR_PLMLIMIT_REG * i] =
- (u32) dmar_readl(iommu->reg, DMAR_PLMLIMIT_REG);
- iommu_state[DMAR_PHMBASE_REG * i] =
- (u64) dmar_readq(iommu->reg, DMAR_PHMBASE_REG);
- iommu_state[DMAR_PHMLIMIT_REG * i] =
- (u64) dmar_readq(iommu->reg, DMAR_PHMLIMIT_REG);
- i++;
}

return 0;
@@ -1904,37 +1893,34 @@ int iommu_resume(void)
{
struct acpi_drhd_unit *drhd;
struct iommu *iommu;
- int i = 0;
+ u32 i;
+
+ if ( !vtd_enabled )
+ return 0;

iommu_flush_all();

- init_vtd_hw();
+ if ( init_vtd_hw() != 0 && force_iommu )
+ panic("IOMMU setup failed, crash Xen for security purpose!\n");
+
for_each_drhd_unit ( drhd )
{
iommu = drhd->iommu;
- dmar_writeq( iommu->reg, DMAR_RTADDR_REG,
- (u64) iommu_state[DMAR_RTADDR_REG * i]);
+ i = iommu->index;
+
dmar_writel(iommu->reg, DMAR_FECTL_REG,
- (u32) iommu_state[DMAR_FECTL_REG * i]);
+ (u32) iommu_state[i][DMAR_FECTL_REG]);
dmar_writel(iommu->reg, DMAR_FEDATA_REG,
- (u32) iommu_state[DMAR_FEDATA_REG * i]);
+ (u32) iommu_state[i][DMAR_FEDATA_REG]);
dmar_writel(iommu->reg, DMAR_FEADDR_REG,
- (u32) iommu_state[DMAR_FEADDR_REG * i]);
+ (u32) iommu_state[i][DMAR_FEADDR_REG]);
dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
- (u32) iommu_state[DMAR_FEUADDR_REG * i]);
- dmar_writel(iommu->reg, DMAR_PLMBASE_REG,
- (u32) iommu_state[DMAR_PLMBASE_REG * i]);
- dmar_writel(iommu->reg, DMAR_PLMLIMIT_REG,
- (u32) iommu_state[DMAR_PLMLIMIT_REG * i]);
- dmar_writeq(iommu->reg, DMAR_PHMBASE_REG,
- (u64) iommu_state[DMAR_PHMBASE_REG * i]);
- dmar_writeq(iommu->reg, DMAR_PHMLIMIT_REG,
- (u64) iommu_state[DMAR_PHMLIMIT_REG * i]);
+ (u32) iommu_state[i][DMAR_FEUADDR_REG]);

if ( iommu_enable_translation(iommu) )
return -EIO;
- i++;
- }
+ }
+
return 0;
}

diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/include/asm-x86/io_apic.h
--- a/xen/include/asm-x86/io_apic.h Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/include/asm-x86/io_apic.h Mon Sep 01 16:59:43 2008 +0900
@@ -125,7 +125,7 @@ extern int mpc_default_type;

static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
{
- if (vtd_enabled)
+ if (iommu_enabled)
return io_apic_read_remap_rte(apic, reg);
*IO_APIC_BASE(apic) = reg;
return *(IO_APIC_BASE(apic)+4);
@@ -152,6 +152,8 @@ extern int sis_apic_bug;
#endif
static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
{
+ if (iommu_enabled)
+ return iommu_update_ire_from_apic(apic, reg, value);
if (sis_apic_bug)
*IO_APIC_BASE(apic) = reg;
*(IO_APIC_BASE(apic)+4) = value;
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/include/asm-x86/msr-index.h
--- a/xen/include/asm-x86/msr-index.h Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/include/asm-x86/msr-index.h Mon Sep 01 16:59:43 2008 +0900
@@ -186,6 +186,9 @@
#define MSR_K8_ENABLE_C1E 0xc0010055
#define MSR_K8_VM_CR 0xc0010114
#define MSR_K8_VM_HSAVE_PA 0xc0010117
+
+#define MSR_K8_FEATURE_MASK 0xc0011004
+#define MSR_K8_EXT_FEATURE_MASK 0xc0011005

/* MSR_K8_VM_CR bits: */
#define _K8_VMCR_SVME_DISABLE 4
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/include/asm-x86/processor.h
--- a/xen/include/asm-x86/processor.h Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/include/asm-x86/processor.h Mon Sep 01 16:59:43 2008 +0900
@@ -583,6 +583,8 @@ int wrmsr_hypervisor_regs(
int wrmsr_hypervisor_regs(
uint32_t idx, uint32_t eax, uint32_t edx);

+int microcode_update(XEN_GUEST_HANDLE(const_void), unsigned long len);
+
#endif /* !__ASSEMBLY__ */

#endif /* __ASM_X86_PROCESSOR_H */
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/include/asm-x86/softirq.h
--- a/xen/include/asm-x86/softirq.h Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/include/asm-x86/softirq.h Mon Sep 01 16:59:43 2008 +0900
@@ -1,8 +1,9 @@
#ifndef __ASM_SOFTIRQ_H__
#define __ASM_SOFTIRQ_H__

-#define NMI_MCE_SOFTIRQ (NR_COMMON_SOFTIRQS + 0)
+#define NMI_MCE_SOFTIRQ (NR_COMMON_SOFTIRQS + 0)
+#define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)

-#define NR_ARCH_SOFTIRQS 1
+#define NR_ARCH_SOFTIRQS 2

#endif /* __ASM_SOFTIRQ_H__ */
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/include/public/memory.h
--- a/xen/include/public/memory.h Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/include/public/memory.h Mon Sep 01 16:59:43 2008 +0900
@@ -204,6 +204,7 @@ struct xen_add_to_physmap {
/* Source mapping space. */
#define XENMAPSPACE_shared_info 0 /* shared info page */
#define XENMAPSPACE_grant_table 1 /* grant table page */
+#define XENMAPSPACE_mfn 2 /* usual MFN */
unsigned int space;

/* Index into source mapping space. */
@@ -214,6 +215,22 @@ struct xen_add_to_physmap {
};
typedef struct xen_add_to_physmap xen_add_to_physmap_t;
DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
+
+/*
+ * Unmaps the page appearing at a particular GPFN from the specified guest's
+ * pseudophysical address space.
+ * arg == addr of xen_remove_from_physmap_t.
+ */
+#define XENMEM_remove_from_physmap 15
+struct xen_remove_from_physmap {
+ /* Which domain to change the mapping for. */
+ domid_t domid;
+
+ /* GPFN of the current mapping of the page. */
+ xen_pfn_t gpfn;
+};
+typedef struct xen_remove_from_physmap xen_remove_from_physmap_t;
+DEFINE_XEN_GUEST_HANDLE(xen_remove_from_physmap_t);

/*
* Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/include/public/platform.h
--- a/xen/include/public/platform.h Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/include/public/platform.h Mon Sep 01 16:59:43 2008 +0900
@@ -97,7 +97,7 @@ DEFINE_XEN_GUEST_HANDLE(xenpf_read_memty
#define XENPF_microcode_update 35
struct xenpf_microcode_update {
/* IN variables. */
- XEN_GUEST_HANDLE(void) data; /* Pointer to microcode data */
+ XEN_GUEST_HANDLE(const_void) data;/* Pointer to microcode data */
uint32_t length; /* Length of microcode data. */
};
typedef struct xenpf_microcode_update xenpf_microcode_update_t;
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/include/xen/compat.h
--- a/xen/include/xen/compat.h Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/include/xen/compat.h Mon Sep 01 16:59:43 2008 +0900
@@ -19,7 +19,9 @@
type *_[0] __attribute__((__packed__)); \
} __compat_handle_ ## name

-#define DEFINE_COMPAT_HANDLE(name) __DEFINE_COMPAT_HANDLE(name, name)
+#define DEFINE_COMPAT_HANDLE(name) \
+ __DEFINE_COMPAT_HANDLE(name, name); \
+ __DEFINE_COMPAT_HANDLE(const_ ## name, const name)
#define COMPAT_HANDLE(name) __compat_handle_ ## name

/* Is the compat handle a NULL reference? */
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/include/xen/iommu.h
--- a/xen/include/xen/iommu.h Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/include/xen/iommu.h Mon Sep 01 16:59:43 2008 +0900
@@ -109,4 +109,8 @@ struct iommu_ops {

void iommu_update_ire_from_apic(unsigned int apic, unsigned int reg, unsigned int value);
void iommu_update_ire_from_msi(struct msi_desc *msi_desc, struct msi_msg *msg);
+
+int iommu_suspend(void);
+int iommu_resume(void);
+
#endif /* _IOMMU_H_ */
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/include/xen/timer.h
--- a/xen/include/xen/timer.h Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/include/xen/timer.h Mon Sep 01 16:59:43 2008 +0900
@@ -14,16 +14,29 @@

struct timer {
/* System time expiry value (nanoseconds since boot). */
- s_time_t expires;
+ s_time_t expires;
+
+ /* Position in active-timer data structure. */
+ union {
+ /* Timer-heap offset. */
+ unsigned int heap_offset;
+ /* Overflow linked list. */
+ struct timer *list_next;
+ };
+
+ /* On expiry, '(*function)(data)' will be executed in softirq context. */
+ void (*function)(void *);
+ void *data;
+
/* CPU on which this timer will be installed and executed. */
- unsigned int cpu;
- /* On expiry, '(*function)(data)' will be executed in softirq context. */
- void (*function)(void *);
- void *data;
- /* Timer-heap offset. */
- unsigned int heap_offset;
- /* Has this timer been killed (cannot be activated)? */
- int killed;
+ uint16_t cpu;
+
+ /* Timer status. */
+#define TIMER_STATUS_inactive 0 /* Not in use; can be activated. */
+#define TIMER_STATUS_killed 1 /* Not in use; canot be activated. */
+#define TIMER_STATUS_in_heap 2 /* In use; on timer heap. */
+#define TIMER_STATUS_in_list 3 /* In use; on overflow linked list. */
+ uint8_t status;
};

/*
@@ -37,7 +50,7 @@ struct timer {
*/
static inline int active_timer(struct timer *timer)
{
- return (timer->heap_offset != 0);
+ return (timer->status >= TIMER_STATUS_in_heap);
}

/*
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/include/xlat.lst
--- a/xen/include/xlat.lst Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/include/xlat.lst Mon Sep 01 16:59:43 2008 +0900
@@ -33,6 +33,7 @@
! kexec_image kexec.h
! kexec_range kexec.h
! add_to_physmap memory.h
+! remove_from_physmap memory.h
! foreign_memory_map memory.h
! memory_exchange memory.h
! memory_map memory.h
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/include/xsm/xsm.h
--- a/xen/include/xsm/xsm.h Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/include/xsm/xsm.h Mon Sep 01 16:59:43 2008 +0900
@@ -136,6 +136,7 @@ struct xsm_operations {
int (*mmu_machphys_update) (struct domain *d, unsigned long mfn);
int (*update_va_mapping) (struct domain *d, l1_pgentry_t pte);
int (*add_to_physmap) (struct domain *d1, struct domain *d2);
+ int (*remove_from_physmap) (struct domain *d1, struct domain *d2);
#endif
};

@@ -532,6 +533,11 @@ static inline int xsm_add_to_physmap(str
{
return xsm_call(add_to_physmap(d1, d2));
}
+
+static inline int xsm_remove_from_physmap(struct domain *d1, struct domain *d2)
+{
+ return xsm_call(remove_from_physmap(d1, d2));
+}
#endif /* CONFIG_X86 */

#endif /* __XSM_H */
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/xsm/dummy.c
--- a/xen/xsm/dummy.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/xsm/dummy.c Mon Sep 01 16:59:43 2008 +0900
@@ -382,6 +382,11 @@ static int dummy_update_va_mapping (stru
}

static int dummy_add_to_physmap (struct domain *d1, struct domain *d2)
+{
+ return 0;
+}
+
+static int dummy_remove_from_physmap (struct domain *d1, struct domain *d2)
{
return 0;
}
@@ -484,5 +489,6 @@ void xsm_fixup_ops (struct xsm_operation
set_to_dummy_if_null(ops, mmu_machphys_update);
set_to_dummy_if_null(ops, update_va_mapping);
set_to_dummy_if_null(ops, add_to_physmap);
+ set_to_dummy_if_null(ops, remove_from_physmap);
#endif
}
diff -r 48db4eee7d58 -r d0a544d8a3f3 xen/xsm/flask/hooks.c
--- a/xen/xsm/flask/hooks.c Mon Aug 25 19:04:37 2008 +0900
+++ b/xen/xsm/flask/hooks.c Mon Sep 01 16:59:43 2008 +0900
@@ -1025,6 +1025,11 @@ static int flask_update_va_mapping(struc
}

static int flask_add_to_physmap(struct domain *d1, struct domain *d2)
+{
+ return domain_has_perm(d1, d2, SECCLASS_MMU, MMU__PHYSMAP);
+}
+
+static int flask_remove_from_physmap(struct domain *d1, struct domain *d2)
{
return domain_has_perm(d1, d2, SECCLASS_MMU, MMU__PHYSMAP);
}
@@ -1115,6 +1120,7 @@ static struct xsm_operations flask_ops =
.mmu_machphys_update = flask_mmu_machphys_update,
.update_va_mapping = flask_update_va_mapping,
.add_to_physmap = flask_add_to_physmap,
+ .remove_from_physmap = flask_remove_from_physmap,
#endif
};


_______________________________________________
Xen-changelog mailing list
Xen-changelog@lists.xensource.com
http://lists.xensource.com/xen-changelog
[xen-unstable] merge with xen-unstable.hg [ In reply to ]
# HG changeset patch
# User Isaku Yamahata <yamahata@valinux.co.jp>
# Date 1221567930 -32400
# Node ID 4a381ddc764a635e9242686ef8cefb5af363c873
# Parent ec8eaab557d867dca3e8cbb3e0384d797929102a
# Parent 3eb7a0cfffc20146c1676b001acbd86b449dc58f
merge with xen-unstable.hg
---
Config.mk | 16
Makefile | 7
stubdom/Makefile | 53 +-
tools/Makefile | 9
tools/python/xen/util/xsm/dummy/dummy.py | 3
xen/arch/x86/Makefile | 2
xen/arch/x86/acpi/cpufreq/cpufreq.c | 287 ++++++++------
xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c | 7
xen/arch/x86/acpi/cpufreq/powernow.c | 10
xen/arch/x86/acpi/cpufreq/utility.c | 186 ++++-----
xen/arch/x86/acpi/pmstat.c | 2
xen/arch/x86/acpi/power.c | 6
xen/arch/x86/domain.c | 6
xen/arch/x86/microcode.c | 541 ++++++---------------------
xen/arch/x86/microcode_amd.c | 371 ++++++++++++++++++
xen/arch/x86/microcode_intel.c | 370 ++++++++++++++++++
xen/arch/x86/mm/shadow/common.c | 21 -
xen/arch/x86/mm/shadow/multi.c | 3
xen/arch/x86/platform_hypercall.c | 39 +
xen/arch/x86/smpboot.c | 5
xen/common/gdbstub.c | 15
xen/drivers/char/console.c | 14
xen/drivers/char/ns16550.c | 34 +
xen/drivers/char/serial.c | 10
xen/drivers/passthrough/vtd/iommu.c | 1
xen/include/acpi/cpufreq/cpufreq.h | 74 +++
xen/include/acpi/cpufreq/processor_perf.h | 19
xen/include/asm-x86/microcode.h | 93 ++++
xen/include/asm-x86/msr-index.h | 4
xen/include/asm-x86/processor.h | 35 -
xen/include/public/platform.h | 2
31 files changed, 1500 insertions(+), 745 deletions(-)

diff -r ec8eaab557d8 -r 4a381ddc764a Config.mk
--- a/Config.mk Fri Sep 12 14:47:40 2008 +0900
+++ b/Config.mk Tue Sep 16 21:25:30 2008 +0900
@@ -54,6 +54,22 @@ define cc-ver-check-closure
endif
endef

+define absolutify_xen_root
+ case "$(XEN_ROOT)" in \
+ /*) XEN_ROOT=$(XEN_ROOT) ;; \
+ *) xen_root_lhs=`pwd`; \
+ xen_root_rhs=$(XEN_ROOT)/; \
+ while [ "x$${xen_root_rhs#../}" != "x$$xen_root_rhs" ]; do \
+ xen_root_rhs="$${xen_root_rhs#../}"; \
+ xen_root_rhs="$${xen_root_rhs#/}"; \
+ xen_root_rhs="$${xen_root_rhs#/}"; \
+ xen_root_lhs="$${xen_root_lhs%/*}"; \
+ done; \
+ XEN_ROOT="$$xen_root_lhs/$$xen_root_rhs" ;; \
+ esac; \
+ export XEN_ROOT
+endef
+
ifeq ($(debug),y)
CFLAGS += -g
endif
diff -r ec8eaab557d8 -r 4a381ddc764a Makefile
--- a/Makefile Fri Sep 12 14:47:40 2008 +0900
+++ b/Makefile Tue Sep 16 21:25:30 2008 +0900
@@ -64,7 +64,7 @@ install-xen:
$(MAKE) -C xen install

.PHONY: install-tools
-install-tools:
+install-tools: tools/ioemu-dir
$(MAKE) -C tools install

.PHONY: install-kernels
@@ -72,11 +72,14 @@ install-kernels:
for i in $(XKERNELS) ; do $(MAKE) $$i-install || exit 1; done

.PHONY: install-stubdom
-install-stubdom:
+install-stubdom: tools/ioemu-dir
$(MAKE) -C stubdom install
ifeq (x86_64,$(XEN_TARGET_ARCH))
XEN_TARGET_ARCH=x86_32 $(MAKE) -C stubdom install-grub
endif
+
+tools/ioemu-dir:
+ make -C tools ioemu-dir-find

.PHONY: install-docs
install-docs:
diff -r ec8eaab557d8 -r 4a381ddc764a stubdom/Makefile
--- a/stubdom/Makefile Fri Sep 12 14:47:40 2008 +0900
+++ b/stubdom/Makefile Tue Sep 16 21:25:30 2008 +0900
@@ -6,8 +6,6 @@ export stubdom=y
export stubdom=y
export debug=y
include $(XEN_ROOT)/Config.mk
-
-override CONFIG_QEMU=ioemu

IOEMU_OPTIONS=--disable-sdl --disable-opengl --disable-vnc-tls --disable-brlapi --disable-kqemu
ZLIB_URL?=http://www.zlib.net
@@ -59,8 +57,8 @@ TARGET_CPPFLAGS += -isystem $(CURDIR)/$(
TARGET_CPPFLAGS += -isystem $(CURDIR)/$(MINI_OS)/include/posix
TARGET_CPPFLAGS += -isystem $(CROSS_PREFIX)/$(GNU_TARGET_ARCH)-xen-elf/include
TARGET_CPPFLAGS += -isystem $(GCC_INSTALL)include
-TARGET_CPPFLAGS += -isystem $(CURDIR)/lwip/src/include
-TARGET_CPPFLAGS += -isystem $(CURDIR)/lwip/src/include/ipv4
+TARGET_CPPFLAGS += -isystem $(CURDIR)/lwip-$(XEN_TARGET_ARCH)/src/include
+TARGET_CPPFLAGS += -isystem $(CURDIR)/lwip-$(XEN_TARGET_ARCH)/src/include/ipv4
TARGET_CPPFLAGS += -I$(CURDIR)/include

TARGET_LDFLAGS += -nostdlib -L$(CROSS_PREFIX)/$(GNU_TARGET_ARCH)-xen-elf/lib
@@ -164,7 +162,29 @@ lwip-$(XEN_TARGET_ARCH): lwip-$(LWIP_VER
.PHONY: $(CROSS_ROOT)
$(CROSS_ROOT): cross-newlib cross-zlib cross-libpci

-mk-headers-$(XEN_TARGET_ARCH):
+$(XEN_ROOT)/tools/ioemu-dir:
+ make -C $(XEN_ROOT)/tools ioemu-dir-find
+
+ioemu/linkfarm.stamp: $(XEN_ROOT)/tools/ioemu-dir
+ mkdir -p ioemu
+ifeq ($(CONFIG_QEMU),ioemu)
+ [ -h ioemu/Makefile ] || ( cd ioemu && \
+ ln -sf ../$(XEN_ROOT)/tools/ioemu/* .)
+else
+ set -e; \
+ $(absolutify_xen_root); \
+ cd ioemu; \
+ src="$$XEN_ROOT/tools/ioemu-dir"; export src; \
+ (cd $$src && find * -type d -print) | xargs mkdir -p; \
+ (cd $$src && find * ! -type l -type f $(addprefix ! -name , \
+ '*.[oda1]' 'config-*' config.mak qemu-dm qemu-img-xen \
+ '*.html' '*.pod' \
+ )) | \
+ while read f; do rm -f "$$f"; ln -s "$$src/$$f" "$$f"; done
+endif
+ touch ioemu/linkfarm.stamp
+
+mk-headers-$(XEN_TARGET_ARCH): ioemu/linkfarm.stamp
mkdir -p include/xen && \
ln -sf $(addprefix ../../,$(wildcard $(XEN_ROOT)/xen/include/public/*.h)) include/xen && \
ln -sf $(addprefix ../../$(XEN_ROOT)/xen/include/public/,arch-ia64 arch-x86 hvm io xsm) include/xen && \
@@ -183,22 +203,6 @@ mk-headers-$(XEN_TARGET_ARCH):
ln -sf ../$(XEN_ROOT)/tools/libxc/$(XEN_TARGET_ARCH)/*.c . && \
ln -sf ../$(XEN_ROOT)/tools/libxc/$(XEN_TARGET_ARCH)/*.h . && \
ln -sf ../$(XEN_ROOT)/tools/libxc/$(XEN_TARGET_ARCH)/Makefile . )
- mkdir -p ioemu
-ifeq ($(CONFIG_QEMU),ioemu)
- [ -h ioemu/Makefile ] || ( cd ioemu && \
- ln -sf ../$(XEN_ROOT)/tools/ioemu/* .)
-else
- [ -h ioemu/Makefile ] || ( cd ioemu && \
- ln -sf $(CONFIG_QEMU)/* . && \
- rm -fr i386-dm && \
- rm -fr i386-stubdom && \
- mkdir i386-dm && \
- mkdir i386-stubdom && \
- ln -sf $(CONFIG_QEMU)/i386-dm/* i386-dm/ && \
- ln -sf $(CONFIG_QEMU)/i386-stubdom/* i386-stubdom/ )
-endif
- [ ! -h ioemu/config-host.h ] || rm -f ioemu/config-host.h
- [ ! -h ioemu/config-host.mak ] || rm -f ioemu/config-host.mak
$(MAKE) -C $(MINI_OS) links
touch mk-headers-$(XEN_TARGET_ARCH)

@@ -231,8 +235,9 @@ ifeq ($(CONFIG_QEMU),ioemu)
CPPFLAGS="$(TARGET_CPPFLAGS)" $(MAKE) -C ioemu LWIPDIR=$(CURDIR)/lwip-$(XEN_TARGET_ARCH) TOOLS=
else
[ -f ioemu/config-host.mak ] || \
- ( cd ioemu ; \
- CONFIG_STUBDOM=yes XEN_ROOT=$(abspath $(XEN_ROOT)) XEN_TARGET_ARCH=$(XEN_TARGET_ARCH) CFLAGS="$(TARGET_CFLAGS)" sh ./xen-setup --cc=$(CC) --disable-gcc-check $(IOEMU_OPTIONS))
+ ( $(absolutify_xen_root); \
+ cd ioemu ; \
+ CONFIG_STUBDOM=yes XEN_TARGET_ARCH=$(XEN_TARGET_ARCH) CFLAGS="$(TARGET_CFLAGS)" sh ./xen-setup --cc=$(CC) --disable-gcc-check $(IOEMU_OPTIONS))
CPPFLAGS= TARGET_CPPFLAGS="$(TARGET_CPPFLAGS)" $(MAKE) -C ioemu LWIPDIR=$(CURDIR)/lwip-$(XEN_TARGET_ARCH) TOOLS= CONFIG_STUBDOM=yes
endif

@@ -336,7 +341,7 @@ clean:
$(MAKE) -C c clean
rm -fr grub-$(XEN_TARGET_ARCH)
[ ! -d libxc-$(XEN_TARGET_ARCH) ] || $(MAKE) -C libxc-$(XEN_TARGET_ARCH) clean
- [ ! -d ioemu ] || $(MAKE) -C ioemu clean
+ -[ ! -d ioemu ] || $(MAKE) -C ioemu clean

# clean the cross-compilation result
.PHONY: crossclean
diff -r ec8eaab557d8 -r 4a381ddc764a tools/Makefile
--- a/tools/Makefile Fri Sep 12 14:47:40 2008 +0900
+++ b/tools/Makefile Tue Sep 16 21:25:30 2008 +0900
@@ -93,17 +93,14 @@ ioemu-dir-find:
ln -sf ioemu-remote ioemu-dir; \
fi
set -e; \
- case "$(XEN_ROOT)" in \
- /*) XEN_ROOT=$(XEN_ROOT) ;; \
- *) XEN_ROOT=`pwd`/$(XEN_ROOT) ;; \
- esac; \
- export XEN_ROOT; \
+ $(absolutify_xen_root); \
cd ioemu-dir; \
./xen-setup $(IOEMU_CONFIGURE_CROSS)

subdir-all-ioemu-dir subdir-install-ioemu-dir: ioemu-dir-find

subdir-clean-ioemu-dir:
- if test -d ioemu-dir/.; then \
+ set -e; if test -d ioemu-dir/.; then \
+ $(absolutify_xen_root); \
$(MAKE) -C ioemu-dir clean; \
fi
diff -r ec8eaab557d8 -r 4a381ddc764a tools/python/xen/util/xsm/dummy/dummy.py
--- a/tools/python/xen/util/xsm/dummy/dummy.py Fri Sep 12 14:47:40 2008 +0900
+++ b/tools/python/xen/util/xsm/dummy/dummy.py Tue Sep 16 21:25:30 2008 +0900
@@ -131,3 +131,6 @@ def dump_policy_file():

def get_ssid(domain):
err("No ssid has been assigned to any domain under xsm dummy module.")
+
+def security_label_to_details(res_label):
+ return ("","","")
diff -r ec8eaab557d8 -r 4a381ddc764a xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/arch/x86/Makefile Tue Sep 16 21:25:30 2008 +0900
@@ -28,6 +28,8 @@ obj-y += ioport_emulate.o
obj-y += ioport_emulate.o
obj-y += irq.o
obj-y += microcode.o
+obj-y += microcode_amd.o
+obj-y += microcode_intel.o
obj-y += mm.o
obj-y += mpparse.o
obj-y += nmi.o
diff -r ec8eaab557d8 -r 4a381ddc764a xen/arch/x86/acpi/cpufreq/cpufreq.c
--- a/xen/arch/x86/acpi/cpufreq/cpufreq.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c Tue Sep 16 21:25:30 2008 +0900
@@ -32,6 +32,7 @@
#include <xen/errno.h>
#include <xen/delay.h>
#include <xen/cpumask.h>
+#include <xen/sched.h>
#include <xen/timer.h>
#include <xen/xmalloc.h>
#include <asm/bug.h>
@@ -44,12 +45,8 @@
#include <acpi/acpi.h>
#include <acpi/cpufreq/cpufreq.h>

-struct processor_pminfo processor_pminfo[NR_CPUS];
-struct cpufreq_policy xen_px_policy[NR_CPUS];
-
-static cpumask_t *cpufreq_dom_pt;
-static unsigned long *cpufreq_dom_mask;
-static unsigned int cpufreq_dom_max;
+/* TODO: change to link list later as domain number may be sparse */
+static cpumask_t cpufreq_dom_map[NR_CPUS];

enum {
UNDEFINED_CAPABLE = 0,
@@ -335,7 +332,7 @@ static int acpi_cpufreq_target(struct cp
if (unlikely(result))
return -ENODEV;

- online_policy_cpus = policy->cpus;
+ cpus_and(online_policy_cpus, cpu_online_map, policy->cpus);

next_perf_state = data->freq_table[next_state].index;
if (perf->state == next_perf_state) {
@@ -388,6 +385,20 @@ static int acpi_cpufreq_target(struct cp
policy->cur = freqs.new;

return result;
+}
+
+static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
+{
+ struct acpi_cpufreq_data *data = drv_data[policy->cpu];
+ struct processor_performance *perf = &processor_pminfo[policy->cpu].perf;
+
+ if (!policy || !data)
+ return -EINVAL;
+
+ cpufreq_verify_within_limits(policy, 0,
+ perf->states[perf->platform_limit].core_frequency * 1000);
+
+ return cpufreq_frequency_table_verify(policy, data->freq_table);
}

static unsigned long
@@ -441,14 +452,6 @@ acpi_cpufreq_cpu_init(struct cpufreq_pol
perf = data->acpi_data;
policy->shared_type = perf->shared_type;

- /*
- * Currently the latest linux (kernel version 2.6.26)
- * still has issue when handle the situation _psd HW_ALL coordination.
- * In Xen hypervisor, we handle _psd HW_ALL coordination in same way as
- * _psd SW_ALL coordination for the seek of safety.
- */
- policy->cpus = perf->shared_cpu_map;
-
/* capability check */
if (perf->state_count <= 1) {
printk("No P-States\n");
@@ -496,6 +499,7 @@ acpi_cpufreq_cpu_init(struct cpufreq_pol
policy->cpuinfo.transition_latency =
perf->states[i].transition_latency * 1000;
}
+ policy->governor = CPUFREQ_DEFAULT_GOVERNOR;

data->max_freq = perf->states[0].core_frequency * 1000;
/* table init */
@@ -554,114 +558,173 @@ err_unreg:
return result;
}

+static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+ struct acpi_cpufreq_data *data = drv_data[policy->cpu];
+
+ if (data) {
+ drv_data[policy->cpu] = NULL;
+ xfree(data->freq_table);
+ xfree(data);
+ }
+
+ return 0;
+}
+
static struct cpufreq_driver acpi_cpufreq_driver = {
+ .verify = acpi_cpufreq_verify,
.target = acpi_cpufreq_target,
.init = acpi_cpufreq_cpu_init,
+ .exit = acpi_cpufreq_cpu_exit,
};

-void cpufreq_dom_exit(void)
-{
- cpufreq_dom_max = 0;
- if (cpufreq_dom_mask)
- xfree(cpufreq_dom_mask);
- if (cpufreq_dom_pt)
- xfree(cpufreq_dom_pt);
-}
-
-int cpufreq_dom_init(void)
-{
- unsigned int i;
-
- cpufreq_dom_max = 0;
-
- for_each_online_cpu(i) {
- if (cpufreq_dom_max < processor_pminfo[i].perf.domain_info.domain)
- cpufreq_dom_max = processor_pminfo[i].perf.domain_info.domain;
- }
- cpufreq_dom_max++;
-
- cpufreq_dom_mask = xmalloc_array(unsigned long,
- BITS_TO_LONGS(cpufreq_dom_max));
- if (!cpufreq_dom_mask)
- return -ENOMEM;
- bitmap_zero(cpufreq_dom_mask, cpufreq_dom_max);
-
- cpufreq_dom_pt = xmalloc_array(cpumask_t, cpufreq_dom_max);
- if (!cpufreq_dom_pt)
- return -ENOMEM;
- memset(cpufreq_dom_pt, 0, cpufreq_dom_max * sizeof(cpumask_t));
-
- for_each_online_cpu(i) {
- __set_bit(processor_pminfo[i].perf.domain_info.domain, cpufreq_dom_mask);
- cpu_set(i, cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain]);
- }
-
- for_each_online_cpu(i)
- processor_pminfo[i].perf.shared_cpu_map =
- cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain];
+int cpufreq_limit_change(unsigned int cpu)
+{
+ struct processor_performance *perf = &processor_pminfo[cpu].perf;
+ struct cpufreq_policy *data = cpufreq_cpu_policy[cpu];
+ struct cpufreq_policy policy;
+
+ if (!cpu_online(cpu) || !data)
+ return -ENODEV;
+
+ if ((perf->platform_limit < 0) ||
+ (perf->platform_limit >= perf->state_count))
+ return -EINVAL;
+
+ memcpy(&policy, data, sizeof(struct cpufreq_policy));
+
+ policy.max =
+ perf->states[perf->platform_limit].core_frequency * 1000;
+
+ return __cpufreq_set_policy(data, &policy);
+}
+
+int cpufreq_add_cpu(unsigned int cpu)
+{
+ int ret = 0;
+ unsigned int firstcpu;
+ unsigned int dom;
+ unsigned int j;
+ struct cpufreq_policy new_policy;
+ struct cpufreq_policy *policy;
+ struct processor_performance *perf = &processor_pminfo[cpu].perf;
+
+ /* to protect the case when Px was not controlled by xen */
+ if (!(perf->init & XEN_PX_INIT))
+ return 0;
+
+ if (cpu_is_offline(cpu) || cpufreq_cpu_policy[cpu])
+ return -EINVAL;
+
+ ret = px_statistic_init(cpu);
+ if (ret)
+ return ret;
+
+ dom = perf->domain_info.domain;
+ if (cpus_weight(cpufreq_dom_map[dom])) {
+ /* share policy with the first cpu since on same boat */
+ firstcpu = first_cpu(cpufreq_dom_map[dom]);
+ policy = cpufreq_cpu_policy[firstcpu];
+
+ cpufreq_cpu_policy[cpu] = policy;
+ cpu_set(cpu, cpufreq_dom_map[dom]);
+ cpu_set(cpu, policy->cpus);
+
+ printk(KERN_EMERG"adding CPU %u\n", cpu);
+ } else {
+ /* for the first cpu, setup policy and do init work */
+ policy = xmalloc(struct cpufreq_policy);
+ if (!policy) {
+ px_statistic_exit(cpu);
+ return -ENOMEM;
+ }
+ memset(policy, 0, sizeof(struct cpufreq_policy));
+
+ cpufreq_cpu_policy[cpu] = policy;
+ cpu_set(cpu, cpufreq_dom_map[dom]);
+ cpu_set(cpu, policy->cpus);
+
+ policy->cpu = cpu;
+ ret = cpufreq_driver->init(policy);
+ if (ret)
+ goto err1;
+ printk(KERN_EMERG"CPU %u initialization completed\n", cpu);
+ }
+
+ /*
+ * After get full cpumap of the coordination domain,
+ * we can safely start gov here.
+ */
+ if (cpus_weight(cpufreq_dom_map[dom]) ==
+ perf->domain_info.num_processors) {
+ memcpy(&new_policy, policy, sizeof(struct cpufreq_policy));
+ policy->governor = NULL;
+ ret = __cpufreq_set_policy(policy, &new_policy);
+ if (ret)
+ goto err2;
+ }

return 0;
-}
-
-static int cpufreq_cpu_init(void)
-{
- int i, ret = 0;
-
- for_each_online_cpu(i) {
- xen_px_policy[i].cpu = i;
-
- ret = px_statistic_init(i);
- if (ret)
- return ret;
-
- ret = acpi_cpufreq_cpu_init(&xen_px_policy[i]);
- if (ret)
- return ret;
- }
+
+err2:
+ cpufreq_driver->exit(policy);
+err1:
+ for_each_cpu_mask(j, cpufreq_dom_map[dom]) {
+ cpufreq_cpu_policy[j] = NULL;
+ px_statistic_exit(j);
+ }
+
+ cpus_clear(cpufreq_dom_map[dom]);
+ xfree(policy);
return ret;
}

-int cpufreq_dom_dbs(unsigned int event)
-{
- unsigned int cpu, dom;
+int cpufreq_del_cpu(unsigned int cpu)
+{
+ unsigned int dom;
+ struct cpufreq_policy *policy;
+ struct processor_performance *perf = &processor_pminfo[cpu].perf;
+
+ /* to protect the case when Px was not controlled by xen */
+ if (!(perf->init & XEN_PX_INIT))
+ return 0;
+
+ if (cpu_is_offline(cpu) || !cpufreq_cpu_policy[cpu])
+ return -EINVAL;
+
+ dom = perf->domain_info.domain;
+ policy = cpufreq_cpu_policy[cpu];
+
+ printk(KERN_EMERG"deleting CPU %u\n", cpu);
+
+ /* for the first cpu of the domain, stop gov */
+ if (cpus_weight(cpufreq_dom_map[dom]) ==
+ perf->domain_info.num_processors)
+ __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
+
+ cpufreq_cpu_policy[cpu] = NULL;
+ cpu_clear(cpu, policy->cpus);
+ cpu_clear(cpu, cpufreq_dom_map[dom]);
+ px_statistic_exit(cpu);
+
+ /* for the last cpu of the domain, clean room */
+ /* It's safe here to free freq_table, drv_data and policy */
+ if (!cpus_weight(cpufreq_dom_map[dom])) {
+ cpufreq_driver->exit(policy);
+ xfree(policy);
+ }
+
+ return 0;
+}
+
+static int __init cpufreq_driver_init(void)
+{
int ret = 0;

- for (dom = 0; dom < cpufreq_dom_max; dom++) {
- if (!test_bit(dom, cpufreq_dom_mask))
- continue;
- cpu = first_cpu(cpufreq_dom_pt[dom]);
- ret = cpufreq_governor_dbs(&xen_px_policy[cpu], event);
- if (ret)
- return ret;
- }
+ if ((cpufreq_controller == FREQCTL_xen) &&
+ (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL))
+ ret = cpufreq_register_driver(&acpi_cpufreq_driver);
+
return ret;
}
-
-int acpi_cpufreq_init(void)
-{
- int ret = 0;
-
- /* setup cpumask of psd dom and shared cpu map of cpu */
- ret = cpufreq_dom_init();
- if (ret)
- goto err;
-
- /* setup cpufreq driver */
- cpufreq_driver = &acpi_cpufreq_driver;
-
- /* setup cpufreq infrastructure */
- ret = cpufreq_cpu_init();
- if (ret)
- goto err;
-
- /* setup cpufreq dbs according to dom coordiation */
- ret = cpufreq_dom_dbs(CPUFREQ_GOV_START);
- if (ret)
- goto err;
-
- return ret;
-
-err:
- cpufreq_dom_exit();
- return ret;
-}
+__initcall(cpufreq_driver_init);
diff -r ec8eaab557d8 -r 4a381ddc764a xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c
--- a/xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c Tue Sep 16 21:25:30 2008 +0900
@@ -238,4 +238,9 @@ int cpufreq_governor_dbs(struct cpufreq_
break;
}
return 0;
-}
+}
+
+struct cpufreq_governor cpufreq_gov_dbs = {
+ .name = "ondemand",
+ .governor = cpufreq_governor_dbs,
+};
diff -r ec8eaab557d8 -r 4a381ddc764a xen/arch/x86/acpi/cpufreq/powernow.c
--- a/xen/arch/x86/acpi/cpufreq/powernow.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/powernow.c Tue Sep 16 21:25:30 2008 +0900
@@ -50,7 +50,7 @@
#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */

extern struct processor_pminfo processor_pminfo[NR_CPUS];
-extern struct cpufreq_policy xen_px_policy[NR_CPUS];
+extern struct cpufreq_policy *cpufreq_cpu_policy[NR_CPUS];

struct powernow_cpufreq_data {
struct processor_performance *acpi_data;
@@ -281,9 +281,9 @@ int powernow_cpufreq_init(void)

/* setup cpufreq infrastructure */
for_each_online_cpu(i) {
- xen_px_policy[i].cpu = i;
-
- ret = powernow_cpufreq_cpu_init(&xen_px_policy[i]);
+ cpufreq_cpu_policy[i]->cpu = i;
+
+ ret = powernow_cpufreq_cpu_init(cpufreq_cpu_policy[i]);
if (ret)
goto cpufreq_init_out;
}
@@ -293,7 +293,7 @@ int powernow_cpufreq_init(void)
if (!cpu_isset(dom, dom_mask))
continue;
i = first_cpu(pt[dom]);
- ret = cpufreq_governor_dbs(&xen_px_policy[i], CPUFREQ_GOV_START);
+ ret = cpufreq_governor_dbs(cpufreq_cpu_policy[i], CPUFREQ_GOV_START);
if (ret)
goto cpufreq_init_out;
}
diff -r ec8eaab557d8 -r 4a381ddc764a xen/arch/x86/acpi/cpufreq/utility.c
--- a/xen/arch/x86/acpi/cpufreq/utility.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/utility.c Tue Sep 16 21:25:30 2008 +0900
@@ -31,46 +31,13 @@
#include <acpi/cpufreq/cpufreq.h>
#include <public/sysctl.h>

-struct cpufreq_driver *cpufreq_driver;
+struct cpufreq_driver *cpufreq_driver;
+struct processor_pminfo processor_pminfo[NR_CPUS];
+struct cpufreq_policy *cpufreq_cpu_policy[NR_CPUS];

/*********************************************************************
* Px STATISTIC INFO *
*********************************************************************/
-
-void px_statistic_suspend(void)
-{
- int cpu;
- uint64_t now;
-
- now = NOW();
-
- for_each_online_cpu(cpu) {
- struct pm_px *pxpt = &px_statistic_data[cpu];
- uint64_t total_idle_ns;
- uint64_t tmp_idle_ns;
-
- total_idle_ns = get_cpu_idle_time(cpu);
- tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;
-
- pxpt->u.pt[pxpt->u.cur].residency +=
- now - pxpt->prev_state_wall;
- pxpt->u.pt[pxpt->u.cur].residency -= tmp_idle_ns;
- }
-}
-
-void px_statistic_resume(void)
-{
- int cpu;
- uint64_t now;
-
- now = NOW();
-
- for_each_online_cpu(cpu) {
- struct pm_px *pxpt = &px_statistic_data[cpu];
- pxpt->prev_state_wall = now;
- pxpt->prev_idle_wall = get_cpu_idle_time(cpu);
- }
-}

void px_statistic_update(cpumask_t cpumask, uint8_t from, uint8_t to)
{
@@ -101,7 +68,7 @@ void px_statistic_update(cpumask_t cpuma
}
}

-int px_statistic_init(int cpuid)
+int px_statistic_init(unsigned int cpuid)
{
uint32_t i, count;
struct pm_px *pxpt = &px_statistic_data[cpuid];
@@ -123,7 +90,7 @@ int px_statistic_init(int cpuid)
memset(pxpt->u.pt, 0, count * (sizeof(struct pm_px_val)));

pxpt->u.total = pmpt->perf.state_count;
- pxpt->u.usable = pmpt->perf.state_count - pmpt->perf.ppc;
+ pxpt->u.usable = pmpt->perf.state_count - pmpt->perf.platform_limit;

for (i=0; i < pmpt->perf.state_count; i++)
pxpt->u.pt[i].freq = pmpt->perf.states[i].core_frequency;
@@ -134,7 +101,16 @@ int px_statistic_init(int cpuid)
return 0;
}

-void px_statistic_reset(int cpuid)
+void px_statistic_exit(unsigned int cpuid)
+{
+ struct pm_px *pxpt = &px_statistic_data[cpuid];
+
+ xfree(pxpt->u.trans_pt);
+ xfree(pxpt->u.pt);
+ memset(pxpt, 0, sizeof(struct pm_px));
+}
+
+void px_statistic_reset(unsigned int cpuid)
{
uint32_t i, j, count;
struct pm_px *pxpt = &px_statistic_data[cpuid];
@@ -182,6 +158,38 @@ int cpufreq_frequency_table_cpuinfo(stru
return -EINVAL;
else
return 0;
+}
+
+int cpufreq_frequency_table_verify(struct cpufreq_policy *policy,
+ struct cpufreq_frequency_table *table)
+{
+ unsigned int next_larger = ~0;
+ unsigned int i;
+ unsigned int count = 0;
+
+ if (!cpu_online(policy->cpu))
+ return -EINVAL;
+
+ cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
+ policy->cpuinfo.max_freq);
+
+ for (i=0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
+ unsigned int freq = table[i].frequency;
+ if (freq == CPUFREQ_ENTRY_INVALID)
+ continue;
+ if ((freq >= policy->min) && (freq <= policy->max))
+ count++;
+ else if ((next_larger > freq) && (freq > policy->max))
+ next_larger = freq;
+ }
+
+ if (!count)
+ policy->max = next_larger;
+
+ cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
+ policy->cpuinfo.max_freq);
+
+ return 0;
}

int cpufreq_frequency_table_target(struct cpufreq_policy *policy,
@@ -289,57 +297,51 @@ int __cpufreq_driver_getavg(struct cpufr


/*********************************************************************
- * CPUFREQ SUSPEND/RESUME *
- *********************************************************************/
-
-void cpufreq_suspend(void)
-{
- int cpu;
-
- /* to protect the case when Px was not controlled by xen */
- for_each_online_cpu(cpu) {
- struct processor_performance *perf = &processor_pminfo[cpu].perf;
-
- if (!(perf->init & XEN_PX_INIT))
- return;
- }
-
- cpufreq_dom_dbs(CPUFREQ_GOV_STOP);
-
- cpufreq_dom_exit();
-
- px_statistic_suspend();
-}
-
-int cpufreq_resume(void)
-{
- int cpu, ret = 0;
-
- /* 1. to protect the case when Px was not controlled by xen */
- /* 2. set state and resume flag to sync cpu to right state and freq */
- for_each_online_cpu(cpu) {
- struct processor_performance *perf = &processor_pminfo[cpu].perf;
- struct cpufreq_policy *policy = &xen_px_policy[cpu];
-
- if (!(perf->init & XEN_PX_INIT))
- goto err;
- perf->state = 0;
- policy->resume = 1;
- }
-
- px_statistic_resume();
-
- ret = cpufreq_dom_init();
+ * POLICY *
+ *********************************************************************/
+
+/*
+ * data : current policy.
+ * policy : policy to be set.
+ */
+int __cpufreq_set_policy(struct cpufreq_policy *data,
+ struct cpufreq_policy *policy)
+{
+ int ret = 0;
+
+ memcpy(&policy->cpuinfo, &data->cpuinfo, sizeof(struct cpufreq_cpuinfo));
+
+ if (policy->min > data->min && policy->min > policy->max)
+ return -EINVAL;
+
+ /* verify the cpu speed can be set within this limit */
+ ret = cpufreq_driver->verify(policy);
if (ret)
- goto err;
-
- ret = cpufreq_dom_dbs(CPUFREQ_GOV_START);
- if (ret)
- goto err;
-
- return ret;
-
-err:
- cpufreq_dom_exit();
- return ret;
-}
+ return ret;
+
+ data->min = policy->min;
+ data->max = policy->max;
+
+ if (policy->governor != data->governor) {
+ /* save old, working values */
+ struct cpufreq_governor *old_gov = data->governor;
+
+ /* end old governor */
+ if (data->governor)
+ __cpufreq_governor(data, CPUFREQ_GOV_STOP);
+
+ /* start new governor */
+ data->governor = policy->governor;
+ if (__cpufreq_governor(data, CPUFREQ_GOV_START)) {
+ /* new governor failed, so re-start old one */
+ if (old_gov) {
+ data->governor = old_gov;
+ __cpufreq_governor(data, CPUFREQ_GOV_START);
+ }
+ return -EINVAL;
+ }
+ /* might be a policy change, too, so fall through */
+ }
+
+ return __cpufreq_governor(data, CPUFREQ_GOV_LIMITS);
+}
diff -r ec8eaab557d8 -r 4a381ddc764a xen/arch/x86/acpi/pmstat.c
--- a/xen/arch/x86/acpi/pmstat.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/arch/x86/acpi/pmstat.c Tue Sep 16 21:25:30 2008 +0900
@@ -78,7 +78,7 @@ int do_get_pm_info(struct xen_sysctl_get
tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;

now = NOW();
- pxpt->u.usable = pmpt->perf.state_count - pmpt->perf.ppc;
+ pxpt->u.usable = pmpt->perf.state_count - pmpt->perf.platform_limit;
pxpt->u.pt[pxpt->u.cur].residency += now - pxpt->prev_state_wall;
pxpt->u.pt[pxpt->u.cur].residency -= tmp_idle_ns;
pxpt->prev_state_wall = now;
diff -r ec8eaab557d8 -r 4a381ddc764a xen/arch/x86/acpi/power.c
--- a/xen/arch/x86/acpi/power.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/arch/x86/acpi/power.c Tue Sep 16 21:25:30 2008 +0900
@@ -133,14 +133,14 @@ static int enter_state(u32 state)

freeze_domains();

- cpufreq_suspend();
-
disable_nonboot_cpus();
if ( num_online_cpus() != 1 )
{
error = -EBUSY;
goto enable_cpu;
}
+
+ cpufreq_del_cpu(0);

hvm_cpu_down();

@@ -189,8 +189,8 @@ static int enter_state(u32 state)
BUG();

enable_cpu:
+ cpufreq_add_cpu(0);
enable_nonboot_cpus();
- cpufreq_resume();
thaw_domains();
spin_unlock(&pm_lock);
return error;
diff -r ec8eaab557d8 -r 4a381ddc764a xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/arch/x86/domain.c Tue Sep 16 21:25:30 2008 +0900
@@ -86,6 +86,12 @@ static void default_idle(void)

static void play_dead(void)
{
+ /*
+ * Flush pending softirqs if any. They can be queued up before this CPU
+ * was taken out of cpu_online_map in __cpu_disable().
+ */
+ do_softirq();
+
/* This must be done before dead CPU ack */
cpu_exit_clear();
hvm_cpu_down();
diff -r ec8eaab557d8 -r 4a381ddc764a xen/arch/x86/microcode.c
--- a/xen/arch/x86/microcode.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/arch/x86/microcode.c Tue Sep 16 21:25:30 2008 +0900
@@ -1,72 +1,24 @@
/*
- * Intel CPU Microcode Update Driver for Linux
+ * Intel CPU Microcode Update Driver for Linux
*
- * Copyright (C) 2000-2004 Tigran Aivazian
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * 2006 Shaohua Li <shaohua.li@intel.com> *
+ * This driver allows to upgrade microcode on Intel processors
+ * belonging to IA-32 family - PentiumPro, Pentium II,
+ * Pentium III, Xeon, Pentium 4, etc.
*
- * This driver allows to upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
+ * Reference: Section 8.11 of Volume 3a, IA-32 Intel(R) Architecture
+ * Software Developer's Manual
+ * Order Number 253668 or free download from:
*
- * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
- * Order Number 245472 or free download from:
- *
- * http://developer.intel.com/design/pentium4/manuals/245472.htm
+ * http://developer.intel.com/design/pentium4/manuals/253668.htm
*
- * For more information, go to http://www.urbanmyth.org/microcode
+ * For more information, go to http://www.urbanmyth.org/microcode
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Initial release.
- * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added read() support + cleanups.
- * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added 'device trimming' support. open(O_WRONLY) zeroes
- * and frees the saved copy of applied microcode.
- * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Made to use devfs (/dev/cpu/microcode) + cleanups.
- * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Added misc device support (now uses both devfs and misc).
- * Added MICROCODE_IOCFREE ioctl to clear memory.
- * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Messages for error cases (non Intel & no suitable microcode).
- * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- * Removed ->release(). Removed exclusive open and status bitmap.
- * Added microcode_rwsem to serialize read()/write()/ioctl().
- * Removed global kernel lock usage.
- * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- * Write 0 to 0x8B msr and then cpuid before reading revision,
- * so that it works even if there were no update done by the
- * BIOS. Otherwise, reading from 0x8B gives junk (which happened
- * to be 0 on my machine which is why it worked even when I
- * disabled update by the BIOS)
- * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>
- * Intel Pentium 4 processor support and bugfixes.
- * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- * Bugfix for HT (Hyper-Threading) enabled processors
- * whereby processor resources are shared by all logical processors
- * in a single CPU package.
- * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>,
- * Serialize updates as required on HT processors due to speculative
- * nature of implementation.
- * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- * Fix the panic when writing zero-length microcode chunk.
- * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- * Jun Nakajima <jun.nakajima@intel.com>
- * Support for the microcode updates in the new format.
- * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- * because we no longer hold a copy of applied microcode
- * in kernel memory.
- * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- * Fix sigmatch() macro to handle old CPUs with pf == 0.
- * Thanks to Stuart Swales for pointing out this bug.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
*/

#include <xen/config.h>
@@ -76,402 +28,169 @@
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/spinlock.h>
+#include <xen/guest_access.h>

#include <asm/current.h>
#include <asm/msr.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
+#include <asm/microcode.h>

-#define pr_debug(x...) ((void)0)
-#define DEFINE_MUTEX(_m) DEFINE_SPINLOCK(_m)
-#define mutex_lock(_m) spin_lock(_m)
-#define mutex_unlock(_m) spin_unlock(_m)
-#define vmalloc(_s) xmalloc_bytes(_s)
-#define vfree(_p) xfree(_p)
+const struct microcode_ops *microcode_ops;

-#if 0
-MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
-MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
-MODULE_LICENSE("GPL");
-#endif
+static DEFINE_SPINLOCK(microcode_mutex);

-static int verbose;
-boolean_param("microcode.verbose", verbose);
+struct ucode_cpu_info ucode_cpu_info[NR_CPUS];

-#define MICROCODE_VERSION "1.14a"
+struct microcode_buffer {
+ void *buf;
+ size_t size;
+};

-#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
-#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
-#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
-#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
-#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
-#define DWSIZE (sizeof (u32))
-#define get_totalsize(mc) \
- (((microcode_t *)mc)->hdr.totalsize ? \
- ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
-#define get_datasize(mc) \
- (((microcode_t *)mc)->hdr.datasize ? \
- ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
+static struct microcode_buffer microcode_buffer;
+static bool_t microcode_error;

-#define sigmatch(s1, s2, p1, p2) \
- (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
+static void microcode_fini_cpu(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;

-#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-
-/* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
-/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
-static DEFINE_MUTEX(microcode_mutex);
-
-static const void __user *user_buffer; /* user area microcode data buffer */
-static unsigned int user_buffer_size; /* it's size */
-
-typedef enum mc_error_code {
- MC_SUCCESS = 0,
- MC_IGNORED = 1,
- MC_NOTFOUND = 2,
- MC_MARKED = 3,
- MC_ALLOCATED = 4,
-} mc_error_code_t;
-
-static struct ucode_cpu_info {
- unsigned int sig;
- unsigned int pf, orig_pf;
- unsigned int rev;
- unsigned int cksum;
- mc_error_code_t err;
- microcode_t *mc;
-} ucode_cpu_info[NR_CPUS];
-
-static void collect_cpu_info (void *unused)
-{
- int cpu_num = smp_processor_id();
- struct cpuinfo_x86 *c = cpu_data + cpu_num;
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- unsigned int val[2];
-
- uci->sig = uci->pf = uci->rev = uci->cksum = 0;
- uci->err = MC_NOTFOUND;
- uci->mc = NULL;
-
- if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
- cpu_has(c, X86_FEATURE_IA64)) {
- printk(KERN_ERR "microcode: CPU%d not a capable Intel processor\n", cpu_num);
- return;
- } else {
- uci->sig = cpuid_eax(0x00000001);
-
- if ((c->x86_model >= 5) || (c->x86 > 6)) {
- /* get processor flags from MSR 0x17 */
- rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- uci->pf = 1 << ((val[1] >> 18) & 7);
- }
- uci->orig_pf = uci->pf;
- }
-
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
- /* see notes above for revision 1.07. Apparent chip bug */
- sync_core();
- /* get the current revision from MSR 0x8B */
- rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
- pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
- uci->sig, uci->pf, uci->rev);
+ spin_lock(&microcode_mutex);
+ xfree(uci->mc.valid_mc);
+ uci->mc.valid_mc = NULL;
+ uci->valid = 0;
+ spin_unlock(&microcode_mutex);
}

-static inline void mark_microcode_update (int cpu_num, microcode_header_t *mc_header, int sig, int pf, int cksum)
+static int collect_cpu_info(int cpu)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
+ int err = 0;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;

- pr_debug("Microcode Found.\n");
- pr_debug(" Header Revision 0x%x\n", mc_header->hdrver);
- pr_debug(" Loader Revision 0x%x\n", mc_header->ldrver);
- pr_debug(" Revision 0x%x \n", mc_header->rev);
- pr_debug(" Date %x/%x/%x\n",
- ((mc_header->date >> 24 ) & 0xff),
- ((mc_header->date >> 16 ) & 0xff),
- (mc_header->date & 0xFFFF));
- pr_debug(" Signature 0x%x\n", sig);
- pr_debug(" Type 0x%x Family 0x%x Model 0x%x Stepping 0x%x\n",
- ((sig >> 12) & 0x3),
- ((sig >> 8) & 0xf),
- ((sig >> 4) & 0xf),
- ((sig & 0xf)));
- pr_debug(" Processor Flags 0x%x\n", pf);
- pr_debug(" Checksum 0x%x\n", cksum);
+ memset(uci, 0, sizeof(*uci));
+ err = microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig);
+ if ( !err )
+ uci->valid = 1;

- if (mc_header->rev < uci->rev) {
- if (uci->err == MC_NOTFOUND) {
- uci->err = MC_IGNORED;
- uci->cksum = mc_header->rev;
- } else if (uci->err == MC_IGNORED && uci->cksum < mc_header->rev)
- uci->cksum = mc_header->rev;
- } else if (mc_header->rev == uci->rev) {
- if (uci->err < MC_MARKED) {
- /* notify the caller of success on this cpu */
- uci->err = MC_SUCCESS;
- }
- } else if (uci->err != MC_ALLOCATED || mc_header->rev > uci->mc->hdr.rev) {
- pr_debug("microcode: CPU%d found a matching microcode update with "
- " revision 0x%x (current=0x%x)\n", cpu_num, mc_header->rev, uci->rev);
- uci->cksum = cksum;
- uci->pf = pf; /* keep the original mc pf for cksum calculation */
- uci->err = MC_MARKED; /* found the match */
- for_each_online_cpu(cpu_num) {
- if (ucode_cpu_info + cpu_num != uci
- && ucode_cpu_info[cpu_num].mc == uci->mc) {
- uci->mc = NULL;
- break;
- }
- }
- if (uci->mc != NULL) {
- vfree(uci->mc);
- uci->mc = NULL;
- }
- }
- return;
+ return err;
}

-static int find_matching_ucodes (void)
+static int microcode_resume_cpu(int cpu)
{
- int cursor = 0;
- int error = 0;
+ int err = 0;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct cpu_signature nsig;

- while (cursor + MC_HEADER_SIZE < user_buffer_size) {
- microcode_header_t mc_header;
- void *newmc = NULL;
- int i, sum, cpu_num, allocated_flag, total_size, data_size, ext_table_size;
+ gdprintk(XENLOG_INFO, "microcode: CPU%d resumed\n", cpu);

- if (copy_from_user(&mc_header, user_buffer + cursor, MC_HEADER_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read user data\n");
- error = -EFAULT;
- goto out;
- }
+ if ( !uci->mc.valid_mc )
+ return -EIO;

- total_size = get_totalsize(&mc_header);
- if (cursor + total_size > user_buffer_size) {
- printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
- error = -EINVAL;
- goto out;
- }
+ /*
+ * Let's verify that the 'cached' ucode does belong
+ * to this cpu (a bit of paranoia):
+ */
+ err = microcode_ops->collect_cpu_info(cpu, &nsig);
+ if ( err )
+ {
+ microcode_fini_cpu(cpu);
+ return err;
+ }

- data_size = get_datasize(&mc_header);
- if (data_size + MC_HEADER_SIZE > total_size) {
- printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
- error = -EINVAL;
- goto out;
- }
+ if ( memcmp(&nsig, &uci->cpu_sig, sizeof(nsig)) )
+ {
+ microcode_fini_cpu(cpu);
+ /* Should we look for a new ucode here? */
+ return -EIO;
+ }

- if (mc_header.ldrver != 1 || mc_header.hdrver != 1) {
- printk(KERN_ERR "microcode: error! Unknown microcode update format\n");
- error = -EINVAL;
- goto out;
- }
+ err = microcode_ops->apply_microcode(cpu);

- for_each_online_cpu(cpu_num) {
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
- if (sigmatch(mc_header.sig, uci->sig, mc_header.pf, uci->orig_pf))
- mark_microcode_update(cpu_num, &mc_header, mc_header.sig, mc_header.pf, mc_header.cksum);
- }
-
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- struct extended_sigtable ext_header;
- struct extended_signature ext_sig;
- int ext_sigcount;
-
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
- printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
- error = -EINVAL;
- goto out;
- }
- if (copy_from_user(&ext_header, user_buffer + cursor
- + MC_HEADER_SIZE + data_size, EXT_HEADER_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read user data\n");
- error = -EFAULT;
- goto out;
- }
- if (ext_table_size != exttable_size(&ext_header)) {
- printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
- error = -EFAULT;
- goto out;
- }
-
- ext_sigcount = ext_header.count;
-
- for (i = 0; i < ext_sigcount; i++) {
- if (copy_from_user(&ext_sig, user_buffer + cursor + MC_HEADER_SIZE + data_size + EXT_HEADER_SIZE
- + EXT_SIGNATURE_SIZE * i, EXT_SIGNATURE_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read user data\n");
- error = -EFAULT;
- goto out;
- }
- for_each_online_cpu(cpu_num) {
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
- if (sigmatch(ext_sig.sig, uci->sig, ext_sig.pf, uci->orig_pf)) {
- mark_microcode_update(cpu_num, &mc_header, ext_sig.sig, ext_sig.pf, ext_sig.cksum);
- }
- }
- }
- }
- /* now check if any cpu has matched */
- allocated_flag = 0;
- sum = 0;
- for_each_online_cpu(cpu_num) {
- if (ucode_cpu_info[cpu_num].err == MC_MARKED) {
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- if (!allocated_flag) {
- allocated_flag = 1;
- newmc = vmalloc(total_size);
- if (!newmc) {
- printk(KERN_ERR "microcode: error! Can not allocate memory\n");
- error = -ENOMEM;
- goto out;
- }
- if (copy_from_user(newmc + MC_HEADER_SIZE,
- user_buffer + cursor + MC_HEADER_SIZE,
- total_size - MC_HEADER_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read user data\n");
- vfree(newmc);
- error = -EFAULT;
- goto out;
- }
- memcpy(newmc, &mc_header, MC_HEADER_SIZE);
- /* check extended table checksum */
- if (ext_table_size) {
- int ext_table_sum = 0;
- int * ext_tablep = (((void *) newmc) + MC_HEADER_SIZE + data_size);
- i = ext_table_size / DWSIZE;
- while (i--) ext_table_sum += ext_tablep[i];
- if (ext_table_sum) {
- printk(KERN_WARNING "microcode: aborting, bad extended signature table checksum\n");
- vfree(newmc);
- error = -EINVAL;
- goto out;
- }
- }
-
- /* calculate the checksum */
- i = (MC_HEADER_SIZE + data_size) / DWSIZE;
- while (i--) sum += ((int *)newmc)[i];
- sum -= (mc_header.sig + mc_header.pf + mc_header.cksum);
- }
- ucode_cpu_info[cpu_num].mc = newmc;
- ucode_cpu_info[cpu_num].err = MC_ALLOCATED; /* mc updated */
- if (sum + uci->sig + uci->pf + uci->cksum != 0) {
- printk(KERN_ERR "microcode: CPU%d aborting, bad checksum\n", cpu_num);
- error = -EINVAL;
- goto out;
- }
- }
- }
- cursor += total_size; /* goto the next update patch */
- } /* end of while */
-out:
- return error;
+ return err;
}

-static void do_update_one (void * unused)
+static int microcode_update_cpu(int cpu, const void *buf, size_t size)
{
- unsigned long flags;
- unsigned int val[2];
- int cpu_num = smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
+ int err = 0;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;

- if (uci->mc == NULL) {
- if (verbose) {
- if (uci->err == MC_SUCCESS)
- printk(KERN_INFO "microcode: CPU%d already at revision 0x%x\n",
- cpu_num, uci->rev);
- else
- printk(KERN_INFO "microcode: No new microcode data for CPU%d\n", cpu_num);
- }
- return;
- }
+ /* We should bind the task to the CPU */
+ BUG_ON(raw_smp_processor_id() != cpu);

- /* serialize access to the physical write to MSR 0x79 */
- spin_lock_irqsave(&microcode_update_lock, flags);
+ spin_lock(&microcode_mutex);

- /* write microcode via MSR 0x79 */
- wrmsr(MSR_IA32_UCODE_WRITE,
- (unsigned long) uci->mc->bits,
- (unsigned long) uci->mc->bits >> 16 >> 16);
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+ /*
+ * Check if the system resume is in progress (uci->valid != NULL),
+ * otherwise just request a firmware:
+ */
+ if ( uci->valid )
+ {
+ err = microcode_resume_cpu(cpu);
+ }
+ else
+ {
+ err = collect_cpu_info(cpu);
+ if ( !err && uci->valid )
+ err = microcode_ops->cpu_request_microcode(cpu, buf, size);
+ }

- /* see notes above for revision 1.07. Apparent chip bug */
- sync_core();
+ spin_unlock(&microcode_mutex);

- /* get the current revision from MSR 0x8B */
- rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
- /* notify the caller of success on this cpu */
- uci->err = MC_SUCCESS;
- spin_unlock_irqrestore(&microcode_update_lock, flags);
- printk(KERN_INFO "microcode: CPU%d updated from revision "
- "0x%x to 0x%x, date = %08x \n",
- cpu_num, uci->rev, val[1], uci->mc->hdr.date);
- return;
+ return err;
}

-static int do_microcode_update (void)
+static void do_microcode_update_one(void *info)
{
- int i, error;
+ int error = microcode_update_cpu(
+ smp_processor_id(), microcode_buffer.buf, microcode_buffer.size);
+ if ( error )
+ microcode_error = error;
+}

- if (on_each_cpu(collect_cpu_info, NULL, 1, 1) != 0) {
- printk(KERN_ERR "microcode: Error! Could not run on all processors\n");
- error = -EIO;
- goto out;
- }
+static int do_microcode_update(void)
+{
+ microcode_error = 0;

- if ((error = find_matching_ucodes())) {
- printk(KERN_ERR "microcode: Error in the microcode data\n");
- goto out_free;
- }
+ if ( on_each_cpu(do_microcode_update_one, NULL, 1, 1) != 0 )
+ {
+ printk(KERN_ERR "microcode: Error! Could not run on all processors\n");
+ return -EIO;
+ }

- if (on_each_cpu(do_update_one, NULL, 1, 1) != 0) {
- printk(KERN_ERR "microcode: Error! Could not run on all processors\n");
- error = -EIO;
- }
-
-out_free:
- for_each_online_cpu(i) {
- if (ucode_cpu_info[i].mc) {
- int j;
- void *tmp = ucode_cpu_info[i].mc;
- vfree(tmp);
- for_each_online_cpu(j) {
- if (ucode_cpu_info[j].mc == tmp)
- ucode_cpu_info[j].mc = NULL;
- }
- }
- if (ucode_cpu_info[i].err == MC_IGNORED && verbose)
- printk(KERN_WARNING "microcode: CPU%d not 'upgrading' to earlier revision"
- " 0x%x (current=0x%x)\n", i, ucode_cpu_info[i].cksum, ucode_cpu_info[i].rev);
- }
-out:
- return error;
+ return microcode_error;
}

int microcode_update(XEN_GUEST_HANDLE(const_void) buf, unsigned long len)
{
- int ret;
+ int ret;

- if (len != (typeof(user_buffer_size))len) {
- printk(KERN_ERR "microcode: too much data\n");
- return -E2BIG;
- }
+ /* XXX FIXME: No allocations in interrupt context. */
+ return -EINVAL;

- mutex_lock(&microcode_mutex);
+ if ( len != (typeof(microcode_buffer.size))len )
+ {
+ printk(KERN_ERR "microcode: too much data\n");
+ return -E2BIG;
+ }

- user_buffer = buf.p;
- user_buffer_size = len;
+ if ( microcode_ops == NULL )
+ return -EINVAL;

- ret = do_microcode_update();
+ microcode_buffer.buf = xmalloc_array(uint8_t, len);
+ if ( microcode_buffer.buf == NULL )
+ return -ENOMEM;

- mutex_unlock(&microcode_mutex);
+ ret = copy_from_guest(microcode_buffer.buf, buf, len);
+ if ( ret != 0 )
+ return ret;

- return ret;
+ microcode_buffer.size = len;
+ wmb();
+
+ ret = do_microcode_update();
+
+ xfree(microcode_buffer.buf);
+ microcode_buffer.buf = NULL;
+ microcode_buffer.size = 0;
+
+ return ret;
}
diff -r ec8eaab557d8 -r 4a381ddc764a xen/arch/x86/microcode_amd.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/microcode_amd.c Tue Sep 16 21:25:30 2008 +0900
@@ -0,0 +1,371 @@
+/*
+ * AMD CPU Microcode Update Driver for Linux
+ * Copyright (C) 2008 Advanced Micro Devices Inc.
+ *
+ * Author: Peter Oruba <peter.oruba@amd.com>
+ *
+ * Based on work by:
+ * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *
+ * This driver allows to upgrade microcode on AMD
+ * family 0x10 and 0x11 processors.
+ *
+ * Licensed under the terms of the GNU General Public
+ * License version 2. See file COPYING for details.
+ */
+
+#include <xen/config.h>
+#include <xen/lib.h>
+#include <xen/kernel.h>
+#include <xen/init.h>
+#include <xen/sched.h>
+#include <xen/smp.h>
+#include <xen/spinlock.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/microcode.h>
+
+#define pr_debug(x...) ((void)0)
+
+#define UCODE_MAGIC 0x00414d44
+#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
+#define UCODE_UCODE_TYPE 0x00000001
+
+#define UCODE_MAX_SIZE (2048)
+#define DEFAULT_UCODE_DATASIZE (896)
+#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd))
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define DWSIZE (sizeof(uint32_t))
+/* For now we support a fixed ucode total size only */
+#define get_totalsize(mc) \
+ ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
+ + MC_HEADER_SIZE)
+
+/* serialize access to the physical write */
+static DEFINE_SPINLOCK(microcode_update_lock);
+
+struct equiv_cpu_entry *equiv_cpu_table;
+
+static long install_equiv_cpu_table(const void *, uint32_t, long);
+
+static int collect_cpu_info(int cpu, struct cpu_signature *csig)
+{
+ struct cpuinfo_x86 *c = &cpu_data[cpu];
+
+ memset(csig, 0, sizeof(*csig));
+
+ if ( (c->x86_vendor != X86_VENDOR_AMD) || (c->x86 < 0x10) )
+ {
+ printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n",
+ cpu);
+ return -1;
+ }
+
+ asm volatile (
+ "movl %1, %%ecx; rdmsr"
+ : "=a" (csig->rev)
+ : "i" (MSR_AMD_PATCHLEVEL) : "ecx" );
+
+ printk(KERN_INFO "microcode: collect_cpu_info: patch_id=0x%x\n",
+ csig->rev);
+
+ return 0;
+}
+
+static int get_matching_microcode(void *mc, int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct microcode_header_amd *mc_header = mc;
+ unsigned long total_size = get_totalsize(mc_header);
+ void *new_mc;
+ unsigned int current_cpu_id;
+ unsigned int equiv_cpu_id = 0x00;
+ unsigned int i;
+
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu != raw_smp_processor_id());
+
+ /* This is a tricky part. We might be called from a write operation
+ * to the device file instead of the usual process of firmware
+ * loading. This routine needs to be able to distinguish both
+ * cases. This is done by checking if there already is a equivalent
+ * CPU table installed. If not, we're written through
+ * /dev/cpu/microcode.
+ * Since we ignore all checks. The error case in which going through
+ * firmware loading and that table is not loaded has already been
+ * checked earlier.
+ */
+ if ( equiv_cpu_table == NULL )
+ {
+ printk(KERN_INFO "microcode: CPU%d microcode update with "
+ "version 0x%x (current=0x%x)\n",
+ cpu, mc_header->patch_id, uci->cpu_sig.rev);
+ goto out;
+ }
+
+ current_cpu_id = cpuid_eax(0x00000001);
+
+ for ( i = 0; equiv_cpu_table[i].installed_cpu != 0; i++ )
+ {
+ if ( current_cpu_id == equiv_cpu_table[i].installed_cpu )
+ {
+ equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
+ break;
+ }
+ }
+
+ if ( !equiv_cpu_id )
+ {
+ printk(KERN_ERR "microcode: CPU%d cpu_id "
+ "not found in equivalent cpu table \n", cpu);
+ return 0;
+ }
+
+ if ( (mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff) )
+ {
+ printk(KERN_INFO
+ "microcode: CPU%d patch does not match "
+ "(patch is %x, cpu extended is %x) \n",
+ cpu, mc_header->processor_rev_id[0],
+ (equiv_cpu_id & 0xff));
+ return 0;
+ }
+
+ if ( (mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff) )
+ {
+ printk(KERN_INFO "microcode: CPU%d patch does not match "
+ "(patch is %x, cpu base id is %x) \n",
+ cpu, mc_header->processor_rev_id[1],
+ ((equiv_cpu_id >> 16) & 0xff));
+ return 0;
+ }
+
+ if ( mc_header->patch_id <= uci->cpu_sig.rev )
+ return 0;
+
+ printk(KERN_INFO "microcode: CPU%d found a matching microcode "
+ "update with version 0x%x (current=0x%x)\n",
+ cpu, mc_header->patch_id, uci->cpu_sig.rev);
+
+ out:
+ new_mc = xmalloc_bytes(UCODE_MAX_SIZE);
+ if ( new_mc == NULL )
+ {
+ printk(KERN_ERR "microcode: error, can't allocate memory\n");
+ return -ENOMEM;
+ }
+ memset(new_mc, 0, UCODE_MAX_SIZE);
+
+ /* free previous update file */
+ xfree(uci->mc.mc_amd);
+
+ memcpy(new_mc, mc, total_size);
+
+ uci->mc.mc_amd = new_mc;
+ return 1;
+}
+
+static int apply_microcode(int cpu)
+{
+ unsigned long flags;
+ uint32_t eax, edx, rev;
+ int cpu_num = raw_smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
+ uint64_t addr;
+
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu_num != cpu);
+
+ if ( uci->mc.mc_amd == NULL )
+ return -EINVAL;
+
+ spin_lock_irqsave(&microcode_update_lock, flags);
+
+ addr = (unsigned long)&uci->mc.mc_amd->hdr.data_code;
+ edx = (uint32_t)(addr >> 32);
+ eax = (uint32_t)addr;
+
+ asm volatile (
+ "movl %0, %%ecx; wrmsr" :
+ : "i" (MSR_AMD_PATCHLOADER), "a" (eax), "d" (edx) : "ecx" );
+
+ /* get patch id after patching */
+ asm volatile (
+ "movl %1, %%ecx; rdmsr"
+ : "=a" (rev)
+ : "i" (MSR_AMD_PATCHLEVEL) : "ecx");
+
+ spin_unlock_irqrestore(&microcode_update_lock, flags);
+
+ /* check current patch id and patch's id for match */
+ if ( rev != uci->mc.mc_amd->hdr.patch_id )
+ {
+ printk(KERN_ERR "microcode: CPU%d update from revision "
+ "0x%x to 0x%x failed\n", cpu_num,
+ uci->mc.mc_amd->hdr.patch_id, rev);
+ return -EIO;
+ }
+
+ printk("microcode: CPU%d updated from revision "
+ "0x%x to 0x%x \n",
+ cpu_num, uci->cpu_sig.rev, uci->mc.mc_amd->hdr.patch_id);
+
+ uci->cpu_sig.rev = rev;
+
+ return 0;
+}
+
+static long get_next_ucode_from_buffer_amd(void **mc, const void *buf,
+ unsigned long size, long offset)
+{
+ struct microcode_header_amd *mc_header;
+ unsigned long total_size;
+ const uint8_t *buf_pos = buf;
+
+ /* No more data */
+ if ( offset >= size )
+ return 0;
+
+ if ( buf_pos[offset] != UCODE_UCODE_TYPE )
+ {
+ printk(KERN_ERR "microcode: error! "
+ "Wrong microcode payload type field\n");
+ return -EINVAL;
+ }
+
+ mc_header = (struct microcode_header_amd *)(&buf_pos[offset+8]);
+
+ total_size = (unsigned long) (buf_pos[offset+4] +
+ (buf_pos[offset+5] << 8));
+
+ printk(KERN_INFO "microcode: size %lu, total_size %lu, offset %ld\n",
+ size, total_size, offset);
+
+ if ( (offset + total_size) > size )
+ {
+ printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
+ return -EINVAL;
+ }
+
+ *mc = xmalloc_bytes(UCODE_MAX_SIZE);
+ if ( *mc == NULL )
+ {
+ printk(KERN_ERR "microcode: error! "
+ "Can not allocate memory for microcode patch\n");
+ return -ENOMEM;
+ }
+
+ memset(*mc, 0, UCODE_MAX_SIZE);
+ memcpy(*mc, (const void *)(buf + offset + 8), total_size);
+
+ return offset + total_size + 8;
+}
+
+static long install_equiv_cpu_table(const void *buf,
+ uint32_t size, long offset)
+{
+ const uint32_t *buf_pos = buf;
+
+ /* No more data */
+ if ( offset >= size )
+ return 0;
+
+ if ( buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE )
+ {
+ printk(KERN_ERR "microcode: error! "
+ "Wrong microcode equivalnet cpu table type field\n");
+ return 0;
+ }
+
+ if ( size == 0 )
+ {
+ printk(KERN_ERR "microcode: error! "
+ "Wrong microcode equivalnet cpu table length\n");
+ return 0;
+ }
+
+ equiv_cpu_table = xmalloc_bytes(size);
+ if ( equiv_cpu_table == NULL )
+ {
+ printk(KERN_ERR "microcode: error, can't allocate "
+ "memory for equiv CPU table\n");
+ return 0;
+ }
+
+ memset(equiv_cpu_table, 0, size);
+ memcpy(equiv_cpu_table, (const void *)&buf_pos[3], size);
+
+ return size + 12; /* add header length */
+}
+
+static int cpu_request_microcode(int cpu, const void *buf, size_t size)
+{
+ const uint32_t *buf_pos;
+ long offset = 0;
+ int error = 0;
+ void *mc;
+
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu != raw_smp_processor_id());
+
+ buf_pos = (const uint32_t *)buf;
+
+ if ( buf_pos[0] != UCODE_MAGIC )
+ {
+ printk(KERN_ERR "microcode: error! Wrong "
+ "microcode patch file magic\n");
+ return -EINVAL;
+ }
+
+ offset = install_equiv_cpu_table(buf, (uint32_t)(buf_pos[2]), offset);
+ if ( !offset )
+ {
+ printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
+ return -EINVAL;
+ }
+
+ while ( (offset =
+ get_next_ucode_from_buffer_amd(&mc, buf, size, offset)) > 0 )
+ {
+ error = get_matching_microcode(mc, cpu);
+ if ( error < 0 )
+ break;
+ /*
+ * It's possible the data file has multiple matching ucode,
+ * lets keep searching till the latest version
+ */
+ if ( error == 1 )
+ {
+ apply_microcode(cpu);
+ error = 0;
+ }
+ xfree(mc);
+ }
+ if ( offset > 0 )
+ {
+ xfree(mc);
+ xfree(equiv_cpu_table);
+ equiv_cpu_table = NULL;
+ }
+ if ( offset < 0 )
+ error = offset;
+
+ return error;
+}
+
+static struct microcode_ops microcode_amd_ops = {
+ .get_matching_microcode = get_matching_microcode,
+ .cpu_request_microcode = cpu_request_microcode,
+ .collect_cpu_info = collect_cpu_info,
+ .apply_microcode = apply_microcode,
+};
+
+static __init int microcode_init_amd(void)
+{
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
+ microcode_ops = &microcode_amd_ops;
+ return 0;
+}
+__initcall(microcode_init_amd);
diff -r ec8eaab557d8 -r 4a381ddc764a xen/arch/x86/microcode_intel.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/microcode_intel.c Tue Sep 16 21:25:30 2008 +0900
@@ -0,0 +1,370 @@
+/*
+ * Intel CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *               2006 Shaohua Li <shaohua.li@intel.com>
+ *
+ * This driver allows to upgrade microcode on Intel processors
+ * belonging to IA-32 family - PentiumPro, Pentium II,
+ * Pentium III, Xeon, Pentium 4, etc.
+ *
+ * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
+ * Software Developer's Manual
+ * Order Number 253668 or free download from:
+ *
+ * http://developer.intel.com/design/pentium4/manuals/253668.htm
+ *
+ * For more information, go to http://www.urbanmyth.org/microcode
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <xen/config.h>
+#include <xen/lib.h>
+#include <xen/kernel.h>
+#include <xen/init.h>
+#include <xen/sched.h>
+#include <xen/smp.h>
+#include <xen/spinlock.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/microcode.h>
+
+#define pr_debug(x...) ((void)0)
+
+#define DEFAULT_UCODE_DATASIZE (2000)
+#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
+#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
+#define DWSIZE (sizeof(u32))
+#define get_totalsize(mc) \
+ (((struct microcode_intel *)mc)->hdr.totalsize ? \
+ ((struct microcode_intel *)mc)->hdr.totalsize : \
+ DEFAULT_UCODE_TOTALSIZE)
+
+#define get_datasize(mc) \
+ (((struct microcode_intel *)mc)->hdr.datasize ? \
+ ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
+
+#define sigmatch(s1, s2, p1, p2) \
+ (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
+
+#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
+
+/* serialize access to the physical write to MSR 0x79 */
+static DEFINE_SPINLOCK(microcode_update_lock);
+
+static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
+{
+ struct cpuinfo_x86 *c = &cpu_data[cpu_num];
+ unsigned int val[2];
+
+ memset(csig, 0, sizeof(*csig));
+
+ if ( (c->x86_vendor != X86_VENDOR_INTEL) || (c->x86 < 6) ||
+ cpu_has(c, X86_FEATURE_IA64) )
+ {
+ printk(KERN_ERR "microcode: CPU%d not a capable Intel "
+ "processor\n", cpu_num);
+ return -1;
+ }
+
+ csig->sig = cpuid_eax(0x00000001);
+
+ if ( (c->x86_model >= 5) || (c->x86 > 6) )
+ {
+ /* get processor flags from MSR 0x17 */
+ rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ csig->pf = 1 << ((val[1] >> 18) & 7);
+ }
+
+ wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+ /* see notes above for revision 1.07. Apparent chip bug */
+ sync_core();
+ /* get the current revision from MSR 0x8B */
+ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
+ pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
+ csig->sig, csig->pf, csig->rev);
+
+ return 0;
+}
+
+static inline int microcode_update_match(
+ int cpu_num, struct microcode_header_intel *mc_header, int sig, int pf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
+
+ return (sigmatch(sig, uci->cpu_sig.sig, pf, uci->cpu_sig.pf) &&
+ (mc_header->rev > uci->cpu_sig.rev));
+}
+
+static int microcode_sanity_check(void *mc)
+{
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ struct extended_signature *ext_sig;
+ unsigned long total_size, data_size, ext_table_size;
+ int sum, orig_sum, ext_sigcount = 0, i;
+
+ total_size = get_totalsize(mc_header);
+ data_size = get_datasize(mc_header);
+ if ( (data_size + MC_HEADER_SIZE) > total_size )
+ {
+ printk(KERN_ERR "microcode: error! "
+ "Bad data size in microcode data file\n");
+ return -EINVAL;
+ }
+
+ if ( (mc_header->ldrver != 1) || (mc_header->hdrver != 1) )
+ {
+ printk(KERN_ERR "microcode: error! "
+ "Unknown microcode update format\n");
+ return -EINVAL;
+ }
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if ( ext_table_size )
+ {
+ if ( (ext_table_size < EXT_HEADER_SIZE) ||
+ ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE) )
+ {
+ printk(KERN_ERR "microcode: error! "
+ "Small exttable size in microcode data file\n");
+ return -EINVAL;
+ }
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if ( ext_table_size != exttable_size(ext_header) )
+ {
+ printk(KERN_ERR "microcode: error! "
+ "Bad exttable size in microcode data file\n");
+ return -EFAULT;
+ }
+ ext_sigcount = ext_header->count;
+ }
+
+ /* check extended table checksum */
+ if ( ext_table_size )
+ {
+ int ext_table_sum = 0;
+ int *ext_tablep = (int *)ext_header;
+
+ i = ext_table_size / DWSIZE;
+ while ( i-- )
+ ext_table_sum += ext_tablep[i];
+ if ( ext_table_sum )
+ {
+ printk(KERN_WARNING "microcode: aborting, "
+ "bad extended signature table checksum\n");
+ return -EINVAL;
+ }
+ }
+
+ /* calculate the checksum */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / DWSIZE;
+ while ( i-- )
+ orig_sum += ((int *)mc)[i];
+ if ( orig_sum )
+ {
+ printk(KERN_ERR "microcode: aborting, bad checksum\n");
+ return -EINVAL;
+ }
+ if ( !ext_table_size )
+ return 0;
+ /* check extended signature checksum */
+ for ( i = 0; i < ext_sigcount; i++ )
+ {
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+ EXT_SIGNATURE_SIZE * i;
+ sum = orig_sum
+ - (mc_header->sig + mc_header->pf + mc_header->cksum)
+ + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if ( sum )
+ {
+ printk(KERN_ERR "microcode: aborting, bad checksum\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ * return < 0 - error
+ */
+static int get_matching_microcode(void *mc, int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header;
+ unsigned long total_size = get_totalsize(mc_header);
+ int ext_sigcount, i;
+ struct extended_signature *ext_sig;
+ void *new_mc;
+
+ if ( microcode_update_match(cpu, mc_header,
+ mc_header->sig, mc_header->pf) )
+ goto find;
+
+ if ( total_size <= (get_datasize(mc_header) + MC_HEADER_SIZE) )
+ return 0;
+
+ ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
+ ext_sigcount = ext_header->count;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+ for ( i = 0; i < ext_sigcount; i++ )
+ {
+ if ( microcode_update_match(cpu, mc_header,
+ ext_sig->sig, ext_sig->pf) )
+ goto find;
+ ext_sig++;
+ }
+ return 0;
+ find:
+ pr_debug("microcode: CPU%d found a matching microcode update with"
+ " version 0x%x (current=0x%x)\n",
+ cpu, mc_header->rev, uci->cpu_sig.rev);
+ new_mc = xmalloc_bytes(total_size);
+ if ( new_mc == NULL )
+ {
+ printk(KERN_ERR "microcode: error! Can not allocate memory\n");
+ return -ENOMEM;
+ }
+
+ /* free previous update file */
+ xfree(uci->mc.mc_intel);
+
+ memcpy(new_mc, mc, total_size);
+ uci->mc.mc_intel = new_mc;
+ return 1;
+}
+
+static int apply_microcode(int cpu)
+{
+ unsigned long flags;
+ unsigned int val[2];
+ int cpu_num = raw_smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
+
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu_num != cpu);
+
+ if ( uci->mc.mc_intel == NULL )
+ return -EINVAL;
+
+ /* serialize access to the physical write to MSR 0x79 */
+ spin_lock_irqsave(&microcode_update_lock, flags);
+
+ /* write microcode via MSR 0x79 */
+ wrmsr(MSR_IA32_UCODE_WRITE,
+ (unsigned long) uci->mc.mc_intel->bits,
+ (unsigned long) uci->mc.mc_intel->bits >> 16 >> 16);
+ wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+ /* see notes above for revision 1.07. Apparent chip bug */
+ sync_core();
+
+ /* get the current revision from MSR 0x8B */
+ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+
+ spin_unlock_irqrestore(&microcode_update_lock, flags);
+ if ( val[1] != uci->mc.mc_intel->hdr.rev )
+ {
+ printk(KERN_ERR "microcode: CPU%d update from revision "
+ "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]);
+ return -EIO;
+ }
+ printk(KERN_INFO "microcode: CPU%d updated from revision "
+ "0x%x to 0x%x, date = %04x-%02x-%02x \n",
+ cpu_num, uci->cpu_sig.rev, val[1],
+ uci->mc.mc_intel->hdr.date & 0xffff,
+ uci->mc.mc_intel->hdr.date >> 24,
+ (uci->mc.mc_intel->hdr.date >> 16) & 0xff);
+ uci->cpu_sig.rev = val[1];
+
+ return 0;
+}
+
+static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
+ unsigned long size, long offset)
+{
+ struct microcode_header_intel *mc_header;
+ unsigned long total_size;
+
+ /* No more data */
+ if ( offset >= size )
+ return 0;
+ mc_header = (struct microcode_header_intel *)(buf + offset);
+ total_size = get_totalsize(mc_header);
+
+ if ( (offset + total_size) > size )
+ {
+ printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
+ return -EINVAL;
+ }
+
+ *mc = xmalloc_bytes(total_size);
+ if ( *mc == NULL )
+ {
+ printk(KERN_ERR "microcode: error! Can not allocate memory\n");
+ return -ENOMEM;
+ }
+ memcpy(*mc, (const void *)(buf + offset), total_size);
+ return offset + total_size;
+}
+
+static int cpu_request_microcode(int cpu, const void *buf, size_t size)
+{
+ long offset = 0;
+ int error = 0;
+ void *mc;
+
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu != raw_smp_processor_id());
+
+ while ( (offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) > 0 )
+ {
+ error = microcode_sanity_check(mc);
+ if ( error )
+ break;
+ error = get_matching_microcode(mc, cpu);
+ if ( error < 0 )
+ break;
+ /*
+ * It's possible the data file has multiple matching ucode,
+ * lets keep searching till the latest version
+ */
+ if ( error == 1 )
+ {
+ apply_microcode(cpu);
+ error = 0;
+ }
+ xfree(mc);
+ }
+ if ( offset > 0 )
+ xfree(mc);
+ if ( offset < 0 )
+ error = offset;
+
+ return error;
+}
+
+static struct microcode_ops microcode_intel_ops = {
+ .get_matching_microcode = get_matching_microcode,
+ .cpu_request_microcode = cpu_request_microcode,
+ .collect_cpu_info = collect_cpu_info,
+ .apply_microcode = apply_microcode,
+};
+
+static __init int microcode_init_intel(void)
+{
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+ microcode_ops = &microcode_intel_ops;
+ return 0;
+}
+__initcall(microcode_init_intel);
diff -r ec8eaab557d8 -r 4a381ddc764a xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/arch/x86/mm/shadow/common.c Tue Sep 16 21:25:30 2008 +0900
@@ -2385,11 +2385,13 @@ int sh_remove_write_access(struct vcpu *
+ ((fault_addr & VADDR_MASK) >> 27), 3); break;
}

- /* 64bit Linux direct map at 0xffff810000000000; older kernels
- * had it at 0x0000010000000000UL */
+ /* 64bit Linux direct map at 0xffff880000000000; older kernels
+ * had it at 0xffff810000000000, and older kernels yet had it
+ * at 0x0000010000000000UL */
gfn = mfn_to_gfn(v->domain, gmfn);
- GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
- GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
+ GUESS(0xffff880000000000UL + (gfn << PAGE_SHIFT), 4);
+ GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
+ GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
/*
* 64bit Solaris kernel page map at
* kpm_vbase; 0xfffffe0000000000UL
@@ -2462,22 +2464,25 @@ int sh_remove_write_access_from_sl1p(str
ASSERT(mfn_valid(smfn));
ASSERT(mfn_valid(gmfn));

- if ( sp->type == SH_type_l1_32_shadow )
+ if ( sp->type == SH_type_l1_32_shadow
+ || sp->type == SH_type_fl1_32_shadow )
{
return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2)
(v, gmfn, smfn, off);
}
#if CONFIG_PAGING_LEVELS >= 3
- else if ( sp->type == SH_type_l1_pae_shadow )
+ else if ( sp->type == SH_type_l1_pae_shadow
+ || sp->type == SH_type_fl1_pae_shadow )
return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3)
(v, gmfn, smfn, off);
#if CONFIG_PAGING_LEVELS >= 4
- else if ( sp->type == SH_type_l1_64_shadow )
+ else if ( sp->type == SH_type_l1_64_shadow
+ || sp->type == SH_type_fl1_64_shadow )
return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4)
(v, gmfn, smfn, off);
#endif
#endif
-
+
return 0;
}
#endif
diff -r ec8eaab557d8 -r 4a381ddc764a xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/arch/x86/mm/shadow/multi.c Tue Sep 16 21:25:30 2008 +0900
@@ -4539,7 +4539,8 @@ int sh_rm_write_access_from_sl1p(struct
sp = mfn_to_shadow_page(smfn);

if ( sp->mbz != 0
- || (sp->type != SH_type_l1_shadow) )
+ || (sp->type != SH_type_l1_shadow
+ && sp->type != SH_type_fl1_shadow) )
goto fail;

sl1p = sh_map_domain_page(smfn);
diff -r ec8eaab557d8 -r 4a381ddc764a xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/arch/x86/platform_hypercall.c Tue Sep 16 21:25:30 2008 +0900
@@ -393,7 +393,6 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
memcpy ((void *)&pxpt->status_register,
(void *)&xenpxpt->status_register,
sizeof(struct xen_pct_register));
- pxpt->init |= XEN_PX_PCT;
}
if ( xenpxpt->flags & XEN_PX_PSS )
{
@@ -411,7 +410,6 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
break;
}
pxpt->state_count = xenpxpt->state_count;
- pxpt->init |= XEN_PX_PSS;
}
if ( xenpxpt->flags & XEN_PX_PSD )
{
@@ -419,27 +417,34 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
memcpy ((void *)&pxpt->domain_info,
(void *)&xenpxpt->domain_info,
sizeof(struct xen_psd_package));
- pxpt->init |= XEN_PX_PSD;
}
if ( xenpxpt->flags & XEN_PX_PPC )
{
- pxpt->ppc = xenpxpt->ppc;
- pxpt->init |= XEN_PX_PPC;
- }
-
- if ( pxpt->init == ( XEN_PX_PCT | XEN_PX_PSS |
- XEN_PX_PSD | XEN_PX_PPC ) )
- {
- pxpt->init |= XEN_PX_INIT;
+ pxpt->platform_limit = xenpxpt->platform_limit;
+
+ if ( pxpt->init == XEN_PX_INIT )
+ {
+ ret = cpufreq_limit_change(cpuid);
+ break;
+ }
+ }
+
+ if ( xenpxpt->flags == ( XEN_PX_PCT | XEN_PX_PSS |
+ XEN_PX_PSD | XEN_PX_PPC ) )
+ {
+ pxpt->init = XEN_PX_INIT;
cpu_count++;
- }
- if ( cpu_count == num_online_cpus() )
- {
- if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
+
+ /* Currently we only handle Intel and AMD processor */
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+ ret = cpufreq_add_cpu(cpuid);
+ else if ( (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+ (cpu_count == num_online_cpus()) )
ret = powernow_cpufreq_init();
else
- ret = acpi_cpufreq_init();
- }
+ break;
+ }
+
break;
}

diff -r ec8eaab557d8 -r 4a381ddc764a xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/arch/x86/smpboot.c Tue Sep 16 21:25:30 2008 +0900
@@ -55,6 +55,7 @@
#include <mach_wakecpu.h>
#include <smpboot_hooks.h>
#include <xen/stop_machine.h>
+#include <acpi/cpufreq/processor_perf.h>

#define set_kernel_exec(x, y) (0)
#define setup_trampoline() (bootsym_phys(trampoline_realmode_entry))
@@ -1232,6 +1233,8 @@ int __cpu_disable(void)
mdelay(1);
local_irq_disable();

+ cpufreq_del_cpu(cpu);
+
time_suspend();

remove_siblinginfo(cpu);
@@ -1421,6 +1424,8 @@ int __devinit __cpu_up(unsigned int cpu)
mb();
process_pending_timers();
}
+
+ cpufreq_add_cpu(cpu);
return 0;
}

diff -r ec8eaab557d8 -r 4a381ddc764a xen/common/gdbstub.c
--- a/xen/common/gdbstub.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/common/gdbstub.c Tue Sep 16 21:25:30 2008 +0900
@@ -65,7 +65,7 @@ static void gdb_smp_pause(void);
static void gdb_smp_pause(void);
static void gdb_smp_resume(void);

-static char opt_gdb[30] = "none";
+static char opt_gdb[30];
string_param("gdb", opt_gdb);

static void gdbstub_console_puts(const char *str);
@@ -625,10 +625,19 @@ void __init
void __init
initialise_gdb(void)
{
+ if ( *opt_gdb == '\0' )
+ return;
+
gdb_ctx->serhnd = serial_parse_handle(opt_gdb);
- if ( gdb_ctx->serhnd != -1 )
- printk("GDB stub initialised.\n");
+ if ( gdb_ctx->serhnd == -1 )
+ {
+ printk("Bad gdb= option '%s'\n", opt_gdb);
+ return;
+ }
+
serial_start_sync(gdb_ctx->serhnd);
+
+ printk("GDB stub initialised.\n");
}

static void gdb_pause_this_cpu(void *unused)
diff -r ec8eaab557d8 -r 4a381ddc764a xen/drivers/char/console.c
--- a/xen/drivers/char/console.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/drivers/char/console.c Tue Sep 16 21:25:30 2008 +0900
@@ -543,10 +543,18 @@ void __init init_console(void)
{
if ( *p == ',' )
p++;
- if ( strncmp(p, "com", 3) == 0 )
- sercon_handle = serial_parse_handle(p);
- else if ( strncmp(p, "vga", 3) == 0 )
+ if ( !strncmp(p, "vga", 3) )
vga_init();
+ else if ( strncmp(p, "com", 3) ||
+ (sercon_handle = serial_parse_handle(p)) == -1 )
+ {
+ char *q = strchr(p, ',');
+ if ( q != NULL )
+ *q = '\0';
+ printk("Bad console= option '%s'\n", p);
+ if ( q != NULL )
+ *q = ',';
+ }
}

serial_set_rx_handler(sercon_handle, serial_rx);
diff -r ec8eaab557d8 -r 4a381ddc764a xen/drivers/char/ns16550.c
--- a/xen/drivers/char/ns16550.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/drivers/char/ns16550.c Tue Sep 16 21:25:30 2008 +0900
@@ -82,6 +82,7 @@ static struct ns16550 {
#define MCR_DTR 0x01 /* Data Terminal Ready */
#define MCR_RTS 0x02 /* Request to Send */
#define MCR_OUT2 0x08 /* OUT2: interrupt mask */
+#define MCR_LOOP 0x10 /* Enable loopback test mode */

/* Line Status Register */
#define LSR_DR 0x01 /* Data ready */
@@ -293,6 +294,37 @@ static int __init parse_parity_char(int
return PARITY_SPACE;
}
return 0;
+}
+
+static int check_existence(struct ns16550 *uart)
+{
+ unsigned char status, scratch, scratch2, scratch3;
+
+ /*
+ * Do a simple existence test first; if we fail this,
+ * there's no point trying anything else.
+ */
+ scratch = ns_read_reg(uart, IER);
+ ns_write_reg(uart, IER, 0);
+
+ /*
+ * Mask out IER[7:4] bits for test as some UARTs (e.g. TL
+ * 16C754B) allow only to modify them if an EFR bit is set.
+ */
+ scratch2 = ns_read_reg(uart, IER) & 0x0f;
+ ns_write_reg(uart, IER, 0x0F);
+ scratch3 = ns_read_reg(uart, IER) & 0x0f;
+ ns_write_reg(uart, IER, scratch);
+ if ( (scratch2 != 0) || (scratch3 != 0x0F) )
+ return 0;
+
+ /*
+ * Check to see if a UART is really there.
+ * Use loopback test mode.
+ */
+ ns_write_reg(uart, MCR, MCR_LOOP | 0x0A);
+ status = ns_read_reg(uart, MSR) & 0xF0;
+ return (status == 0x90);
}

#define PARSE_ERR(_f, _a...) \
@@ -357,6 +389,8 @@ static void __init ns16550_parse_port_co
PARSE_ERR("%d stop bits are unsupported.", uart->stop_bits);
if ( uart->io_base == 0 )
PARSE_ERR("I/O base address must be specified.");
+ if ( !check_existence(uart) )
+ PARSE_ERR("16550-compatible serial UART not present");

/* Register with generic serial driver. */
serial_register_uart(uart - ns16550_com, &ns16550_driver, uart);
diff -r ec8eaab557d8 -r 4a381ddc764a xen/drivers/char/serial.c
--- a/xen/drivers/char/serial.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/drivers/char/serial.c Tue Sep 16 21:25:30 2008 +0900
@@ -258,11 +258,7 @@ int serial_parse_handle(char *conf)
{
int handle;

- /* Silently fail if user has explicitly requested no serial I/O. */
- if ( strcmp(conf, "none") == 0 )
- return -1;
-
- if ( strncmp(conf, "com", 3) != 0 )
+ if ( strncmp(conf, "com", 3) )
goto fail;

switch ( conf[3] )
@@ -277,6 +273,9 @@ int serial_parse_handle(char *conf)
goto fail;
}

+ if ( !com[handle].driver )
+ goto fail;
+
if ( conf[4] == 'H' )
handle |= SERHND_HI;
else if ( conf[4] == 'L' )
@@ -287,7 +286,6 @@ int serial_parse_handle(char *conf)
return handle;

fail:
- printk("ERROR: bad serial-interface specification '%s'\n", conf);
return -1;
}

diff -r ec8eaab557d8 -r 4a381ddc764a xen/drivers/passthrough/vtd/iommu.c
--- a/xen/drivers/passthrough/vtd/iommu.c Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/drivers/passthrough/vtd/iommu.c Tue Sep 16 21:25:30 2008 +0900
@@ -152,6 +152,7 @@ static u64 bus_to_context_maddr(struct i
maddr = alloc_pgtable_maddr();
if ( maddr == 0 )
{
+ unmap_vtd_domain_page(root_entries);
spin_unlock_irqrestore(&iommu->lock, flags);
return 0;
}
diff -r ec8eaab557d8 -r 4a381ddc764a xen/include/acpi/cpufreq/cpufreq.h
--- a/xen/include/acpi/cpufreq/cpufreq.h Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/include/acpi/cpufreq/cpufreq.h Tue Sep 16 21:25:30 2008 +0900
@@ -18,6 +18,8 @@
#include "processor_perf.h"

#define CPUFREQ_NAME_LEN 16
+
+struct cpufreq_governor;

struct cpufreq_cpuinfo {
unsigned int max_freq;
@@ -30,16 +32,21 @@ struct cpufreq_policy {
unsigned int shared_type; /* ANY or ALL affected CPUs
should set cpufreq */
unsigned int cpu; /* cpu nr of registered CPU */
- struct cpufreq_cpuinfo cpuinfo; /* see above */
+ struct cpufreq_cpuinfo cpuinfo;

unsigned int min; /* in kHz */
unsigned int max; /* in kHz */
unsigned int cur; /* in kHz, only needed if cpufreq
* governors are used */
+ struct cpufreq_governor *governor;
+
unsigned int resume; /* flag for cpufreq 1st run
* S3 wakeup, hotplug cpu, etc */
};
-extern struct cpufreq_policy xen_px_policy[NR_CPUS];
+extern struct cpufreq_policy *cpufreq_cpu_policy[NR_CPUS];
+
+extern int __cpufreq_set_policy(struct cpufreq_policy *data,
+ struct cpufreq_policy *policy);

#define CPUFREQ_SHARED_TYPE_NONE (0) /* None */
#define CPUFREQ_SHARED_TYPE_HW (1) /* HW does needed coordination */
@@ -64,11 +71,26 @@ struct cpufreq_freqs {
#define CPUFREQ_GOV_STOP 2
#define CPUFREQ_GOV_LIMITS 3

+struct cpufreq_governor {
+ char name[CPUFREQ_NAME_LEN];
+ int (*governor)(struct cpufreq_policy *policy,
+ unsigned int event);
+};
+
+extern struct cpufreq_governor cpufreq_gov_dbs;
+#define CPUFREQ_DEFAULT_GOVERNOR &cpufreq_gov_dbs
+
/* pass a target to the cpufreq driver */
extern int __cpufreq_driver_target(struct cpufreq_policy *policy,
unsigned int target_freq,
unsigned int relation);
extern int __cpufreq_driver_getavg(struct cpufreq_policy *policy);
+
+static __inline__ int
+__cpufreq_governor(struct cpufreq_policy *policy, unsigned int event)
+{
+ return policy->governor->governor(policy, event);
+}


/*********************************************************************
@@ -91,7 +113,50 @@ struct cpufreq_driver {

extern struct cpufreq_driver *cpufreq_driver;

-void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state);
+static __inline__
+int cpufreq_register_driver(struct cpufreq_driver *driver_data)
+{
+ if (!driver_data ||
+ !driver_data->init ||
+ !driver_data->exit ||
+ !driver_data->verify ||
+ !driver_data->target)
+ return -EINVAL;
+
+ if (cpufreq_driver)
+ return -EBUSY;
+
+ cpufreq_driver = driver_data;
+ return 0;
+}
+
+static __inline__
+int cpufreq_unregister_driver(struct cpufreq_driver *driver)
+{
+ if (!cpufreq_driver || (driver != cpufreq_driver))
+ return -EINVAL;
+
+ cpufreq_driver = NULL;
+ return 0;
+}
+
+static __inline__
+void cpufreq_verify_within_limits(struct cpufreq_policy *policy,
+ unsigned int min, unsigned int max)
+{
+ if (policy->min < min)
+ policy->min = min;
+ if (policy->max < min)
+ policy->max = min;
+ if (policy->min > max)
+ policy->min = max;
+ if (policy->max > max)
+ policy->max = max;
+ if (policy->min > policy->max)
+ policy->min = policy->max;
+ return;
+}
+

/*********************************************************************
* FREQUENCY TABLE HELPERS *
@@ -107,6 +172,9 @@ struct cpufreq_frequency_table {
};

int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
+ struct cpufreq_frequency_table *table);
+
+int cpufreq_frequency_table_verify(struct cpufreq_policy *policy,
struct cpufreq_frequency_table *table);

int cpufreq_frequency_table_target(struct cpufreq_policy *policy,
diff -r ec8eaab557d8 -r 4a381ddc764a xen/include/acpi/cpufreq/processor_perf.h
--- a/xen/include/acpi/cpufreq/processor_perf.h Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/include/acpi/cpufreq/processor_perf.h Tue Sep 16 21:25:30 2008 +0900
@@ -7,26 +7,23 @@
#define XEN_PX_INIT 0x80000000

int get_cpu_id(u8);
-int acpi_cpufreq_init(void);
int powernow_cpufreq_init(void);

void px_statistic_update(cpumask_t, uint8_t, uint8_t);
-int px_statistic_init(int);
-void px_statistic_reset(int);
-void px_statistic_suspend(void);
-void px_statistic_resume(void);
+int px_statistic_init(unsigned int);
+void px_statistic_exit(unsigned int);
+void px_statistic_reset(unsigned int);

-void cpufreq_dom_exit(void);
-int cpufreq_dom_init(void);
-int cpufreq_dom_dbs(unsigned int);
-void cpufreq_suspend(void);
-int cpufreq_resume(void);
+int cpufreq_limit_change(unsigned int);
+
+int cpufreq_add_cpu(unsigned int);
+int cpufreq_del_cpu(unsigned int);

uint64_t get_cpu_idle_time(unsigned int);

struct processor_performance {
uint32_t state;
- uint32_t ppc;
+ uint32_t platform_limit;
struct xen_pct_register control_register;
struct xen_pct_register status_register;
uint32_t state_count;
diff -r ec8eaab557d8 -r 4a381ddc764a xen/include/asm-x86/microcode.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/microcode.h Tue Sep 16 21:25:30 2008 +0900
@@ -0,0 +1,93 @@
+#ifndef ASM_X86__MICROCODE_H
+#define ASM_X86__MICROCODE_H
+
+struct cpu_signature;
+
+struct microcode_ops {
+ int (*get_matching_microcode)(void *mc, int cpu);
+ int (*cpu_request_microcode)(int cpu, const void *buf, size_t size);
+ int (*collect_cpu_info)(int cpu_num, struct cpu_signature *csig);
+ int (*apply_microcode)(int cpu);
+};
+
+struct microcode_header_intel {
+ unsigned int hdrver;
+ unsigned int rev;
+ unsigned int date;
+ unsigned int sig;
+ unsigned int cksum;
+ unsigned int ldrver;
+ unsigned int pf;
+ unsigned int datasize;
+ unsigned int totalsize;
+ unsigned int reserved[3];
+};
+
+struct microcode_intel {
+ struct microcode_header_intel hdr;
+ unsigned int bits[0];
+};
+
+/* microcode format is extended from prescott processors */
+struct extended_signature {
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int cksum;
+};
+
+struct extended_sigtable {
+ unsigned int count;
+ unsigned int cksum;
+ unsigned int reserved[3];
+ struct extended_signature sigs[0];
+};
+
+struct equiv_cpu_entry {
+ unsigned int installed_cpu;
+ unsigned int fixed_errata_mask;
+ unsigned int fixed_errata_compare;
+ unsigned int equiv_cpu;
+};
+
+struct microcode_header_amd {
+ unsigned int data_code;
+ unsigned int patch_id;
+ unsigned char mc_patch_data_id[2];
+ unsigned char mc_patch_data_len;
+ unsigned char init_flag;
+ unsigned int mc_patch_data_checksum;
+ unsigned int nb_dev_id;
+ unsigned int sb_dev_id;
+ unsigned char processor_rev_id[2];
+ unsigned char nb_rev_id;
+ unsigned char sb_rev_id;
+ unsigned char bios_api_rev;
+ unsigned char reserved1[3];
+ unsigned int match_reg[8];
+};
+
+struct microcode_amd {
+ struct microcode_header_amd hdr;
+ unsigned int mpb[0];
+};
+
+struct cpu_signature {
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int rev;
+};
+
+struct ucode_cpu_info {
+ struct cpu_signature cpu_sig;
+ int valid;
+ union {
+ struct microcode_intel *mc_intel;
+ struct microcode_amd *mc_amd;
+ void *valid_mc;
+ } mc;
+};
+
+extern struct ucode_cpu_info ucode_cpu_info[];
+extern const struct microcode_ops *microcode_ops;
+
+#endif /* ASM_X86__MICROCODE_H */
diff -r ec8eaab557d8 -r 4a381ddc764a xen/include/asm-x86/msr-index.h
--- a/xen/include/asm-x86/msr-index.h Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/include/asm-x86/msr-index.h Tue Sep 16 21:25:30 2008 +0900
@@ -210,6 +210,10 @@
#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff
#define FAM10H_MMIO_CONF_BASE_SHIFT 20
+
+/* AMD Microcode MSRs */
+#define MSR_AMD_PATCHLEVEL 0x0000008b
+#define MSR_AMD_PATCHLOADER 0xc0010020

/* K6 MSRs */
#define MSR_K6_EFER 0xc0000080
diff -r ec8eaab557d8 -r 4a381ddc764a xen/include/asm-x86/processor.h
--- a/xen/include/asm-x86/processor.h Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/include/asm-x86/processor.h Tue Sep 16 21:25:30 2008 +0900
@@ -486,41 +486,6 @@ long set_gdt(struct vcpu *d,
})
long set_debugreg(struct vcpu *p, int reg, unsigned long value);

-struct microcode_header {
- unsigned int hdrver;
- unsigned int rev;
- unsigned int date;
- unsigned int sig;
- unsigned int cksum;
- unsigned int ldrver;
- unsigned int pf;
- unsigned int datasize;
- unsigned int totalsize;
- unsigned int reserved[3];
-};
-
-struct microcode {
- struct microcode_header hdr;
- unsigned int bits[0];
-};
-
-typedef struct microcode microcode_t;
-typedef struct microcode_header microcode_header_t;
-
-/* microcode format is extended from prescott processors */
-struct extended_signature {
- unsigned int sig;
- unsigned int pf;
- unsigned int cksum;
-};
-
-struct extended_sigtable {
- unsigned int count;
- unsigned int cksum;
- unsigned int reserved[3];
- struct extended_signature sigs[0];
-};
-
/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
static always_inline void rep_nop(void)
{
diff -r ec8eaab557d8 -r 4a381ddc764a xen/include/public/platform.h
--- a/xen/include/public/platform.h Fri Sep 12 14:47:40 2008 +0900
+++ b/xen/include/public/platform.h Tue Sep 16 21:25:30 2008 +0900
@@ -289,7 +289,7 @@ struct xen_psd_package {

struct xen_processor_performance {
uint32_t flags; /* flag for Px sub info type */
- uint32_t ppc; /* Platform limitation on freq usage */
+ uint32_t platform_limit; /* Platform limitation on freq usage */
struct xen_pct_register control_register;
struct xen_pct_register status_register;
uint32_t state_count; /* total available performance states */

_______________________________________________
Xen-changelog mailing list
Xen-changelog@lists.xensource.com
http://lists.xensource.com/xen-changelog