Preparation
Import the following script into your MongoDb Docker container:
- Save the script as
dataset_import.json your local hard drive. - Make sure your MongoDb Docker container is running and note it’s name.
For the following commands we assume, your container is named
mongodb. - Open a terminal in the folder where you saved the
dataset_import.jsfile and run:
docker cp dataset_import.js mongodb:/dataset_import.jswheremongodbis the name of your MongoDb container. - Run:
docker exec -it mongodb mongosh /dataset_import.js - This should create a new database named
htl_mongo_relations_labcontaining several collections on your MongoDb server.
/**
* dataset_import.js
* --------------------------
*
* Usage:
* docker cp dataset_import.js mongodb:/dataset_import.js
* docker exec -it mongodb mongosh /dataset_import.js
*/
const DB_NAME = "htl_mongo_relations_lab";
const dbLab = db.getSiblingDB(DB_NAME);
function mulberry32(seed) {
return function() {
let t = seed += 0x6D2B79F5;
t = Math.imul(t ^ (t >>> 15), t | 1);
t ^= t + Math.imul(t ^ (t >>> 7), t | 61);
return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
};
}
function randInt(rng, min, maxInclusive) {
return Math.floor(rng() * (maxInclusive - min + 1)) + min;
}
function pick(rng, arr) {
return arr[randInt(rng, 0, arr.length - 1)];
}
function makeCode(prefix, n, width=4) {
return `${prefix}${String(n).padStart(width, "0")}`;
}
function flushInsert(coll, buffer) {
if (buffer.length > 0) {
coll.insertMany(buffer);
buffer.length = 0;
}
}
print(`Recreating database: ${DB_NAME}`);
dbLab.dropDatabase();
const rng = mulberry32(1337);
// ----------------- Parameters -----------------
const TEACHERS = 120;
const COURSES = 240;
const STUDENTS = 6000;
const ENROLL_PER_STUDENT_MIN = 3;
const ENROLL_PER_STUDENT_MAX = 7;
const ASSIGNMENTS_PER_COURSE = 8;
const SUBMISSIONS_PER_ASSIGNMENT_MIN = 40;
const SUBMISSIONS_PER_ASSIGNMENT_MAX = 70;
// Batch sizes
const BATCH_SMALL = 1000;
const BATCH_MED = 3000;
const BATCH_BIG = 5000;
// ----------------- Dictionaries -----------------
const firstNames = ["Alina","Noah","Mia","Leon","Sofia","Paul","Emma","Elias","Lena","Jakob","Laura","Felix","Nina","Jonas","Sara","David","Anna","Lukas","Clara","Max"];
const lastNames = ["Berger","Huber","Wagner","Pichler","Steiner","Moser","Hofer","Bauer","Schmid","Fischer","Mayer","Leitner","Gruber","Weber","Koch","Winter","Brunner","Baumgartner","Graf","Eder"];
const subjects = ["Networks","Databases","Cloud","Programming","Security","Web","Linux","DevOps","Math","Physics","Electronics","Project"];
const classNames = ["1A","1B","2A","2B","3A","3B","4A","4B","5A","5B"];
const semesters = ["2024W","2025S","2025W","2026S"];
// ----------------- Collections -----------------
const teachers_ref = dbLab.teachers_ref;
const courses_ref = dbLab.courses_ref;
const students_ref = dbLab.students_ref;
const enrollments_ref = dbLab.enrollments_ref;
const assignments_ref = dbLab.assignments_ref;
const submissions_ref = dbLab.submissions_ref;
const students_emb = dbLab.students_emb;
const courses_emb = dbLab.courses_emb;
const submissions_emb = dbLab.submissions_emb;
// -------------------------
// 1) Teachers
// -------------------------
print("Generating teachers...");
let teacherBuf = [];
let teachers = []; // small in memory
for (let i = 1; i <= TEACHERS; i++) {
const fn = pick(rng, firstNames);
const ln = pick(rng, lastNames);
const doc = {
_id: ObjectId(),
teacherNo: makeCode("T", i, 4),
name: `${fn} ${ln}`,
department: pick(rng, ["IT","ET","MECH"]),
email: `${fn.toLowerCase()}.${ln.toLowerCase()}${i}@school.example`
};
teachers.push(doc);
teacherBuf.push(doc);
if (teacherBuf.length >= BATCH_SMALL) flushInsert(teachers_ref, teacherBuf);
}
flushInsert(teachers_ref, teacherBuf);
// -------------------------
// 2) Courses + assignments
// -------------------------
print("Generating courses + assignments...");
let courseBufRef = [];
let courseBufEmb = [];
let assignBufRef = [];
let courses = []; // small in memory
let assignmentsByCourse = new Map(); // courseIdStr -> [{_id, key, maxPoints}]
let teacherNameByCourse = new Map(); // courseIdStr -> teacherName (for snapshots)
for (let i = 1; i <= COURSES; i++) {
const title = `${pick(rng, subjects)} ${randInt(rng, 1, 3)}`;
const teacher = pick(rng, teachers);
const sem = pick(rng, semesters);
const courseId = ObjectId();
const courseRef = {
_id: courseId,
courseCode: makeCode("C", i, 4),
title,
semester: sem,
teacherId: teacher._id,
room: `${randInt(rng, 1, 4)}.${randInt(rng, 1, 30)}`
};
courses.push(courseRef);
teacherNameByCourse.set(courseId.str, teacher.name);
courseBufRef.push(courseRef);
let assignmentsEmbedded = [];
let assignmentMiniList = [];
for (let a = 1; a <= ASSIGNMENTS_PER_COURSE; a++) {
const key = `A${String(a).padStart(2,"0")}`;
const maxPoints = pick(rng, [10, 15, 20, 25]);
assignmentsEmbedded.push({
key,
title: `Assignment ${a}: ${pick(rng, ["Lab","Worksheet","Project","Quiz"])}`,
maxPoints,
dueDaysFromStart: randInt(rng, 7, 70)
});
const aId = ObjectId();
assignBufRef.push({
_id: aId,
courseId: courseId,
key,
title: `Assignment ${a}: ${pick(rng, ["Lab","Worksheet","Project","Quiz"])}`,
maxPoints,
dueDaysFromStart: randInt(rng, 7, 70)
});
assignmentMiniList.push({ _id: aId, key, maxPoints });
if (assignBufRef.length >= BATCH_MED) flushInsert(assignments_ref, assignBufRef);
}
assignmentsByCourse.set(courseId.str, assignmentMiniList);
courseBufEmb.push({
_id: courseId,
courseCode: courseRef.courseCode,
title,
semester: sem,
teacher: {
teacherId: teacher._id,
nameSnapshot: teacher.name,
emailSnapshot: teacher.email
},
room: courseRef.room,
assignments: assignmentsEmbedded
});
if (courseBufRef.length >= BATCH_SMALL) flushInsert(courses_ref, courseBufRef);
if (courseBufEmb.length >= 500) flushInsert(courses_emb, courseBufEmb);
}
flushInsert(courses_ref, courseBufRef);
flushInsert(courses_emb, courseBufEmb);
flushInsert(assignments_ref, assignBufRef);
// -------------------------
// 3) Students + enrollments
// -------------------------
print("Generating students + enrollments...");
let studentBufRef = [];
let studentBufEmb = [];
let enrollBufRef = [];
let enrollmentsByCourse = new Map(); // courseIdStr -> [studentId,...]
for (let i = 1; i <= STUDENTS; i++) {
const fn = pick(rng, firstNames);
const ln = pick(rng, lastNames);
const sid = ObjectId();
const studentNo = makeCode("S", i, 5);
const baseStudent = {
_id: sid,
studentNo,
name: `${fn} ${ln}`,
class: pick(rng, classNames),
yearOfBirth: randInt(rng, 2006, 2011)
};
studentBufRef.push(baseStudent);
const enrollCount = randInt(rng, ENROLL_PER_STUDENT_MIN, ENROLL_PER_STUDENT_MAX);
let enrollmentsEmbedded = [];
for (let e = 0; e < enrollCount; e++) {
const course = pick(rng, courses);
const status = pick(rng, ["active","active","active","dropped"]);
enrollmentsEmbedded.push({
courseId: course._id,
courseTitleSnapshot: course.title,
semesterSnapshot: course.semester,
status
});
enrollBufRef.push({
_id: ObjectId(),
studentId: sid,
courseId: course._id,
status,
since: new Date(2024, randInt(rng, 8, 11), randInt(rng, 1, 28))
});
const k = course._id.str;
if (!enrollmentsByCourse.has(k)) enrollmentsByCourse.set(k, []);
enrollmentsByCourse.get(k).push(sid);
if (enrollBufRef.length >= BATCH_BIG) flushInsert(enrollments_ref, enrollBufRef);
}
studentBufEmb.push({ ...baseStudent, enrollments: enrollmentsEmbedded });
if (studentBufRef.length >= BATCH_MED) flushInsert(students_ref, studentBufRef);
if (studentBufEmb.length >= 1000) flushInsert(students_emb, studentBufEmb);
}
flushInsert(students_ref, studentBufRef);
flushInsert(students_emb, studentBufEmb);
flushInsert(enrollments_ref, enrollBufRef);
// -------------------------
// 4) Submissions (streaming)
// -------------------------
print("Generating submissions (streaming inserts)...");
let subBufRef = [];
let subBufEmb = [];
function pointsFor(maxPoints) {
const p = rng();
if (p < 0.1) return 0;
if (p < 0.2) return Math.floor(maxPoints * 0.4);
if (p < 0.6) return Math.floor(maxPoints * 0.75);
return randInt(rng, Math.floor(maxPoints * 0.8), maxPoints);
}
let submissionCount = 0;
for (const c of courses) {
const enrolledStudents = enrollmentsByCourse.get(c._id.str) || [];
if (enrolledStudents.length === 0) continue;
const teacherNameSnap = teacherNameByCourse.get(c._id.str) || "";
const assList = assignmentsByCourse.get(c._id.str) || [];
for (const a of assList) {
const want = randInt(rng, SUBMISSIONS_PER_ASSIGNMENT_MIN, SUBMISSIONS_PER_ASSIGNMENT_MAX);
for (let k = 0; k < want; k++) {
const studentId = enrolledStudents[randInt(rng, 0, enrolledStudents.length - 1)];
const pts = pointsFor(a.maxPoints);
subBufRef.push({
_id: ObjectId(),
assignmentId: a._id,
courseId: c._id,
studentId,
submittedAt: new Date(2025, randInt(rng, 0, 11), randInt(rng, 1, 28)),
points: pts,
status: pts === 0 ? pick(rng, ["missing","submitted"]) : "submitted"
});
subBufEmb.push({
_id: ObjectId(),
courseId: c._id,
assignmentKey: a.key,
studentId,
submittedAt: new Date(2025, randInt(rng, 0, 11), randInt(rng, 1, 28)),
points: pts,
status: pts === 0 ? pick(rng, ["missing","submitted"]) : "submitted",
courseTitleSnapshot: c.title,
teacherNameSnapshot: teacherNameSnap
});
submissionCount++;
if (subBufRef.length >= BATCH_BIG) flushInsert(submissions_ref, subBufRef);
if (subBufEmb.length >= BATCH_BIG) flushInsert(submissions_emb, subBufEmb);
}
}
}
flushInsert(submissions_ref, subBufRef);
flushInsert(submissions_emb, subBufEmb);
print(`Inserted approx submissions per model: ${submissionCount}`);
// -------------------------
// 5) Minimal indexes
// -------------------------
print("Creating minimal indexes...");
students_ref.createIndex({ studentNo: 1 }, { unique: true });
courses_ref.createIndex({ courseCode: 1 }, { unique: true });
teachers_ref.createIndex({ teacherNo: 1 }, { unique: true });
students_emb.createIndex({ studentNo: 1 }, { unique: true });
courses_emb.createIndex({ courseCode: 1 }, { unique: true });
submissions_ref.createIndex({ studentId: 1 });
submissions_emb.createIndex({ studentId: 1 });
print("Dataset import finished (streaming v2).");
printjson(dbLab.getCollectionNames().sort());Exercise
- Complete the tasks from the following script.
- You can copy the whole script into DataGrip and complete the tasks there.
/**
* tasks.js
* --------
* Open this file in an editor and fill in the queries where you see TODO blocks.
*
* Tip:
* - Use explain("executionStats") for performance comparisons.
* - Measure before/after creating indexes.
*/
const dbLab = db.getSiblingDB("htl_mongo_relations_lab");
// Collections (referenced)
const students_ref = dbLab.students_ref;
const courses_ref = dbLab.courses_ref;
const teachers_ref = dbLab.teachers_ref;
const enrollments_ref = dbLab.enrollments_ref;
const assignments_ref = dbLab.assignments_ref;
const submissions_ref = dbLab.submissions_ref;
// Collections (embedded/hybrid)
const students_emb = dbLab.students_emb;
const courses_emb = dbLab.courses_emb;
const submissions_emb = dbLab.submissions_emb;
// Helper: pick one student + one course to work with
const anyStudent = students_ref.findOne();
const anyCourse = courses_ref.findOne();
print("Using example student:", anyStudent.studentNo, anyStudent.name);
print("Using example course:", anyCourse.courseCode, anyCourse.title);
// ------------------------------------------------------------
// Part A — Relationship modeling + simple queries (warm-up)
// ------------------------------------------------------------
/**
* A1) 1:n with embedding:
* Fetch one course from courses_emb and show only:
* - courseCode, title
* - assignments (only key + title)
*
* TODO: Write the query here.
*/
// TODO_A1
/**
* A2) 1:n with references:
* Fetch the same course from courses_ref and then fetch its assignments from assignments_ref.
*
* TODO: Write TWO queries (course then assignments).
*/
// TODO_A2
/**
* A3) Count: How many assignments exist per course in the referenced model?
* Return top 5 courses with the most assignments (should be equal in this dataset, but still do it).
*
* Hint: aggregation with $group.
*/
// TODO_A3
// ------------------------------------------------------------
// Part B — m:n (students ↔ courses) via join collection vs embedded enrollments
// ------------------------------------------------------------
/**
* B1) Referenced model: For one student, list the titles of all active courses.
*
* Requirements:
* - Use enrollments_ref to find active enrollments
* - Then fetch courses_ref by _id with $in
*
* TODO: implement the multi-query approach.
*/
// TODO_B1
/**
* B2) Referenced model: Do B1 as ONE query using aggregation with $lookup.
*
* Requirements:
* - Start from students_ref
* - $lookup enrollments_ref by studentId
* - $match only "active"
* - $lookup courses_ref using enrollments.courseId
* - Output: studentNo, name, and array of course titles
*/
// TODO_B2
/**
* B3) Embedded/hybrid model: For the same studentNo, list active courses using students_emb.
*
* Requirements:
* - No $lookup allowed.
* - Use the embedded enrollments array and project courseTitleSnapshot.
*/
// TODO_B3
// ------------------------------------------------------------
// Part C — Joining across multiple collections (the "new" part)
// ------------------------------------------------------------
/**
* C1) For ONE course, show:
* - courseCode, title, semester
* - teacher name (from teachers_ref)
*
* Do it in ONE query with $lookup (courses_ref -> teachers_ref).
*/
// TODO_C1
/**
* C2) For ONE student, show the student and all courses PLUS each course's teacher name.
* Referenced model, ONE pipeline.
*
* Join path:
* students_ref -> enrollments_ref -> courses_ref -> teachers_ref
*
* Output fields:
* studentNo, name, courses: [{ title, semester, teacherName }]
*/
// TODO_C2
/**
* C3) Same result as C2, but using MULTIPLE queries (application-side join).
*
* TODO: implement with 3–4 queries (students, enrollments, courses, teachers).
*/
// TODO_C3
// ------------------------------------------------------------
// Part D — Performance lab: submissions (big collection)
// ------------------------------------------------------------
/**
* D1) Top 10 students by total points in semester "2025W" (referenced model).
*
* You need to:
* - filter courses_ref by semester "2025W" to get courseIds
* - filter submissions_ref by those courseIds
* - group by studentId and sum points
* - join students_ref to get studentNo + name
*
* Requirements:
* - Do it as ONE aggregation pipeline using $lookup where needed.
* - Run explain("executionStats") and note executionTimeMillis + totalDocsExamined.
*/
// TODO_D1
/**
* D2) Do the SAME ranking in the embedded/hybrid model using submissions_emb.
*
* Hint: submissions_emb already has courseTitleSnapshot and teacherNameSnapshot,
* but you still need semester. Either:
* - join to courses_emb for semester, OR
* - first get courseIds for semester and filter by those (then no $lookup)
*
* Requirements:
* - Try BOTH approaches:
* (a) $lookup to courses_emb to filter by semester
* (b) pre-fetch courseIds then one pipeline without $lookup
*
* Compare execution stats.
*/
// TODO_D2a
// TODO_D2b
/**
* D3) Index experiment:
* Create indexes and re-run D1.
*
* Suggested indexes:
* - submissions_ref: { courseId: 1, studentId: 1 }
* - courses_ref: { semester: 1 }
* - enrollments_ref: { studentId: 1, status: 1 }
*
* TODO:
* 1) Create indexes
* 2) Re-run D1 explain and compare numbers.
*/
// TODO_D3
/**
* D4) "Student dashboard" query:
* For a given studentNo, show for each active course:
* - course title
* - teacher name
* - average points over all submissions in that course
*
* Implement in referenced model:
* students_ref -> enrollments_ref -> courses_ref -> teachers_ref -> submissions_ref
*
* Requirements:
* - ONE aggregation pipeline.
* - Use $group for avg points.
*/
// TODO_D4
/**
* D5) Same dashboard in embedded/hybrid model:
* - Use students_emb for active course list (snapshot titles)
* - Use submissions_emb for points
* - Optionally join courses_emb for semester/teacher, but try to avoid joins first
*
* Requirements:
* - Provide a version WITHOUT $lookup (best-case denormalized read model)
* - Then provide a version WITH $lookup (if you want semester filtering, etc.)
*/
// TODO_D5_noLookup
// TODO_D5_withLookup
// ------------------------------------------------------------
// Part E — Reflection questions (write short answers in your notes)
// ------------------------------------------------------------
/**
* E1) Where did embedding clearly win (simpler/faster)?
* E2) Where did referencing clearly win (flexible/consistent)?
* E3) Which indexes made the biggest difference and why?
* E4) Identify at least one place where duplicating snapshots is the "correct" choice.
*/
print("Tasks file loaded. Fill in the TODO blocks and run queries interactively.");