Preparation

Import the following script into your MongoDb Docker container:

  • Save the script as dataset_import.js on your local hard drive.
  • Make sure your MongoDb Docker container is running and note it’s name. For the following commands we assume, your container is named mongodb.
  • Open a terminal in the folder where you saved the dataset_import.jsfile and run:
    docker cp dataset_import.js mongodb:/dataset_import.js where mongodb is the name of your MongoDb container.
  • Run: docker exec -it mongodb mongosh /dataset_import.js
  • This should create a new database named htl_mongo_relations_lab containing several collections on your MongoDb server.
/**
 * dataset_import.js
 * --------------------------
 *
 * Usage:
 *   docker cp dataset_import.js mongodb:/dataset_import.js
 *   docker exec -it mongodb mongosh /dataset_import.js
 */
 
const DB_NAME = "htl_mongo_relations_lab";
const dbLab = db.getSiblingDB(DB_NAME);
 
function mulberry32(seed) {
  return function() {
    let t = seed += 0x6D2B79F5;
    t = Math.imul(t ^ (t >>> 15), t | 1);
    t ^= t + Math.imul(t ^ (t >>> 7), t | 61);
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}
function randInt(rng, min, maxInclusive) {
  return Math.floor(rng() * (maxInclusive - min + 1)) + min;
}
function pick(rng, arr) {
  return arr[randInt(rng, 0, arr.length - 1)];
}
function makeCode(prefix, n, width=4) {
  return `${prefix}${String(n).padStart(width, "0")}`;
}
function flushInsert(coll, buffer) {
  if (buffer.length > 0) {
    coll.insertMany(buffer);
    buffer.length = 0;
  }
}
 
print(`Recreating database: ${DB_NAME}`);
dbLab.dropDatabase();
 
const rng = mulberry32(1337);
 
// ----------------- Parameters -----------------
const TEACHERS = 120;
const COURSES  = 240;
const STUDENTS = 6000;
 
const ENROLL_PER_STUDENT_MIN = 3;
const ENROLL_PER_STUDENT_MAX = 7;
 
const ASSIGNMENTS_PER_COURSE = 8;
 
const SUBMISSIONS_PER_ASSIGNMENT_MIN = 40;
const SUBMISSIONS_PER_ASSIGNMENT_MAX = 70;
 
// Batch sizes
const BATCH_SMALL = 1000;
const BATCH_MED   = 3000;
const BATCH_BIG   = 5000;
 
// ----------------- Dictionaries -----------------
const firstNames = ["Alina","Noah","Mia","Leon","Sofia","Paul","Emma","Elias","Lena","Jakob","Laura","Felix","Nina","Jonas","Sara","David","Anna","Lukas","Clara","Max"];
const lastNames  = ["Berger","Huber","Wagner","Pichler","Steiner","Moser","Hofer","Bauer","Schmid","Fischer","Mayer","Leitner","Gruber","Weber","Koch","Winter","Brunner","Baumgartner","Graf","Eder"];
const subjects   = ["Networks","Databases","Cloud","Programming","Security","Web","Linux","DevOps","Math","Physics","Electronics","Project"];
const classNames = ["1A","1B","2A","2B","3A","3B","4A","4B","5A","5B"];
const semesters  = ["2024W","2025S","2025W","2026S"];
 
// ----------------- Collections -----------------
const teachers_ref    = dbLab.teachers_ref;
const courses_ref     = dbLab.courses_ref;
const students_ref    = dbLab.students_ref;
const enrollments_ref = dbLab.enrollments_ref;
const assignments_ref = dbLab.assignments_ref;
const submissions_ref = dbLab.submissions_ref;
 
const students_emb    = dbLab.students_emb;
const courses_emb     = dbLab.courses_emb;
const submissions_emb = dbLab.submissions_emb;
 
// -------------------------
// 1) Teachers
// -------------------------
print("Generating teachers...");
let teacherBuf = [];
let teachers = []; // small in memory
for (let i = 1; i <= TEACHERS; i++) {
  const fn = pick(rng, firstNames);
  const ln = pick(rng, lastNames);
  const doc = {
    _id: ObjectId(),
    teacherNo: makeCode("T", i, 4),
    name: `${fn} ${ln}`,
    department: pick(rng, ["IT","ET","MECH"]),
    email: `${fn.toLowerCase()}.${ln.toLowerCase()}${i}@school.example`
  };
  teachers.push(doc);
  teacherBuf.push(doc);
  if (teacherBuf.length >= BATCH_SMALL) flushInsert(teachers_ref, teacherBuf);
}
flushInsert(teachers_ref, teacherBuf);
 
// -------------------------
// 2) Courses + assignments
// -------------------------
print("Generating courses + assignments...");
let courseBufRef = [];
let courseBufEmb = [];
let assignBufRef = [];
 
let courses = []; // small in memory
let assignmentsByCourse = new Map();  // courseIdStr -> [{_id, key, maxPoints}]
let teacherNameByCourse = new Map();  // courseIdStr -> teacherName (for snapshots)
 
for (let i = 1; i <= COURSES; i++) {
  const title = `${pick(rng, subjects)} ${randInt(rng, 1, 3)}`;
  const teacher = pick(rng, teachers);
  const sem = pick(rng, semesters);
  const courseId = ObjectId();
 
  const courseRef = {
    _id: courseId,
    courseCode: makeCode("C", i, 4),
    title,
    semester: sem,
    teacherId: teacher._id,
    room: `${randInt(rng, 1, 4)}.${randInt(rng, 1, 30)}`
  };
  courses.push(courseRef);
  teacherNameByCourse.set(courseId.str, teacher.name);
 
  courseBufRef.push(courseRef);
 
  let assignmentsEmbedded = [];
  let assignmentMiniList = [];
  for (let a = 1; a <= ASSIGNMENTS_PER_COURSE; a++) {
    const key = `A${String(a).padStart(2,"0")}`;
    const maxPoints = pick(rng, [10, 15, 20, 25]);
 
    assignmentsEmbedded.push({
      key,
      title: `Assignment ${a}: ${pick(rng, ["Lab","Worksheet","Project","Quiz"])}`,
      maxPoints,
      dueDaysFromStart: randInt(rng, 7, 70)
    });
 
    const aId = ObjectId();
    assignBufRef.push({
      _id: aId,
      courseId: courseId,
      key,
      title: `Assignment ${a}: ${pick(rng, ["Lab","Worksheet","Project","Quiz"])}`,
      maxPoints,
      dueDaysFromStart: randInt(rng, 7, 70)
    });
    assignmentMiniList.push({ _id: aId, key, maxPoints });
 
    if (assignBufRef.length >= BATCH_MED) flushInsert(assignments_ref, assignBufRef);
  }
  assignmentsByCourse.set(courseId.str, assignmentMiniList);
 
  courseBufEmb.push({
    _id: courseId,
    courseCode: courseRef.courseCode,
    title,
    semester: sem,
    teacher: {
      teacherId: teacher._id,
      nameSnapshot: teacher.name,
      emailSnapshot: teacher.email
    },
    room: courseRef.room,
    assignments: assignmentsEmbedded
  });
 
  if (courseBufRef.length >= BATCH_SMALL) flushInsert(courses_ref, courseBufRef);
  if (courseBufEmb.length >= 500) flushInsert(courses_emb, courseBufEmb);
}
flushInsert(courses_ref, courseBufRef);
flushInsert(courses_emb, courseBufEmb);
flushInsert(assignments_ref, assignBufRef);
 
// -------------------------
// 3) Students + enrollments
// -------------------------
print("Generating students + enrollments...");
let studentBufRef = [];
let studentBufEmb = [];
let enrollBufRef  = [];
 
let enrollmentsByCourse = new Map(); // courseIdStr -> [studentId,...]
 
for (let i = 1; i <= STUDENTS; i++) {
  const fn = pick(rng, firstNames);
  const ln = pick(rng, lastNames);
  const sid = ObjectId();
  const studentNo = makeCode("S", i, 5);
 
  const baseStudent = {
    _id: sid,
    studentNo,
    name: `${fn} ${ln}`,
    class: pick(rng, classNames),
    yearOfBirth: randInt(rng, 2006, 2011)
  };
 
  studentBufRef.push(baseStudent);
 
  const enrollCount = randInt(rng, ENROLL_PER_STUDENT_MIN, ENROLL_PER_STUDENT_MAX);
  let enrollmentsEmbedded = [];
  for (let e = 0; e < enrollCount; e++) {
    const course = pick(rng, courses);
    const status = pick(rng, ["active","active","active","dropped"]);
 
    enrollmentsEmbedded.push({
      courseId: course._id,
      courseTitleSnapshot: course.title,
      semesterSnapshot: course.semester,
      status
    });
 
    enrollBufRef.push({
      _id: ObjectId(),
      studentId: sid,
      courseId: course._id,
      status,
      since: new Date(2024, randInt(rng, 8, 11), randInt(rng, 1, 28))
    });
 
    const k = course._id.str;
    if (!enrollmentsByCourse.has(k)) enrollmentsByCourse.set(k, []);
    enrollmentsByCourse.get(k).push(sid);
 
    if (enrollBufRef.length >= BATCH_BIG) flushInsert(enrollments_ref, enrollBufRef);
  }
 
  studentBufEmb.push({ ...baseStudent, enrollments: enrollmentsEmbedded });
 
  if (studentBufRef.length >= BATCH_MED) flushInsert(students_ref, studentBufRef);
  if (studentBufEmb.length >= 1000) flushInsert(students_emb, studentBufEmb);
}
flushInsert(students_ref, studentBufRef);
flushInsert(students_emb, studentBufEmb);
flushInsert(enrollments_ref, enrollBufRef);
 
// -------------------------
// 4) Submissions (streaming)
// -------------------------
print("Generating submissions (streaming inserts)...");
let subBufRef = [];
let subBufEmb = [];
 
function pointsFor(maxPoints) {
  const p = rng();
  if (p < 0.1) return 0;
  if (p < 0.2) return Math.floor(maxPoints * 0.4);
  if (p < 0.6) return Math.floor(maxPoints * 0.75);
  return randInt(rng, Math.floor(maxPoints * 0.8), maxPoints);
}
 
let submissionCount = 0;
for (const c of courses) {
  const enrolledStudents = enrollmentsByCourse.get(c._id.str) || [];
  if (enrolledStudents.length === 0) continue;
 
  const teacherNameSnap = teacherNameByCourse.get(c._id.str) || "";
  const assList = assignmentsByCourse.get(c._id.str) || [];
  for (const a of assList) {
    const want = randInt(rng, SUBMISSIONS_PER_ASSIGNMENT_MIN, SUBMISSIONS_PER_ASSIGNMENT_MAX);
 
    for (let k = 0; k < want; k++) {
      const studentId = enrolledStudents[randInt(rng, 0, enrolledStudents.length - 1)];
      const pts = pointsFor(a.maxPoints);
 
      subBufRef.push({
        _id: ObjectId(),
        assignmentId: a._id,
        courseId: c._id,
        studentId,
        submittedAt: new Date(2025, randInt(rng, 0, 11), randInt(rng, 1, 28)),
        points: pts,
        status: pts === 0 ? pick(rng, ["missing","submitted"]) : "submitted"
      });
 
      subBufEmb.push({
        _id: ObjectId(),
        courseId: c._id,
        assignmentKey: a.key,
        studentId,
        submittedAt: new Date(2025, randInt(rng, 0, 11), randInt(rng, 1, 28)),
        points: pts,
        status: pts === 0 ? pick(rng, ["missing","submitted"]) : "submitted",
        courseTitleSnapshot: c.title,
        teacherNameSnapshot: teacherNameSnap
      });
 
      submissionCount++;
 
      if (subBufRef.length >= BATCH_BIG) flushInsert(submissions_ref, subBufRef);
      if (subBufEmb.length >= BATCH_BIG) flushInsert(submissions_emb, subBufEmb);
    }
  }
}
flushInsert(submissions_ref, subBufRef);
flushInsert(submissions_emb, subBufEmb);
 
print(`Inserted approx submissions per model: ${submissionCount}`);
 
// -------------------------
// 5) Minimal indexes
// -------------------------
print("Creating minimal indexes...");
students_ref.createIndex({ studentNo: 1 }, { unique: true });
courses_ref.createIndex({ courseCode: 1 }, { unique: true });
teachers_ref.createIndex({ teacherNo: 1 }, { unique: true });
 
students_emb.createIndex({ studentNo: 1 }, { unique: true });
courses_emb.createIndex({ courseCode: 1 }, { unique: true });
 
submissions_ref.createIndex({ studentId: 1 });
submissions_emb.createIndex({ studentId: 1 });
 
print("Dataset import finished (streaming v2).");
printjson(dbLab.getCollectionNames().sort());

Exercise

  • Complete the tasks from the following script.
  • You can copy the whole script into DataGrip and complete the tasks there.
/**
 * tasks.js
 * --------
 * Open this file in an editor and fill in the queries where you see TODO blocks.
 *
 * Tip:
 * - Use explain("executionStats") for performance comparisons.
 * - Measure before/after creating indexes.
 */
 
const dbLab = db.getSiblingDB("htl_mongo_relations_lab");
 
// Collections (referenced)
const students_ref    = dbLab.students_ref;
const courses_ref     = dbLab.courses_ref;
const teachers_ref    = dbLab.teachers_ref;
const enrollments_ref = dbLab.enrollments_ref;
const assignments_ref = dbLab.assignments_ref;
const submissions_ref = dbLab.submissions_ref;
 
// Collections (embedded/hybrid)
const students_emb    = dbLab.students_emb;
const courses_emb     = dbLab.courses_emb;
const submissions_emb = dbLab.submissions_emb;
 
// Helper: pick one student + one course to work with
const anyStudent = students_ref.findOne();
const anyCourse  = courses_ref.findOne();
 
print("Using example student:", anyStudent.studentNo, anyStudent.name);
print("Using example course:", anyCourse.courseCode, anyCourse.title);
 
// ------------------------------------------------------------
// Part A — Relationship modeling + simple queries (warm-up)
// ------------------------------------------------------------
 
/**
 * A1) 1:n with embedding:
 * Fetch one course from courses_emb and show only:
 *   - courseCode, title
 *   - assignments (only key + title)
 *
 * TODO: Write the query here.
 */
// TODO_A1
 
/**
 * A2) 1:n with references:
 * Fetch the same course from courses_ref and then fetch its assignments from assignments_ref.
 *
 * TODO: Write TWO queries (course then assignments).
 */
// TODO_A2
 
/**
 * A3) Count: How many assignments exist per course in the referenced model?
 * Return top 5 courses with the most assignments (should be equal in this dataset, but still do it).
 *
 * Hint: aggregation with $group.
 */
// TODO_A3
 
// ------------------------------------------------------------
// Part B — m:n (students ↔ courses) via join collection vs embedded enrollments
// ------------------------------------------------------------
 
/**
 * B1) Referenced model: For one student, list the titles of all active courses.
 *
 * Requirements:
 * - Use enrollments_ref to find active enrollments
 * - Then fetch courses_ref by _id with $in
 *
 * TODO: implement the multi-query approach.
 */
// TODO_B1
 
/**
 * B2) Referenced model: Do B1 as ONE query using aggregation with $lookup.
 *
 * Requirements:
 * - Start from students_ref
 * - $lookup enrollments_ref by studentId
 * - $match only "active"
 * - $lookup courses_ref using enrollments.courseId
 * - Output: studentNo, name, and array of course titles
 */
// TODO_B2
 
/**
 * B3) Embedded/hybrid model: For the same studentNo, list active courses using students_emb.
 *
 * Requirements:
 * - No $lookup allowed.
 * - Use the embedded enrollments array and project courseTitleSnapshot.
 */
// TODO_B3
 
// ------------------------------------------------------------
// Part C — Joining across multiple collections (the "new" part)
// ------------------------------------------------------------
 
/**
 * C1) For ONE course, show:
 *   - courseCode, title, semester
 *   - teacher name (from teachers_ref)
 *
 * Do it in ONE query with $lookup (courses_ref -> teachers_ref).
 */
// TODO_C1
 
/**
 * C2) For ONE student, show the student and all courses PLUS each course's teacher name.
 * Referenced model, ONE pipeline.
 *
 * Join path:
 *   students_ref -> enrollments_ref -> courses_ref -> teachers_ref
 *
 * Output fields:
 *   studentNo, name, courses: [{ title, semester, teacherName }]
 */
// TODO_C2
 
/**
 * C3) Same result as C2, but using MULTIPLE queries (application-side join).
 *
 * TODO: implement with 3–4 queries (students, enrollments, courses, teachers).
 */
// TODO_C3
 
// ------------------------------------------------------------
// Part D — Performance lab: submissions (big collection)
// ------------------------------------------------------------
 
/**
 * D1) Top 10 students by total points in semester "2025W" (referenced model).
 *
 * You need to:
 * - filter courses_ref by semester "2025W" to get courseIds
 * - filter submissions_ref by those courseIds
 * - group by studentId and sum points
 * - join students_ref to get studentNo + name
 *
 * Requirements:
 * - Do it as ONE aggregation pipeline using $lookup where needed.
 * - Run explain("executionStats") and note executionTimeMillis + totalDocsExamined.
 */
// TODO_D1
 
/**
 * D2) Do the SAME ranking in the embedded/hybrid model using submissions_emb.
 *
 * Hint: submissions_emb already has courseTitleSnapshot and teacherNameSnapshot,
 * but you still need semester. Either:
 * - join to courses_emb for semester, OR
 * - first get courseIds for semester and filter by those (then no $lookup)
 *
 * Requirements:
 * - Try BOTH approaches:
 *   (a) $lookup to courses_emb to filter by semester
 *   (b) pre-fetch courseIds then one pipeline without $lookup
 *
 * Compare execution stats.
 */
// TODO_D2a
// TODO_D2b
 
/**
 * D3) Index experiment:
 * Create indexes and re-run D1.
 *
 * Suggested indexes:
 * - submissions_ref: { courseId: 1, studentId: 1 }
 * - courses_ref: { semester: 1 }
 * - enrollments_ref: { studentId: 1, status: 1 }
 *
 * TODO:
 * 1) Create indexes
 * 2) Re-run D1 explain and compare numbers.
 */
// TODO_D3
 
/**
 * D4) "Student dashboard" query:
 * For a given studentNo, show for each active course:
 *   - course title
 *   - teacher name
 *   - average points over all submissions in that course
 *
 * Implement in referenced model:
 * students_ref -> enrollments_ref -> courses_ref -> teachers_ref -> submissions_ref
 *
 * Requirements:
 * - ONE aggregation pipeline.
 * - Use $group for avg points.
 */
// TODO_D4
 
/**
 * D5) Same dashboard in embedded/hybrid model:
 * - Use students_emb for active course list (snapshot titles)
 * - Use submissions_emb for points
 * - Optionally join courses_emb for semester/teacher, but try to avoid joins first
 *
 * Requirements:
 * - Provide a version WITHOUT $lookup (best-case denormalized read model)
 * - Then provide a version WITH $lookup (if you want semester filtering, etc.)
 */
// TODO_D5_noLookup
// TODO_D5_withLookup
 
// ------------------------------------------------------------
// Part E — Reflection questions (write short answers in your notes)
// ------------------------------------------------------------
/**
 * E1) Where did embedding clearly win (simpler/faster)?
 * E2) Where did referencing clearly win (flexible/consistent)?
 * E3) Which indexes made the biggest difference and why?
 * E4) Identify at least one place where duplicating snapshots is the "correct" choice.
 */
 
print("Tasks file loaded. Fill in the TODO blocks and run queries interactively.");